Python 利用 Box-Cox、log 对数变换做数据预处理，重点处理数据中的极端分布，并实现可视化
import pandas as pd
import sklearn.preprocessing as preproc
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import copy
# Load the feature table. The path is hard-coded to a local Windows drive.
biz_df = pd.read_csv('D:\\data\\feature\\feature.csv', delimiter=',')

# Snapshot the table before any derived column is added to biz_df, so the
# exported file below contains only the columns that were read from disk.
df_save = biz_df.copy()

# Column that is transformed here and plotted further down the script.
featureName = "avg_activity_day_amt"

# Box-Cox with lambda fixed at 0 is the plain natural-log transform. No shift
# is applied, and scipy requires the input to be strictly positive.
rc_log = stats.boxcox(biz_df[featureName], lmbda=0)
biz_df['rc_log'] = rc_log

# Apply log10(x + 1) to every column except "acct" (presumably an account
# identifier -- confirm against the data). The +1 keeps zeros finite.
cols = df_save.columns
for item in cols:
    if item == "acct":
        continue
    df_save[item] = np.log10(df_save[item] + 1)

# Write the log10-transformed table, without the index column.
df3 = pd.DataFrame(data=df_save, columns=cols)
df3.to_csv('D:\\data\\log10\\result.csv', index=False)
# When no lmbda is given, scipy's Box-Cox estimates the lambda that makes the
# output closest to a normal distribution and returns it with the data.
rc_bc, bc_params = stats.boxcox(biz_df[featureName])
# Report the fitted lambda. A bare `bc_params` expression only displays a
# value in a notebook/REPL and is a no-op when this runs as a script.
print(bc_params)
# Keep the Box-Cox transformed values alongside the original column.
biz_df['rc_bc'] = rc_bc

# Natural log of (x + 1), divided by the log of (column max + 1), so the
# largest value in the column maps to exactly 1.
biz_df['log_feature'] = np.log(biz_df[featureName] + 1)/np.log(biz_df[featureName].max() + 1)
# Four stacked panels: raw values, log transform, normalised log, Box-Cox.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1)

# Raw data (log-scaled y axis so sparse bins in the tail stay visible).
biz_df[featureName].hist(ax=ax1, bins=100)
ax1.set_yscale('log')
ax1.tick_params(labelsize=7)
ax1.set_title('Review Counts Histogram', fontsize=7)
ax1.set_xlabel('')
ax1.set_ylabel('Occurrence', fontsize=7)

# Log transform (Box-Cox with lambda = 0).
biz_df['rc_log'].hist(ax=ax2, bins=100)
ax2.set_yscale('log')
ax2.tick_params(labelsize=7)
ax2.set_title('Log Transformed Counts Histogram', fontsize=7)
ax2.set_xlabel('')
ax2.set_ylabel('Occurrence', fontsize=7)

# Normalised log feature; this panel keeps a linear y axis and has no title.
biz_df['log_feature'].hist(ax=ax3, bins=100)
ax3.tick_params(labelsize=7)
ax3.set_xlabel('log Article word count', fontsize=7)

# Box-Cox transform with the fitted lambda.
biz_df['rc_bc'].hist(ax=ax4, bins=100)
ax4.set_yscale('log')
ax4.tick_params(labelsize=7)
ax4.set_title('Box-Cox Transformed Counts Histogram', fontsize=7)
ax4.set_xlabel('')
ax4.set_ylabel('Occurrence', fontsize=7)

# tight_layout is a one-shot computation over the artists present when it is
# called, so it runs only after every title, label and tick has been set.
# With pad=0, calling it on empty axes can clip labels added afterwards.
fig.tight_layout(pad=0, w_pad=4.0, h_pad=4.0)
plt.show()
效果图展示
版权声明:本文为qq_31823889原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。