Python:利用 Box-Cox 变换和对数(log)变换做数据预处理,重点处理数据中的极端分布,并实现可视化

Python:利用 Box-Cox 变换和对数(log)变换做数据预处理,重点处理数据中的极端分布,并实现可视化

import pandas as pd
import sklearn.preprocessing as preproc
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import copy

# Load the raw feature table from disk (comma-separated).
biz_df = pd.read_csv('D:\\data\\feature\\feature.csv', sep=',')

# Optional clean-up that was disabled: dropping the 'max_tr_dt' and 'tr_dt'
# columns before any transform is applied.

# Independent deep copy of the raw table; it is transformed column by column
# further down, while biz_df keeps the original values plus derived columns.
df_save = biz_df.copy(deep=True)

# Name of the single column that is transformed and plotted by this script.
featureName = "avg_activity_day_amt"

# Box-Cox with lambda fixed at 0 is the plain natural-log transform (no shift
# is added to the data here).
# NOTE(review): scipy.stats.boxcox only accepts strictly positive input, so
# this raises if the column contains zeros or negatives - confirm the data.
rc_log = stats.boxcox(biz_df[featureName], lmbda=0)
biz_df['rc_log'] = rc_log

# Replace every column of df_save, except the one named "acct", with
# log10(value + 1), then write the transformed table to disk.
# (presumably "acct" is an account identifier - confirm against the data.)
cols = df_save.columns
for item in cols:
    if item != "acct":
        # The +1 shift maps a raw value of 0 to log10(1) == 0.
        shifted = df_save[item] + 1
        df_save[item] = np.log10(shifted)

# Variants that were tried earlier and left disabled:
#   * stats.boxcox(biz_df[item] + 1, lmbda=0)            -> natural log of (x + 1)
#   * np.log10(x + 1) / np.log10(x.max() + 1)            -> log10 normalised by the column maximum
#   * stats.boxcox(biz_df, lmbda=0) on the whole frame

# Rebuild a frame with the original column order and save it without the index.
df3 = pd.DataFrame(data=df_save, columns=cols)
df3.to_csv('D:\\data\\log10\\result.csv', index=False)

# When no lmbda is given, SciPy's Box-Cox fits the lambda that makes the
# output closest to a normal distribution and returns both the transformed
# values and that fitted lambda.
# NOTE(review): scipy.stats.boxcox only accepts strictly positive input -
# confirm the feature never contains zeros or negatives.
rc_bc, bc_params = stats.boxcox(biz_df[featureName])
# Report the fitted lambda. A bare `bc_params` expression only displays a
# value inside a notebook; in a script it is a no-op, so print it explicitly.
print(f"Box-Cox lambda fitted for {featureName}: {bc_params}")
# Store the Box-Cox result alongside the original feature.
biz_df['rc_bc'] = rc_bc


# Normalised log feature: log(x + 1) / log(max(x) + 1).
# The ratio of two logarithms does not depend on the base, so the log10
# variant of this formula yields the same values. Un-normalised alternatives
# that were tried earlier: np.log10(x + 1) and np.log(x + 1).
# NOTE: if the column maximum is 0 the denominator is log(1) == 0 and the
# result is NaN/inf.
biz_df['log_feature'] = np.log(biz_df[featureName] + 1) / np.log(biz_df[featureName].max() + 1)

# Four stacked histograms (100 bins each): raw feature, natural-log
# transform, normalised log feature, and Box-Cox transform.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1)

# Raw data.
biz_df[featureName].hist(ax=ax1, bins=100)
ax1.set_yscale('log')
ax1.tick_params(labelsize=7)
ax1.set_title('Review Counts Histogram', fontsize=7)
ax1.set_xlabel('')
ax1.set_ylabel('Occurrence', fontsize=7)

# Log transform (Box-Cox with lambda = 0).
biz_df['rc_log'].hist(ax=ax2, bins=100)
ax2.set_yscale('log')
ax2.tick_params(labelsize=7)
ax2.set_title('Log Transformed Counts Histogram', fontsize=7)
ax2.set_xlabel('')
ax2.set_ylabel('Occurrence', fontsize=7)

# Normalised log feature; this panel keeps a linear y-axis and has no title.
biz_df['log_feature'].hist(ax=ax3, bins=100)
ax3.tick_params(labelsize=7)
ax3.set_xlabel('log Article word count', fontsize=7)

# Box-Cox transform with the fitted lambda.
biz_df['rc_bc'].hist(ax=ax4, bins=100)
ax4.set_yscale('log')
ax4.tick_params(labelsize=7)
ax4.set_title('Box-Cox Transformed Counts Histogram', fontsize=7)
ax4.set_xlabel('')
ax4.set_ylabel('Occurrence', fontsize=7)

# tight_layout sizes the subplots from the tick labels, axis labels and titles
# that exist when it is called, so it must run after the axes are populated;
# calling it right after plt.subplots() ignores everything added later.
fig.tight_layout(pad=0, w_pad=4.0, h_pad=4.0)

plt.show()

效果图展示

在这里插入图片描述


版权声明:本文为qq_31823889原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。