1.数据探索
# -*- coding: utf-8 -*-
# Basic data exploration: for every attribute, report the number of missing
# values together with its maximum and minimum.
import pandas as pd

datafile = 'G:/学习资料/统计/chapter7/demo/data/air_data.csv'
resultfile1 = 'G:/学习资料/统计/chapter7/demo/tmp/explore1.xls'

# Load the raw records.
raw = pd.read_csv(datafile, encoding='utf-8')

# Print the per-attribute overview of the loaded table.
raw.info()

# Descriptive statistics for all columns (no extra percentiles requested),
# transposed so that every attribute becomes one row.
summary = raw.describe(percentiles=[], include='all').T

# Missing values per attribute = total rows - non-null count.
summary['null'] = len(raw) - summary['count']

# Keep only the three statistics of interest, give them their report labels
# and name the index.
report = (
    summary[['null', 'max', 'min']]
    .rename(columns={'null': '空值记录数', 'max': '最大值', 'min': '最小值'})
    .rename_axis('属性名称')
)

# Save the exploration result to an Excel file.
# NOTE(review): writing the legacy .xls format depends on the Excel engine
# available in the installed pandas version - confirm it is supported.
report.to_excel(resultfile1, header=True)
2.数据清洗
# -*- coding: utf-8 -*-
# Data cleaning: filter out the records that violate the rules below.
import pandas as pd

datafile = 'G:/学习资料/统计/chapter7/demo/data/air_data.csv'
cleanedfile1 = 'G:/学习资料/统计/chapter7/demo/tmp/data_cleaned1.csv'

# Read the raw data, explicitly decoding it as UTF-8.
data = pd.read_csv(datafile, encoding='utf-8')

# Keep a record only when both fare columns are present.
# The two boolean masks are combined with `&`: multiplying them with `*`
# selects the same rows but makes pandas emit
# "UserWarning: ... the '*' operator is not supported by numexpr for the
# bool dtype, use '&' instead".
data = data[data['SUM_YR_1'].notnull() & data['SUM_YR_2'].notnull()]

# Keep a record when either fare is non-zero, or when the total flown
# kilometres and the average discount are both zero.
index1 = data['SUM_YR_1'] != 0
index2 = data['SUM_YR_2'] != 0
index3 = (data['SEG_KM_SUM'] == 0) & (data['avg_discount'] == 0)  # logical AND
data = data[index1 | index2 | index3]  # logical OR

# The target path is a .csv file, so it is written with to_csv
# (to_excel fails with "ValueError: No engine for filetype: 'csv'").
data.to_csv(cleanedfile1)
警告:用“*”连接两个布尔条件时,会出现 UserWarning: evaluating in Python space because the '*' operator is not supported by numexpr for the bool dtype, use '&' instead
解决:将“*”替换为“&”即可。
报错:如果最后一行像书中一样是data.to_excel(),会出现ValueError: No engine for filetype: 'csv'
解决:改为data.to_csv()
3.数据变换
import numpy as np
import pandas as pd

inputfile = 'G:/学习资料/统计/chapter7/demo/tmp/data_cleaned1.csv'
outputfile = 'G:/学习资料/统计/chapter7/demo/tmp/transform_data.csv'

# Load the cleaned records.
data = pd.read_csv(inputfile, encoding='utf-8')

# Parse the two date columns so they can be subtracted.
d_ffp = pd.to_datetime(data['FFP_DATE'])
d_load = pd.to_datetime(data['LOAD_TIME'])

# Build the five-feature table in one step instead of assigning new columns
# onto a column-subset slice, which triggers pandas' SettingWithCopyWarning.
#   L: time between FFP_DATE and LOAD_TIME, expressed in 30-day months.
#      Dividing the timedelta Series by a 30-day timedelta is the vectorised
#      equivalent of mapping `x / np.timedelta64(30 * 24 * 60, 'm')` per row.
#   R, F, M, C: copied unchanged from LAST_TO_END, FLIGHT_COUNT, SEG_KM_SUM
#      and avg_discount respectively.
data = pd.DataFrame({
    'L': (d_load - d_ffp) / np.timedelta64(30, 'D'),
    'R': data['LAST_TO_END'],
    'F': data['FLIGHT_COUNT'],
    'M': data['SEG_KM_SUM'],
    'C': data['avg_discount'],
})

# Write the transformed table (the row index is written as the first column).
data.to_csv(outputfile)
print('finish')
引用链接:https://www.cnblogs.com/caicaihong/p/5853727.html
这段我有好多看不懂
结果:
| | L | R | F | M | C |
| 0 | 90.2 | 1 | 210 | 580717 | 0.961639043 |
| 1 | 86.56666667 | 7 | 140 | 293678 | 1.25231444 |
| 2 | 87.16666667 | 11 | 135 | 283712 | 1.254675516 |
| 3 | 68.23333333 | 97 | 23 | 281336 | 1.090869565 |
| 4 | 60.53333333 | 5 | 152 | 309928 | 0.970657895 |
| 5 | 74.7 | 79 | 92 | 294585 | 0.967692483 |
| 6 | 97.7 | 1 | 101 | 287042 | 0.965346535 |
| 7 | 48.4 | 3 | 73 | 287230 | 0.962070222 |
4.标准差标准化
# -*- coding: utf-8 -*-
# Standard-deviation (z-score) standardisation.
import pandas as pd

datafile = 'G:/学习资料/统计/chapter7/demo/data/zscoredata.xls'
zscoredfile1 = 'G:/学习资料/统计/chapter7/demo/tmp/zscoreddata1.xls'

# Load the table to be standardised.
raw = pd.read_excel(datafile)

# Per-column mean and standard deviation.
column_mean = raw.mean(axis=0)
column_std = raw.std(axis=0)

# Centre each column on its mean and scale it by its standard deviation.
scaled = (raw - column_mean) / column_std

# Prefix every column header with 'Z'.
scaled = scaled.rename(columns=lambda name: 'Z' + name)

# Write the standardised data, without the row index.
# NOTE(review): writing the legacy .xls format depends on the Excel engine
# available in the installed pandas version - confirm it is supported.
scaled.to_excel(zscoredfile1, index=False)
5.客户聚类
# -*- coding: utf-8 -*-
# Customer clustering with K-Means.
import pandas as pd
from sklearn.cluster import KMeans

# Path of the standardised input data.
datafile = 'G:/学习资料/统计/chapter7/demo/tmp/zscoreddata1.xls'

# Number of clusters.
k = 5

# Load the standardised features.
features = pd.read_excel(datafile)

# Create the clustering model and train it on the features
# (fit returns the fitted estimator itself).
model = KMeans(n_clusters=k).fit(features)

# Show the cluster centres and the cluster label assigned to each sample.
print('model.cluster_centers_:', model.cluster_centers_)
print('model.labels_:', model.labels_)
输出:
model.cluster_centers_: [[ 0.05149901 -0.00271409 -0.23091725 -0.23512403 2.1772287 ]
[-0.3140856 1.6865205 -0.57387353 -0.5366343 -0.17248573]
[ 0.48328124 -0.79944829 2.48261709 2.42380768 0.30918674]
[ 1.16070557 -0.3773381 -0.08693346 -0.09487644 -0.15850472]
[-0.70061986 -0.41524869 -0.16070751 -0.16048066 -0.25656109]]
model.labels_: [3 3 3 ... 3 3 3]
放到excel中,做成雷达图
版权声明:本文为weixin_42764993原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。