sklearn-二手汽车交易价格预测

任务

以二手车市场为背景,要求预测二手汽车的交易价格。
题目地址

https://tianchi.aliyun.com/competition/entrance/231784/information

该数据来自某交易平台的二手车交易记录,总数据量超过40w,包含31列变量信息,其中15列为匿名变量。为了保证比赛的公平性,将会从中抽取15万条作为训练集,5万条作为测试集A,5万条作为测试集B,同时会对name、model、brand和regionCode等信息进行脱敏。

数据描述

字段表
Field Description
SaleID 交易ID,唯一编码
name 汽车交易名称,已脱敏
regDate 汽车注册日期,例如20160101,2016年01月01日
model 车型编码,已脱敏
brand 汽车品牌,已脱敏
bodyType 车身类型:豪华轿车:0,微型车:1,厢型车:2,大巴车:3,敞篷车:4,双门汽车:5,商务车:6,搅拌车:7
fuelType 燃油类型:汽油:0,柴油:1,液化石油气:2,天然气:3,混合动力:4,其他:5,电动:6
gearbox 变速箱:手动:0,自动:1
power 发动机功率:范围 [ 0, 600 ]
kilometer 汽车已行驶公里,单位万km
notRepairedDamage 汽车有尚未修复的损坏:是:0,否:1
regionCode 地区编码,已脱敏
seller 销售方:个体:0,非个体:1
offerType 报价类型:提供:0,请求:1
creatDate 汽车上线时间,即开始售卖时间
price 二手车交易价格(预测目标)
v系列特征 匿名特征,包含v0-14在内15个匿名特征

评测标准

评价标准为MAE(Mean Absolute Error)。
MAE 的计算公式为:$\text{MAE} = \frac{1}{n}\sum_{i=1}^{n}\left|y_i - \hat{y}_i\right|$,其中 $y_i$ 为真实值,$\hat{y}_i$ 为预测值。
MAE越小,说明模型预测得越准确。

代码

回归预测基本流程

import pandas as pd
# Load the competition data. The CSVs are space-separated, so the default
# comma delimiter must be overridden.
train_data = pd.read_csv('train.csv', sep = " ") # sep: field delimiter, pandas default is ","
# Optional subsampling toggle for quick experiments (150k rows total):
#train_data = train_data[:50000]
test_data = pd.read_csv('test.csv', sep = " ")

def preView(data):
    """Print a quick overview of ``data`` and report its missing values.

    Columns that are at least 60% missing are dropped from ``data`` in
    place. Returns the list of column names that have some (but < 60%)
    missing values, so callers can impute them afterwards.

    Bug fix: the original body read and mutated the module-level
    ``train_data`` instead of the ``data`` parameter, so calling
    ``preView(test_data)`` silently re-inspected (and re-mutated) the
    training set, and ``miss_cols`` was lost as a function local.
    """
    print("data.shape", data.shape)
    print("data.columns", data.columns)
    # Split columns into numerical vs. categorical by dtype.
    numericalCols = []
    categoricalCols = []
    for col in data.columns:
        if data[col].dtype == 'object':
            categoricalCols.append(col)
        else:
            numericalCols.append(col)

    # Report per-column missingness; drop columns that are >= 60% missing.
    miss_cols = []
    # Iterate a copy of the column list because we may drop columns mid-loop.
    for col in list(data.columns):
        missSum = data[col].isnull().sum()
        missRatio = 100 * missSum / data.shape[0]
        if missRatio >= 60:
            data.drop(col, axis=1, inplace=True)
        elif missRatio > 0:
            miss_cols.append(col)
            if col in numericalCols:
                print("numericalCols:{} :缺失数 {} ,占比 :{:.1f}%".format(col, missSum, missRatio))
            else:
                print("categoricalCols:{} :缺失数 {} ,占比 :{:.1f}%".format(col, missSum, missRatio))

    print("categoricalCols : ", len(categoricalCols))
    print("numericalCols : ", len(numericalCols))
    return miss_cols

# Overview and missing-value report for both data sets.
preView(train_data)
preView(test_data)

# Inspect how strongly each feature correlates with the target price.
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): seaborn/matplotlib are imported but unused in this snippet,
# and the corr() expression's result is neither captured nor printed -- it
# only displays in an interactive notebook, and is a no-op in a script.
abs(train_data.corr()['price']).sort_values(ascending = False)

"""numericalCols:model :缺失数 1 ,占比 :0.0%
numericalCols:bodyType :缺失数 4506 ,占比 :3.0%
numericalCols:fuelType :缺失数 8680 ,占比 :5.8%
numericalCols:gearbox :缺失数 5981 ,占比 :4.0%"""
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import LabelEncoder

# 缺失值处理
def medianImputer(data):
    """Fill every column that contains NaN with that column's median.

    Returns ``data`` (also modified in place).

    Bug fix: the original indexed ``data[miss_cols]`` with a module-level
    ``miss_cols`` that is never defined at module scope (it was a local of
    ``preView``), raising ``NameError``. Derive the NaN columns from the
    frame itself instead; ``fillna(median)`` is equivalent to
    ``SimpleImputer(strategy='median')`` for numeric columns.
    """
    miss_cols = data.columns[data.isnull().any()]
    data[miss_cols] = data[miss_cols].fillna(data[miss_cols].median())
    return data

def knnImputer(data):
    """Fill missing values with sklearn's KNNImputer (default k=5 neighbors).

    Returns ``data`` (also modified in place).

    Bug fix: the original indexed ``data[miss_cols]`` with a module-level
    ``miss_cols`` that is never defined at module scope, raising
    ``NameError``. Compute the NaN-bearing columns from the frame itself
    and skip the imputer entirely when nothing is missing.

    NOTE(review): imputing only the NaN columns means the KNN distance is
    computed from those columns alone, not the full feature set -- confirm
    whether that was intended.
    """
    miss_cols = data.columns[data.isnull().any()]
    if len(miss_cols) == 0:
        return data
    imputer = KNNImputer()
    data[miss_cols] = imputer.fit_transform(data[miss_cols])
    return data

def preprocesssing(data):
    """Basic feature preprocessing applied to a train or test frame.

    Drops the row ID, imputes missing values, reduces regDate to a year,
    and label-encodes creatDate and notRepairedDamage. Returns a new frame.

    NOTE(review): a fresh LabelEncoder is fitted on each call, so running
    this separately on train and test can assign different integer codes to
    the same raw value whenever the two sets contain different value sets --
    confirm, or fit shared encoders on the combined data.
    """
    # Drop the row identifier; it carries no predictive signal.
    data = data.drop(['SaleID'],axis = 1)
    
    # Fill missing values with per-column medians.
    data = medianImputer(data)
    
    # Two date features:
    # regDate is an integer like 20160101; keep only the year part.
    data['regYear'] = round(data['regDate']/10000,0)
    data = data.drop(['regDate'], axis = 1)
    # creatDate has only a few dozen distinct values; label-encode it so it
    # can be standardized later along with the other numeric columns.
    encoder = LabelEncoder()
    data['creatDate'] = encoder.fit_transform(data['creatDate'])
    # notRepairedDamage is the only object-typed column; encode it as well.
    data['notRepairedDamage'] = encoder.fit_transform(data['notRepairedDamage'])
    return data

# Apply the same preprocessing pipeline to both data sets.
train_data = preprocesssing(train_data)
test_data = preprocesssing(test_data)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
import numpy as np  # bug fix: np.log1p was used below but numpy was never imported

X = train_data.drop(['price'],axis = 1)
y = train_data['price']
# Standardize the features.
# Bug fix: fit the scaler on the training data only and reuse its fitted
# mean/std for the test set -- the original called fit_transform on
# test_data, scaling train and test with different statistics.
scaler = StandardScaler()
X = scaler.fit_transform(X)
test_data = scaler.transform(test_data)
# price has a heavy right tail; compress it with log1p (inverted later
# with np.expm1 before submission).
y = np.log1p(y)

from sklearn.metrics import make_scorer,mean_absolute_error

def mae_cv(model):
    """Return the 10-fold cross-validated MAE scores for ``model``.

    Uses the module-level feature matrix ``X`` and target ``y``. Note that
    ``cross_validate`` clones and refits the estimator on each fold.
    """
    scorer = make_scorer(mean_absolute_error)
    cv_result = cross_validate(model, X, y, scoring=scorer, cv=10)
    return cv_result['test_score']

from sklearn.decomposition import PCA
# Reduce to 10 principal components: keeps most of the useful variance
# while shrinking the feature space so the models below converge faster.
pca = PCA(n_components = 10)
pca_X = pca.fit_transform(X)

from sklearn.linear_model import Lasso,Ridge,ElasticNet,LinearRegression,SGDRegressor

# Fit a family of linear baselines on the PCA-reduced features and print
# 10-fold CV MAE for each.
# NOTE(review): cross_validate clones the estimator and refits it inside
# mae_cv on the module-level X (the full standardized feature matrix), so
# the .fit(pca_X, y) calls below do not influence the printed scores --
# confirm whether the CV was meant to run on pca_X instead. The reported
# MAE is also on the log1p(price) scale, not raw prices.
LR = LinearRegression()
LR = LR.fit(pca_X, y)
print(mae_cv(LR))

lasso = Lasso(alpha = 0.1)
lasso = lasso.fit(pca_X, y)
print(mae_cv(lasso))

ridge = Ridge(alpha = 0.1)
ridge = ridge.fit(pca_X, y)
print(mae_cv(ridge))

elastic = ElasticNet(random_state=0)
elastic = elastic.fit(pca_X, y)
print(mae_cv(elastic))

SGD = SGDRegressor()
SGD = SGD.fit(pca_X, y)
print(mae_cv(SGD))

# Gradient boosting: with this much data and many boosting iterations,
# this step is slow.
from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor()
GBR = GBR.fit(pca_X, y)
print(mae_cv(GBR))

import xgboost as xgb
XGBR = xgb.XGBRegressor()
# Bug fix: the original fit on an undefined name `x_train` (NameError).
# Train on the same PCA-reduced features used for the other models.
XGBR = XGBR.fit(pca_X, y)

# Predict the test set with the trained model and write the submission file.
# Bug fixes: `model` was never defined (use the last-trained XGBR), and the
# estimator was fitted on PCA-reduced features, so the scaled test data must
# pass through the same fitted PCA before predicting.
import numpy as np

y_pred = XGBR.predict(pca.transform(test_data))
# Undo the log1p applied to the target during training.
y_pred = np.expm1(y_pred)
predict = pd.DataFrame()
# Test-set SaleIDs are 150000..199999 (50k rows) per the competition split.
predict['SaleID'] = range(150000,200000)
predict['price'] = y_pred
predict.to_csv('predict.csv',index = None)

版权声明:本文为qq_43402639原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。