在机器学习的特征筛选过程中,为了尽量穷尽所有有用的特征,选择了双向逐步回归的方法。在网上找双向逐步回归的代码时费了一番力气,最终采用了这一篇【python、R如何实现逐步回归(前向、后向、双向)】
在其基础上改动了一下,添加了N(训练集测试集划分依据)、lag、random参数,可用于时序数据机器学习的特征筛选,贴上来以作分享。
################################习惯性import所有可能用到的包
import pandas as pd
import os
import numpy as np
import scipy
import scipy.stats
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
%pylab
%matplotlib inline
matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei'] #指定默认字体
matplotlib.rc("savefig", dpi=400)
from datetime import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
# from pandas import Int64Index as NumericIndex
import winreg
from sklearn.linear_model import Lasso###导入Lasso回归算法(注:Lasso为L1正则,并非岭回归)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
#################################### 逐步回归
def stepwise_select(data, nlist, Y, N, lag=False, random=2, method='forward'):
    """Stepwise feature selection scored by test-set RMSE of an ElasticNetCV pipeline.

    args:
        data: source DataFrame.
        nlist: list of candidate feature column names.
        Y: target column name, str.
        N: train/test split spec. int -> split at row N (ordered split, for
           time series); float -> random split with test fraction N.
        lag: False, or int -- number of autoregressive lag features of Y to
             add as extra candidates (columns lag_1 .. lag_<lag>).
        random: random seed used for the float (shuffled) split.
        method: 'forward' (add features one by one), 'backward' (remove one
                by one) or 'both' (remove, but allow removed features back).
    return:
        select_col: selected feature names, list
        test_r2_score: R^2 on the test set
        train_r2_score: R^2 on the training set
        test_rmse: RMSE on the test set
        train_rmse: RMSE on the training set
    """
    import time  # local import: the file-level imports never bring in `time`

    data_new = data.copy()
    nlist = list(nlist)  # work on copies so the caller's objects stay intact
    if lag is False:
        pass
    elif isinstance(lag, int) and not isinstance(lag, bool):
        # Add lag_1 .. lag_<lag> autoregressive features of the target.
        for i in range(1, lag + 1):
            data_new["lag_{}".format(i)] = data_new[Y].shift(i).fillna(0)
        nlist += ["lag_{}".format(i) for i in range(1, lag + 1)]
    else:
        # Bug fix: the original only printed a warning and then crashed with
        # a NameError on data_new; fail fast with a clear exception instead.
        raise TypeError('lag需要输入int类型')

    labels = data_new[Y]
    # Standardize (with_mean=False keeps sparse compatibility), then
    # ElasticNet with a CV-selected regularization strength.
    cv_model = make_pipeline(StandardScaler(with_mean=False),
                             ElasticNetCV(l1_ratio=0.5, eps=1e-3, n_alphas=200,
                                          fit_intercept=True, precompute='auto',
                                          max_iter=2000, tol=0.006, cv=10,
                                          copy_X=True, verbose=0, n_jobs=-1,
                                          positive=False, random_state=0))

    def _split(cols):
        """Train/test split of data_new[cols] and labels according to N."""
        X = data_new[cols]
        if isinstance(N, int) and not isinstance(N, bool):
            return X[:N], X[N:], labels[:N], labels[N:]
        if isinstance(N, float):
            return train_test_split(X, labels, test_size=N,
                                    random_state=random, shuffle=True)
        # Bug fix: the original silently fell through and crashed later.
        raise TypeError('N需要输入int或float类型')

    def _rmse(cols):
        """Fit the pipeline on the given feature subset; return test RMSE."""
        X_train, X_test, y_train, y_test = _split(cols)
        cv_model.fit(X_train, y_train)
        resid = y_test - cv_model.predict(X_test)
        # np.sqrt replaces the bare `sqrt` that only existed via %pylab.
        return np.linalg.norm(resid, 2) / np.sqrt(len(y_test))

    num = 0
    if method == 'forward':
        # Start from no features; greedily add the feature that lowers the
        # test RMSE most; stop when no addition improves it.
        add_col = []
        best = np.inf
        while nlist:
            num += 1
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                  ' round', num)
            # Bug fix: evaluate the accumulated set PLUS the candidate; the
            # original fit on data_new[col] alone (a 1-D Series), which both
            # breaks sklearn and defeats forward selection.
            rmsError = {col: _rmse(add_col + [col]) for col in nlist}
            key = min(rmsError, key=rmsError.get)
            if rmsError[key] < best:
                nlist.remove(key)
                add_col.append(key)
                best = rmsError[key]
            else:
                break
        select_col = add_col

    elif method == 'backward':
        # Start from all features; greedily drop the feature whose removal
        # lowers the test RMSE most; stop when no removal improves it.
        best = _rmse(nlist)  # baseline with every feature included
        while True:
            num += 1
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                  ' round', num)
            rmsError = {col: _rmse([c for c in nlist if c != col])
                        for col in nlist}
            key = min(rmsError, key=rmsError.get)
            if rmsError[key] < best:
                nlist.remove(key)
                best = rmsError[key]
            else:
                break
        select_col = nlist

    elif method == 'both':
        # Backward elimination where previously removed features may re-enter.
        add_col = []  # features removed so far (re-entry candidates)
        best = _rmse(nlist)
        while True:
            num += 1
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                  ' round', num)
            # Removal candidates: drop one column from the current set.
            rmsError = {col: _rmse([c for c in nlist if c != col])
                        for col in nlist}
            # Re-entry candidates: add back one previously removed column
            # (empty on the first round, matching the original's
            # removal-only first pass).
            for col in add_col:
                rmsError[col] = _rmse(nlist + [col])
            key = min(rmsError, key=rmsError.get)
            if rmsError[key] >= best:
                break
            if key in add_col:
                # Bug fix: the original did set(add_col) - set(key), which
                # subtracts the *characters* of the column name rather than
                # removing the name from the list.
                nlist.append(key)
                add_col.remove(key)
            else:
                nlist.remove(key)
                add_col.append(key)
            best = rmsError[key]
        select_col = list(set(nlist))

    else:
        # Bug fix: an unknown method used to crash later with a NameError.
        raise ValueError("method需要输入'forward'、'backward'或'both'")

    # Refit on the final feature set and report train/test metrics.
    X_train, X_test, y_train, y_test = _split(select_col)
    cv_model.fit(X_train, y_train)
    y_train_pred = cv_model.predict(X_train)
    y_pred = cv_model.predict(X_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # Bug fix: r2_score expects (y_true, y_pred); the original swapped the
    # arguments for the training score (R^2 is not symmetric in them).
    train_r2_score = r2_score(y_train, y_train_pred)
    test_r2_score = r2_score(y_test, y_pred)
    return select_col, test_r2_score, train_r2_score, test_rmse, train_rmse
# Example usage: drop days with anomalous data from data_2, then run
# bidirectional stepwise selection predicting '销量' (sales) with a 30%
# shuffled test split. NOTE(review): data_2 and Xs are assumed to be defined
# earlier in the notebook — confirm before running.
bad_date = [datetime(2020, 1, 26), datetime(2020, 2, 4), datetime(2020, 2, 10),
            datetime(2020, 2, 12), datetime(2020, 2, 17), datetime(2020, 3, 8),
            datetime(2020, 3, 28), datetime(2020, 4, 9), datetime(2020, 4, 16)]
select_col, test_r2_score, train_r2_score, test_rmse, train_rmse = stepwise_select(
    data_2[~data_2['发布日期'].isin(bad_date)], Xs, '销量', 0.3,
    lag=False, random=2, method='both')
# Inspect the selected features
select_col
# Test-set R^2
test_r2_score
# Train-set R^2
train_r2_score
# Test-set RMSE (fix: the original line ended with a stray comma, which made
# the notebook display a 1-tuple instead of the scalar)
test_rmse
# Train-set RMSE
train_rmse
在我的数据中同时存在线性特征和非线性特征,使用弹性网络能够较好地进行预测,因而使用弹性网络做逐步回归,如需使用其他机器学习回归方法,在上面代码中的这一行进行替换即可。
#此处是用pipeline将数据标准化后投入模型,本质上最重要的需要更换的是ElasticNetCV模型及后面的参数,
#如不需进行数据标准化,也可以去掉 make_pipeline(StandardScaler(with_mean=False), ...) 这层包装,仅保留里面的 ElasticNetCV 模型及其参数
# Build the regressor first, then wrap it in a standardization pipeline.
# with_mean=False: scale by std only (keeps sparse-matrix compatibility).
enet = ElasticNetCV(
    l1_ratio=0.5, eps=1e-3, n_alphas=200,
    fit_intercept=True, precompute='auto',
    max_iter=2000, tol=0.006, cv=10, copy_X=True,
    verbose=0, n_jobs=-1, positive=False,
    random_state=0,
)
cv_model = make_pipeline(StandardScaler(with_mean=False), enet)
版权声明:本文为weixin_42490142原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。