【前向、后向、双向逐步回归-应用于弹性网络机器学习】

在机器学习的特征筛选过程中试图保证穷尽所有有用的特征,选择了双向逐步回归的方法,在网上找双向逐步回归的代码时费了一番力气,最终采用了这一篇【python、R如何实现逐步回归(前向、后向、双向)】
在其基础上改动了一下,添加了N(训练集测试集划分依据)、lag、random参数,可用于时序数据机器学习的特征筛选,贴上来以作分享。

################################习惯性import所有可能用到的包
import pandas as pd
import os
import numpy as np
import scipy
import scipy.stats
from statsmodels.formula.api import ols  
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
%pylab
%matplotlib inline
matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei'] #指定默认字体 
matplotlib.rc("savefig", dpi=400)
from datetime import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
# from pandas import Int64Index as NumericIndex
import winreg
from sklearn.linear_model import Lasso###导入岭回归算法
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
#################################### 逐步回归
def stepwise_select(data, nlist, Y, N, lag=False, random=2, method='forward'):
    '''
    Stepwise feature selection (forward / backward / both directions),
    scored by out-of-sample RMSE of a standardized ElasticNetCV model.

    args:
        data:   source DataFrame
        nlist:  candidate feature column names, list of str (not mutated)
        Y:      target column name, str
        N:      test-set spec: int -> split at row N (time-ordered split),
                float -> test_size fraction for a shuffled random split
        lag:    False, or int number of autoregressive lags of Y to append
                as extra candidate features (for time-series data)
        random: random seed for train_test_split (only used when N is float)
        method: 'forward' (add one at a time), 'backward' (drop one at a
                time), or 'both' (bidirectional)
    return:
        select_col:     selected feature names, list
        test_r2_score:  test-set R^2
        train_r2_score: train-set R^2
        test_rmse:      test-set RMSE
        train_rmse:     train-set RMSE
    raises:
        TypeError:  lag is neither False nor int, or N is neither int nor float
        ValueError: method is not one of 'forward'/'backward'/'both'
    '''
    # --- build working frame, optionally adding lagged copies of the target ---
    if lag is False:
        data_new = data.copy()
        nlist = nlist.copy()
    elif isinstance(lag, int):
        data_new = data.copy()
        for i in range(1, lag + 1):
            # lag_i = Y shifted down by i rows; leading gaps filled with 0
            data_new["lag_{}".format(i)] = data_new[Y].shift(i).fillna(0)
        nlist = nlist.copy() + ["lag_{}".format(i) for i in range(1, lag + 1)]
    else:
        # BUG FIX: original printed a warning and then crashed with NameError
        raise TypeError('lag需要输入int类型')
    if method not in ('forward', 'backward', 'both'):
        # BUG FIX: original fell through and hit NameError on select_col
        raise ValueError("method must be 'forward', 'backward' or 'both'")

    labels = data_new[Y]
    cv_model = make_pipeline(StandardScaler(with_mean=False),
                             ElasticNetCV(l1_ratio=0.5, eps=1e-3, n_alphas=200,
                                          fit_intercept=True, precompute='auto',
                                          max_iter=2000, tol=0.006, cv=10, copy_X=True,
                                          verbose=0, n_jobs=-1, positive=False,
                                          random_state=0))

    def _split(xList):
        # int N: time-ordered split at row N; float N: shuffled random split.
        if type(N) == int:
            return xList[:N], xList[N:], labels[:N], labels[N:]
        elif type(N) == float:
            return train_test_split(xList, labels, test_size=N,
                                    random_state=random, shuffle=True)
        # BUG FIX: original silently skipped the split and crashed later
        raise TypeError('N must be int or float')

    def _rmse(cols):
        # Fit on the training part, return RMSE on the held-out part.
        X_train, X_test, y_train, y_test = _split(data_new[cols])
        cv_model.fit(X_train, y_train)
        # BUG FIX: bare sqrt came from the %pylab star-import; use np.sqrt
        return np.linalg.norm(y_test - cv_model.predict(X_test), 2) / np.sqrt(len(y_test))

    def _now():
        # BUG FIX: original called time.strftime but never imported `time`
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    num = 0
    ######################## 1. forward selection
    # Start from no variables; add the single best variable per round until
    # no addition improves the held-out RMSE.
    if method == 'forward':
        add_col = []
        rmsError_None_value = np.inf
        while nlist:
            num += 1
            print(_now(), '    round', num)
            rmsError = {}
            for col in nlist:
                # BUG FIX: original fit on data_new[col] only, discarding the
                # already-selected columns in add_col (X_col was built but unused)
                rmsError[col] = _rmse(add_col + [col])
            rmsError_min_value = min(rmsError.values())
            rmsError_min_key = min(rmsError, key=rmsError.get)
            # keep the candidate only if it beats the current best RMSE
            if rmsError_min_value < rmsError_None_value:
                nlist.remove(rmsError_min_key)
                add_col.append(rmsError_min_key)
                rmsError_None_value = rmsError_min_value
            else:
                break
        select_col = add_col
    ######################## 2. backward elimination
    # Start from all variables; drop the least useful variable per round until
    # no removal improves the held-out RMSE.
    elif method == 'backward':
        rmsError_None_value = _rmse(nlist)  # baseline with every variable
        while True:
            num += 1
            print(_now(), '    round', num)
            rmsError = {col: _rmse([c for c in nlist if c != col]) for col in nlist}
            rmsError_min_value = min(rmsError.values())
            rmsError_min_key = min(rmsError, key=rmsError.get)
            # drop the variable only if removing it beats the current RMSE
            if rmsError_min_value < rmsError_None_value:
                nlist.remove(rmsError_min_key)
                rmsError_None_value = rmsError_min_value
            else:
                break
        select_col = nlist
    ######################## 3. bidirectional selection
    # Each round considers both dropping any kept variable and re-adding any
    # previously dropped one; apply whichever single move lowers RMSE most.
    elif method == 'both':
        add_col = []  # variables dropped so far (re-addition candidates)
        rmsError_None_value = _rmse(nlist)  # baseline with every variable
        while True:
            num += 1
            print(_now(), '    round', num)
            # candidate moves: drop one kept column ...
            rmsError = {col: _rmse([c for c in nlist if c != col]) for col in nlist}
            # ... and (after the first round) re-add one dropped column.
            # A name lives in exactly one of nlist/add_col, so keys never collide.
            for col in add_col:
                rmsError[col] = _rmse(nlist + [col])
            rmsError_min_value = min(rmsError.values())
            rmsError_min_key = min(rmsError, key=rmsError.get)
            if rmsError_min_value >= rmsError_None_value:
                break
            if rmsError_min_key in add_col:
                # best move is re-adding a previously dropped variable
                nlist.append(rmsError_min_key)
                # BUG FIX: original did set(add_col) - set(key), which subtracts
                # the *characters* of the key string and corrupted add_col
                add_col.remove(rmsError_min_key)
            else:
                # best move is dropping a kept variable
                nlist.remove(rmsError_min_key)
                add_col.append(rmsError_min_key)
            rmsError_None_value = rmsError_min_value
        select_col = list(set(nlist))
    ######################## final model on the selected columns
    X_train, X_test, y_train, y_test = _split(data_new[select_col])
    cv_model.fit(X_train, y_train)
    y_train_pred = cv_model.predict(X_train)
    y_pred = cv_model.predict(X_test)
    # RMSE (mean_squared_error is symmetric in its arguments)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # BUG FIX: r2_score takes (y_true, y_pred); the training R^2 was reversed
    train_r2_score = r2_score(y_train, y_train_pred)
    test_r2_score = r2_score(y_test, y_pred)
    return select_col, test_r2_score, train_r2_score, test_rmse, train_rmse
# Example usage
# Exclude days with abnormal data from data_2 before fitting
bad_date=[datetime(2020,1,26),datetime(2020,2,4),datetime(2020,2,10),
		datetime(2020,2,12),datetime(2020,2,17),datetime(2020,3,8),
		datetime(2020,3,28),datetime(2020,4,9),datetime(2020,4,16)]

# Bidirectional stepwise selection; 30% of rows held out as a shuffled test set
select_col,test_r2_score,train_r2_score,test_rmse,train_rmse=stepwise_select(data_2[~data_2['发布日期'].isin (bad_date)],Xs,'销量',0.3,lag=False,random=2,method='both')
# Inspect the selected features
select_col
# Inspect test-set R^2
test_r2_score
# Inspect train-set R^2
train_r2_score
# Inspect test-set RMSE
# BUG FIX: original had a stray trailing comma, turning this into a tuple
test_rmse
# Inspect train-set RMSE
train_rmse

在我的数据中同时存在线性特征和非线性特征,使用弹性网络能够较好地进行预测,因而使用弹性网络做逐步回归,如需使用其他机器学习回归方法,在上面代码中的这一行进行替换即可。

#此处是用pipeline将数据标准化后投入模型,本质上最重要的需要更换的是ElasticNetCV模型及后面的参数,
#如不需进行数据标准化,也可以删除掉make_pipeline(StandardScaler(with_mean=False), ...)这层包装,仅保留后面的模型信息

    cv_model = make_pipeline(StandardScaler(with_mean=False), ElasticNetCV(l1_ratio=0.5, eps=1e-3, n_alphas=200,
                                                                               fit_intercept=True, precompute='auto',
                                                                               max_iter=2000, tol=0.006, cv=10, copy_X=True,
                                                                               verbose=0, n_jobs=-1, positive=False, 
                                                                               random_state=0))
                                                                               

版权声明:本文为weixin_42490142原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。