When building a classifier, we usually fuse several classification models to get better performance than any single one. Here the stacking approach is used; for details see the earlier post: https://blog.csdn.net/m0_37548423/article/details/86656070 (it contains an introduction to stacking).
In Python we can use StackingClassifier from mlxtend.classifier to do the model fusion; here LogisticRegression is used both as one of the base learners and as the meta-classifier, as in the code below.
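Before the full pipeline, here is a minimal sketch of the mlxtend StackingClassifier API. The toy data from make_classification and the particular base learners are placeholders for illustration only, not the dataset or models used below:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)  # toy data
stack = StackingClassifier(
    classifiers=[DecisionTreeClassifier(max_depth=3),
                 RandomForestClassifier(n_estimators=50, random_state=0)],  # base learners
    meta_classifier=LogisticRegression(max_iter=1000),  # learns how to combine the base outputs
    use_probas=True)  # feed class probabilities, not hard labels, to the meta-classifier
stack.fit(X, y)
print(stack.predict(X[:5]))

With use_probas=True the meta-classifier sees each base model's predicted class probabilities, which usually carries more information than the hard labels.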
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
#from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier  # model fusion (stacking)
from sklearn import model_selection
def metrics_result(true, predict):  # compute accuracy, precision, recall, F1 and AUC
    acc = metrics.accuracy_score(true, predict)
    pre = metrics.precision_score(true, predict)
    reca = metrics.recall_score(true, predict)
    f_sco = metrics.f1_score(true, predict)
    auc_ = metrics.roc_auc_score(true, predict)
    return acc, pre, reca, f_sco, auc_
def plot_roc(train_label, train_predict, test_label, test_predict):  # plot ROC curves for the train and test splits
    false_positive_rate1, true_positive_rate1, thresholds1 = roc_curve(train_label, train_predict)
    false_positive_rate2, true_positive_rate2, thresholds2 = roc_curve(test_label, test_predict)
    roc_auc1 = auc(false_positive_rate1, true_positive_rate1)
    roc_auc2 = auc(false_positive_rate2, true_positive_rate2)
    plt.title('ROC')
    plt.plot(false_positive_rate1, true_positive_rate1, 'b', label='Train AUC = %0.4f' % roc_auc1)
    plt.plot(false_positive_rate2, true_positive_rate2, 'y', label='Test AUC = %0.4f' % roc_auc2)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('TPR')
    plt.xlabel('FPR')
    plt.show()  # display the figure
data = pd.read_csv('data_imp.csv')  # load data already preprocessed (deletion, imputation, type conversion, normalization)
label = data['status']
data = data.iloc[:, :-1]
# keep the 30 most informative features via recursive feature elimination (returns a NumPy array)
data_ = RFE(estimator=RandomForestClassifier(), n_features_to_select=30).fit_transform(data, label)
ac_train = []
pr_train = []
rec_train = []
f_sc_train = []
au_train = []
ac_test = []
pr_test = []
rec_test = []
f_sc_test = []
au_test = []
skf = StratifiedKFold(n_splits=4)
for train_index, test_index in skf.split(data_, label):
    # index the RFE-selected feature matrix (a NumPy array), not the original DataFrame
    train_data, test_data = data_[train_index], data_[test_index]
    train_label, test_label = label.iloc[train_index], label.iloc[test_index]
    # base learners (StackingClassifier fits them itself, so no separate fit is needed here)
    lr_ = LogisticRegression(C=1.0, max_iter=1000)
    svm_ = svm.SVC(C=1.0, kernel='linear', probability=True)
    dt_ = DecisionTreeClassifier(max_depth=5)
    rf_ = RandomForestClassifier(n_estimators=8, random_state=5, max_depth=6, min_samples_split=2)
    xgb_ = XGBClassifier(n_estimators=8, learning_rate=0.25, max_depth=20, subsample=1, gamma=13, random_state=1000)
    # the key model-fusion step: stack the base learners and combine their predicted
    # probabilities with a logistic regression meta-classifier
    sclf = StackingClassifier(classifiers=[dt_, rf_, svm_, xgb_, lr_],
                              meta_classifier=LogisticRegression(max_iter=1000),
                              use_probas=True)
    sclf.fit(train_data, train_label)  # fit once, then predict on both splits
    train_p = sclf.predict(train_data)
    test_p = sclf.predict(test_data)
    acc_train, pre_train, reca_train, f_sco_train, auc_train = metrics_result(train_label, train_p)
    ac_train.append(acc_train)
    pr_train.append(pre_train)
    rec_train.append(reca_train)
    f_sc_train.append(f_sco_train)
    au_train.append(auc_train)
    acc_test, pre_test, reca_test, f_sco_test, auc_test = metrics_result(test_label, test_p)
    ac_test.append(acc_test)
    pr_test.append(pre_test)
    rec_test.append(reca_test)
    f_sc_test.append(f_sco_test)
    au_test.append(auc_test)
print('accuracy train:{:.4}, accuracy test:{:.4}'.format(np.mean(ac_train), np.mean(ac_test)))
print('precision train:{:.4}, precision test:{:.4}'.format(np.mean(pr_train), np.mean(pr_test)))
print('recall train:{:.4}, recall test:{:.4}'.format(np.mean(rec_train), np.mean(rec_test)))
print('f1 score train:{:.4}, f1 score test:{:.4}'.format(np.mean(f_sc_train), np.mean(f_sc_test)))
print('auc train:{:.4}, auc test:{:.4}'.format(np.mean(au_train), np.mean(au_test)))
plot_roc(train_label, train_p, test_label, test_p)

Output (metrics averaged over the 4 folds):

accuracy train:0.8389, accuracy test:0.7778
precision train:0.7632, precision test:0.5831
recall train:0.522, recall test:0.4106
f1 score train:0.6199, f1 score test:0.4814
auc train:0.7338, auc test:0.6559
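As a cross-check on the manual fold loop above, the averaged test-set metrics can also be obtained more compactly with scikit-learn's cross_validate. This is only a sketch of an alternative, assuming the same data_ / label arrays and a freshly constructed stack with the same base learners; note that the roc_auc scorer uses predicted probabilities, so it will not exactly match the label-based AUC printed above.

from sklearn.model_selection import cross_validate

stack = StackingClassifier(
    classifiers=[DecisionTreeClassifier(max_depth=5),
                 RandomForestClassifier(n_estimators=8, random_state=5, max_depth=6, min_samples_split=2),
                 svm.SVC(C=1.0, kernel='linear', probability=True),
                 XGBClassifier(n_estimators=8, learning_rate=0.25, max_depth=20, subsample=1, gamma=13, random_state=1000),
                 LogisticRegression(C=1.0, max_iter=1000)],
    meta_classifier=LogisticRegression(max_iter=1000),
    use_probas=True)
scores = cross_validate(stack, data_, label,
                        cv=StratifiedKFold(n_splits=4),
                        scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
for name in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']:
    print(name, scores['test_' + name].mean())  # mean of each metric over the 4 test folds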
