# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy # linear algebra
import pandas # data processing, CSV file I/O (e.g. pandas.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
train_data = pandas.read_csv('/kaggle/input/titanic/train.csv')
train_data.head()
train_data.info()
# Impute missing Age values with the training-set median
age_median = train_data["Age"].median()
train_data["Age"] = train_data["Age"].fillna(age_median)
train_data.info()
# Separate the training-set predictors from the label
train_survived = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch", "Age"]
train_features = pandas.get_dummies(train_data[features])
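# get_dummies expands the categorical "Sex" column into Sex_female / Sex_male indicator
# columns and leaves the numeric columns unchanged; a quick look at the result:
train_features.head()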
# Train a random forest model
from sklearn.ensemble import RandomForestClassifier
forest_model = RandomForestClassifier(random_state = 24)
forest_model.fit(train_features, train_survived)
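# Optional sanity check (a small sketch, not part of the original workflow): estimate the
# accuracy of the untuned forest with 3-fold cross-validation before tuning it below.
from sklearn.model_selection import cross_val_score
cross_val_score(forest_model, train_features, train_survived, cv = 3, scoring = "accuracy").mean()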
# Randomized search over the model's hyperparameters
from sklearn.model_selection import RandomizedSearchCV
# Randomly search for the best parameters
param_rand = [
    {"n_estimators": range(10, 1000, 20), "max_depth": range(1, 6), "bootstrap": [True, False]}
]
random_search = RandomizedSearchCV(forest_model, param_distributions = param_rand, n_iter = 20, cv = 3)
random_search.fit(train_features, train_survived)
forest_final_model = random_search.best_estimator_
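# Inspect which parameter combination the search selected and its mean cross-validated
# score (best_params_ and best_score_ are standard RandomizedSearchCV attributes).
print(random_search.best_params_)
print(random_search.best_score_)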
# Perform K-fold cross-validation
from sklearn.model_selection import cross_val_predict
# ROC curve: get out-of-fold survival probabilities with 3-fold cross-validation
train_probas_forest = cross_val_predict(forest_final_model, train_features, train_survived, cv = 3, method = "predict_proba")
train_scores_forest = train_probas_forest[:, 1]  # probability of the positive class (Survived = 1)
from sklearn.metrics import roc_curve
fpr_forest, tpr_forest, thresholds_forest = roc_curve(train_survived, train_scores_forest)
import matplotlib.pyplot
def plot_roc_curve(fpr, tpr, label = None):
    matplotlib.pyplot.plot(fpr, tpr, linewidth = 2, label = label)
    matplotlib.pyplot.plot([0, 1], [0, 1], "k--")  # dashed diagonal = random classifier
    # axis limits and labels, filling in the formatting the original elided with "[...]"
    matplotlib.pyplot.axis([0, 1, 0, 1])
    matplotlib.pyplot.xlabel("False Positive Rate")
    matplotlib.pyplot.ylabel("True Positive Rate")
plot_roc_curve(fpr_forest, tpr_forest)
matplotlib.pyplot.show()
# ROC AUC score
from sklearn.metrics import roc_auc_score
roc_auc_score(train_survived, train_scores_forest)
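# The AUC above uses out-of-fold probabilities, so it is a fair estimate of generalization.
# For reference (an extra check, not in the original), the in-sample AUC of the refit
# best estimator is typically optimistically higher:
roc_auc_score(train_survived, forest_final_model.predict_proba(train_features)[:, 1])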
# Train a linear SGD classifier as a baseline for comparison
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier(random_state = 24)
sgd_model.fit(train_features, train_survived)
train_scores_sgd = cross_val_predict(sgd_model, train_features, train_survived, cv = 3, method = "decision_function")
fpr_sgd, tpr_sgd, thresholds_sgd = roc_curve(train_survived, train_scores_sgd)
plot_roc_curve(fpr_sgd, tpr_sgd)
matplotlib.pyplot.show()
roc_auc_score(train_survived, train_scores_sgd)
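# Overlay both ROC curves on one figure to compare the models directly
# (a small sketch reusing the fpr/tpr arrays computed above).
plot_roc_curve(fpr_forest, tpr_forest, label = "Random Forest")
plot_roc_curve(fpr_sgd, tpr_sgd, label = "SGD")
matplotlib.pyplot.legend(loc = "lower right")
matplotlib.pyplot.show()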
test_data = pandas.read_csv('/kaggle/input/titanic/test.csv')
test_data.head()
test_data.info()
test_data["Age"].fillna(age_median, inplace = True)
test_features = pandas.get_dummies(test_data[features])
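# Defensive step (an addition, not in the original): make sure the test dummies have the
# same columns, in the same order, as the training features. For this feature set the
# columns already match, but reindex fills any category missing from the test set with 0.
test_features = test_features.reindex(columns = train_features.columns, fill_value = 0)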
final_predictions = forest_final_model.predict(test_features)
csv_out = pandas.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': final_predictions})
csv_out.to_csv('my_submission.csv', index = False)
print("Your submission was successfully saved!")
# Copyright notice: this is an original article by wu_xying, licensed under CC 4.0 BY-SA.
# Please include a link to the original source and this notice when reposting.