Kaggle Categorical Feature Encoding Challenge

# End-to-end script for the Kaggle "Categorical Feature Encoding Challenge":
# load the data, encode the categorical columns, fit a logistic regression
# and write the submission file `sub.csv`.
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Keep the test ids for the submission and the labels for training, then
# remove both from the feature frames.
id_test = df_test['id']
y_train = df_train['target']
df_train.drop(['id', 'target'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Split `ord_5` into two extra columns holding its first and second character.
for frame in (df_train, df_test):
    frame['ord_6'] = frame['ord_5'].str[0]
    frame['ord_7'] = frame['ord_5'].str[1]

# Columns whose name contains 'ord', in frame order. The first three of them
# are one-hot encoded together with every non-'ord' column; the remaining ones
# are ordinal-encoded and scaled instead.
ord_like = [col for col in df_train.columns if 'ord' in col]
con = [col for col in df_train.columns if 'ord' not in col] + ord_like[:3]
obj = ord_like[3:]


def _stacked(columns):
    """Return the train and test values of *columns* stacked row-wise."""
    return np.concatenate(
        [df_train[columns].to_numpy(), df_test[columns].to_numpy()]
    )


# Ordinal-encode the `obj` columns; the encoder is fitted on train + test so
# that every category occurring in either set receives a code.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(_stacked(obj))
train_ord = ordinal_encoder.transform(df_train[obj].to_numpy())
test_ord = ordinal_encoder.transform(df_test[obj].to_numpy())

# Rescale the ordinal codes to [0, 1], again using the combined range.
scaler = MinMaxScaler()
scaler.fit(np.concatenate([train_ord, test_ord]))
# Explicit index/columns make the assignment independent of implicit
# positional column matching.
df_train[obj] = pd.DataFrame(
    scaler.transform(train_ord), index=df_train.index, columns=obj
)
df_test[obj] = pd.DataFrame(
    scaler.transform(test_ord), index=df_test.index, columns=obj
)

# One-hot encode the `con` columns. The encoder's default output is already a
# sparse matrix; the old `sparse=` keyword is not passed because it was renamed
# to `sparse_output` in newer scikit-learn releases.
hot = OneHotEncoder()
hot.fit(_stacked(con))
train = hot.transform(df_train[con].to_numpy())
test = hot.transform(df_test[con].to_numpy())

df_train.drop(con, axis=1, inplace=True)
df_test.drop(con, axis=1, inplace=True)

# Final design matrices: one-hot block followed by the scaled ordinal columns.
X_train = scipy.sparse.hstack(
    [train, scipy.sparse.coo_matrix(df_train.to_numpy(dtype=float))]
).tocsr()
X_test = scipy.sparse.hstack(
    [test, scipy.sparse.coo_matrix(df_test.to_numpy(dtype=float))]
).tocsr()

model = LogisticRegression(
    penalty='l2',
    C=0.1217,
    class_weight=None,
    random_state=42,
    solver='lbfgs',
    max_iter=2011,
    verbose=0,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
submission = pd.DataFrame(
    {'id': id_test.to_numpy(), 'target': model.predict_proba(X_test)[:, 1]}
)
submission.to_csv('sub.csv', index=False)

Score: 0.80278


版权声明:本文为ts_sky原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。