
This article is contributed by the "天线宝宝" team. It was my first NLP competition; thanks to the engineering guru (攻城狮大佬) and my own brother for their help with the crawler, and to 冯楠坪 for his help with BERT.
Competition URL:
2019 iFLYTEK AI Developer Contest - iFLYTEK Open Platform (challenge.xfyun.cn)
Background:
Accurate app classification is useful not only for app management but also for competitor analysis, market research, app supervision, anti-cheating, and more. Achieving accurate classification, however, has long been an industry-wide challenge: it involves data crawling, data cleaning, machine learning, and other techniques, and different application domains care about different category taxonomies, so the industry still relies mainly on manual labeling to obtain accurate app categories.
Task:
Given a set of second-level app category labels and a number of randomly sampled labeled apps (encrypted app names and descriptions together with their category labels), contestants must build an app classification algorithm. Each app gets exactly one label: the label corresponding to the app's most dominant attribute.
Analysis:
In the preliminary round the organizers provided the data; in the final round they only released keywords, so contestants had to crawl the data themselves. As the saying goes, "data and features determine the upper bound of machine learning, while algorithms and models merely approach that upper bound."
Crawler:
We mainly crawl Baidu: for each keyword we search for the APP's description text and keep the abstracts of the top 3 results Baidu returns.
import pandas as pd

def get_data_from_bd(name):
    """name: the keyword (app name) to search for."""
    url2 = ''
    try:
        from lxml import html
        import requests
        from requests.utils import quote
        name_new = quote(name)
        # Search Baidu for "<keyword> 描述 安卓" (i.e. "<keyword> description Android")
        url2 = 'http://www.baidu.com/s?wd={}%20描述%20安卓'.format(name_new)
        response = requests.get(url2, timeout=10)
        tree = html.fromstring(response.content)
        # Abstracts of the first three search results
        desc = tree.xpath('//*[@id="1" or @id="2" or @id="3"]//div[@class="c-abstract"]//text()')
        desc = ''.join(desc)
        import re
        # Strip the "YYYY年MM月DD日 - " date prefix Baidu puts in front of abstracts
        desc = re.sub('[0-9]*年[0-9]*月[0-9]*日.-.', '', desc)
        if len(desc) == 0:
            desc = '{},_fix'.format(name)
        res = {
            'name': name,
            'desc_bd': desc,
            'ct': pd.to_datetime('now'),
        }
    except Exception as e:
        print(name, url2)
        print(e)
        # Keywords known to return nothing useful get a fixed placeholder description
        if name in ['ajustaments', 'htripehotel',
                    'penguruskenalan', 'psiphon',
                    'soyeaevbus', 'unidrive']:
            desc = '{},_fix'.format(name)
        else:
            desc = ''
        res = {'name': name, 'ct': pd.to_datetime('now'), 'desc_bd': desc}
    return res
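For completeness, below is a minimal sketch of how this function could be driven over the organizers' keyword list to build the crawled corpus. The keyword file and its name column ('../data/keywords.csv') as well as the output file are hypothetical; the crawled desc_bd text still has to be merged with the labels to produce the train_new_2.csv / test_new.csv files used in the BERT code that follows.

import pandas as pd

# Hypothetical keyword file: one app-name keyword per row in a 'name' column.
keywords = pd.read_csv('../data/keywords.csv')['name'].tolist()

# Crawl one description per keyword and collect the rows into a DataFrame.
rows = [get_data_from_bd(name) for name in keywords]
crawled = pd.DataFrame(rows)
crawled.to_csv('../data/crawled_desc.csv', index=False, encoding='utf8')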
BERT code:
The model is mainly fine-tuned on top of keras-bert. Different pre-trained weights (RoBERTa, bert-wwm-ext, etc.) can be swapped in to improve accuracy, and the features extracted from different BERT layers can also be fed into a second-stage classifier for further training (a rough sketch of that variant is appended at the end of this post).
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import codecs
import gc
import os
import re
import heapq
import keras
import numpy as np
import pandas as pd
from keras import backend as K
from keras import metrics as mt
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

# Path to the pre-trained Chinese BERT weights (swap in RoBERTa / bert-wwm-ext here)
path_drive = '../chinese_L-12_H-768_A-12/'

def top_2_acc(y_true, y_pred):
    """Top-2 categorical accuracy."""
    return mt.top_k_categorical_accuracy(y_true, y_pred, k=2)

gc.collect()
maxlen = 500
config_path = path_drive + 'bert_config.json'
checkpoint_path = path_drive + 'bert_model.ckpt'
dict_path = path_drive + 'vocab.txt'
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')
            else:
                R.append('[UNK]')
        return R
def creat_data(data):
    """Encode a labelled DataFrame into padded BERT inputs and one-hot targets."""
    X1, X2, Y = [], [], []
    indexs = []
    for i in range(len(data)):
        d = data.loc[i, 'content']
        text = d[:maxlen]
        x1, x2 = tokenizer.encode(first=text)
        y = data.loc[i, 'label1']
        if data.loc[i, 'label2'] != -99:
            indexs.append([i, data.loc[i, 'label2']])
        X1.append(x1)
        X2.append(x2)
        Y.append([y])
    X1 = seq_padding(X1)
    X2 = seq_padding(X2)
    Y = seq_padding(Y)
    # fix the number of columns so that folds missing a rare class still line up
    z = to_categorical(Y, num_classes=num_class)
    return [X1, X2], z

def creat_data_test(data):
    """Encode an unlabelled DataFrame into padded BERT inputs."""
    X1, X2 = [], []
    for i in range(len(data)):
        d = data.loc[i, 'content']
        text = d[:maxlen]
        x1, x2 = tokenizer.encode(first=text)
        X1.append(x1)
        X2.append(x2)
    X1 = seq_padding(X1)
    X2 = seq_padding(X2)
    return [X1, X2]

def seq_padding(X, padding=0):
    """Right-pad every sequence in X to the length of the longest one."""
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X])
class data_generator:
    def __init__(self, data, batch_size=10):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = range(len(self.data))
            X1, X2 = [], []
            for i in idxs:
                d = self.data.loc[i, 'content']
                text = d[:maxlen]
                x1, x2 = tokenizer.encode(first=text)
                X1.append(x1)
                X2.append(x2)
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    yield [X1, X2]
                    X1, X2 = [], []
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)
tokenizer = OurTokenizer(token_dict)

# Crawled training and test sets
traindata = pd.read_csv('../data/train_new_2.csv')
testdata = pd.read_csv('../data/test_new.csv')
testdata['content'] = testdata['content'].astype(str)
traindata['label2'] = -1
traindata['content'] = traindata['content'].apply(lambda x: re.sub('"', '', x))
testdata['content'] = testdata['content'].apply(lambda x: re.sub('"', '', x))

# Encode the string labels as consecutive integers
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()
lbl.fit(traindata['label1'].values)
traindata['label1'] = lbl.transform(traindata['label1'].values)
num_class = traindata['label1'].max() + 1
print('num_class', num_class)
def bert_model():
    """Pre-trained BERT + the [CLS] vector + a softmax classification head."""
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
    for l in bert_model.layers:
        l.trainable = True  # fine-tune all BERT layers
    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))
    x = bert_model([x1_in, x2_in])
    x = Lambda(lambda x: x[:, 0])(x)  # take the [CLS] token representation
    p = Dense(num_class, activation='softmax')(x)
    model = Model([x1_in, x2_in], p)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(5e-6),
                  metrics=['accuracy', top_2_acc])
    # model.summary()
    return model
column = 'content'
label = traindata.label1.values
early_stopping = EarlyStopping(monitor='top_2_acc', patience=1)
plateau = ReduceLROnPlateau(monitor="top_2_acc", verbose=1, mode='max', factor=0.3, patience=1)
dtest = creat_data_test(testdata)

# 5-fold training; test predictions are accumulated over the folds
N = 5
stack_train = np.zeros((traindata.shape[0], num_class))
stack_test = np.zeros((testdata.shape[0], num_class))
skf = StratifiedKFold(n_splits=N, shuffle=True, random_state=167)
for i, (tr, va) in enumerate(skf.split(traindata[column], label)):
    print('stack:%d/%d' % ((i + 1), N))
    file_path = "../BERT_MODEL/" + '_' + str(i) + '_' + "_bert__78.hdf"
    train_D = creat_data(traindata.loc[tr].reset_index(drop=True))
    valid_D = creat_data(traindata.loc[va].reset_index(drop=True))
    model = bert_model()
    if not os.path.exists(file_path):
        checkpoint = ModelCheckpoint(file_path, monitor='top_2_acc', verbose=1,
                                     save_best_only=True, mode='max', save_weights_only=True)
        model.fit(train_D[0], train_D[1],
                  validation_data=(valid_D[0], valid_D[1]),
                  epochs=10,
                  batch_size=4,
                  shuffle=True,
                  callbacks=[early_stopping, plateau, checkpoint])
    model.load_weights(file_path)
    score_te = model.predict(dtest, batch_size=12, verbose=1)
    # score_va = model.predict(valid_D[0], batch_size=2, verbose=1)
    gc.collect()
    K.clear_session()
    # stack_train[va] += score_va
    stack_test += score_te
# The two classes with the highest accumulated probability become label1 / label2
results = pd.DataFrame(stack_test)
first = []
second = []
for j, row in results.iterrows():
    zz = list(np.argsort(row))
    first.append(row.index[zz[-1]])
    second.append(row.index[zz[-2]])
results['label1'] = first
results['label2'] = second
try:
    results['label1'] = results['label1'].apply(lambda x: lbl.inverse_transform(int(x)))
    results['label2'] = results['label2'].apply(lambda x: lbl.inverse_transform(int(x)))
except:
    # newer scikit-learn versions require array input for inverse_transform
    results['label1'] = lbl.inverse_transform(results['label1'])
    results['label2'] = lbl.inverse_transform(results['label2'])
pd.concat([testdata[['id']], results[['label1', 'label2']]], axis=1).to_csv('../submit/submit_bert_label_2.csv', index=None, encoding='utf8')
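
As mentioned at the start of the BERT section, instead of only fine-tuning, the features from different BERT layers can be extracted and fed into a second-stage classifier. The snippet below is only a rough sketch of that idea, not the code used in the competition: it assumes keras-bert's Encoder-<n>-FeedForward-Norm layer names (check bert.layers in your version), uses a plain scikit-learn LogisticRegression as the second-stage model, and reuses the paths, tokenizer, helper functions and star imports defined above.

# Rough sketch (assumptions noted above) of "different-layer features + second-stage classifier".
from keras.models import Model
from sklearn.linear_model import LogisticRegression

bert = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

# [CLS] vectors from the last two transformer blocks, concatenated into one feature vector.
# Layer names follow keras-bert's convention; verify them with [l.name for l in bert.layers].
layer_names = ['Encoder-11-FeedForward-Norm', 'Encoder-12-FeedForward-Norm']
outputs = [Lambda(lambda t: t[:, 0])(bert.get_layer(name).output) for name in layer_names]
feature_model = Model(bert.inputs, concatenate(outputs))

X_train, y_train = creat_data(traindata)      # reuse the encoding helpers defined above
X_test = creat_data_test(testdata)
f_train = feature_model.predict(X_train, batch_size=8, verbose=1)
f_test = feature_model.predict(X_test, batch_size=8, verbose=1)

# Second-stage classifier trained on the frozen BERT features
clf = LogisticRegression(max_iter=1000)
clf.fit(f_train, np.argmax(y_train, axis=1))
stack_test_lr = clf.predict_proba(f_test)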