标签噪音数据集分类------ Adasampling in Python

Adasampling 方法于Adaboosting相似，Adaboosting中心思想是对于多次训练并不是很好得数据，但这种方法并不适用于带有label noise数据，因为模型对于带有label noise sample 训练一般都不是很好，多次训练noisy sample只会对模型造成相反作用。而Adasampling而是降低取训练差样本概率，这样大概率可以剔除mislabel sample，增加模型准确率。以下是在python手撸 Adasampling。

def singleIter(Ps, Ns, dat, test, cls_test, pos_probs, una_probs, classifier, sampleFactor):
"""
input:
Ps and Ns: postive class and nagetive class index in the array
dat: trainning dataset (not including the label)
test: test datset (not including the label)
cls_test： test label
pos_probs and una_probs: probability of postive and nagetive sample been labelled corretly
classifier: 'logistic' or 'SVM'
sampleFactor: control the sample size for each iterate

output:
logit_pred: predicted test class
logit_pred_prob: the probability of each sample been label correctly
idx_pl and idx_dl: the index of positive class and nagetive class for next iteration.
accuracy: accuracy of model
"""
positive_train = []
positive_cls = []
# determine the proper sample size for creating a balanced dataset
sampleN = len(Ps) if len(Ps)<len(Ns) else len(Ns)


# bootstrap sampling to build the positive training set (labeled as '0')&remove duplicates

idx_pl = np.unique(np.random.choice(Ps, size=sampleFactor*sampleN, replace=True, p= ((pos_probs)/np.sum(pos_probs))))

positive_train = dat[idx_pl,]
positive_cls = np.repeat(0, len(positive_train), axis=0)
# bootstrap sampling to build the "unannotate" or "negative" training set (labeled as '1')
idx_dl = np.unique(np.random.choice(Ns, size=sampleFactor*sampleN, replace=True, p= ((una_probs)/np.sum(una_probs))))
unannotate_train = dat[idx_dl,]
unannotate_cls = np.repeat(1, len(unannotate_train), axis=0)
# combine data
train_sample = np.vstack((positive_train, unannotate_train))
positive_cls = positive_cls.tolist()
unannotate_cls = unannotate_cls.tolist()
cls = positive_cls+unannotate_cls
cls = np.array(cls).reshape(len(cls),)
cls_test = cls_test.flatten()
#train
if classifier =='logistic':
if len(test) == len(dat):
logit_model = LogisticRegression(C=0.001, penalty = 'l2')
logit_model.fit(train_sample, cls)
logit_pred_prob = logit_model.predict_proba(train_sample)
logit_pred = logit_model.predict(train_sample)
accuracy = accuracy_score(cls, logit_pred)
else:
logit_model = LogisticRegression(C=0.001, penalty = 'l2')
logit_model.fit(train_sample, cls)
logit_pred_prob = logit_model.predict_proba(test)
logit_pred = logit_model.predict(test)
accuracy = accuracy_score(cls_test, logit_pred)

if classifier == 'SVM':
if len(test) == len(dat):
logit_model = SVC(probability=True)
logit_model.fit(train_sample, cls)
logit_pred_prob = logit_model.predict_proba(train_sample)
logit_pred = logit_model.predict(train_sample)
accuracy = accuracy_score(cls, logit_pred)
else:
logit_model = SVC(probability=True)
logit_model.fit(train_sample, cls)
logit_pred_prob = logit_model.predict_proba(test)
logit_pred = logit_model.predict(test)
accuracy = accuracy_score(cls_test, logit_pred)
return logit_pred,logit_pred_prob,idx_pl,idx_dl,accuracy
def adaSample(Ps, Ns, train_mat, test_mat,cls_test,classifier, sampleFactor):
"""
iterating singleIter fuction and updating probabilities.
"""
pos_probs = np.repeat(1, len(Ps), axis=0)
una_probs = np.repeat(1, len(Ns), axis=0)

pos_num = np.array(Ps).reshape(len(Ps),1)
una_num = np.array(Ns).reshape(len(Ns),1)
#contain all probs
pos_probs_1 = pos_probs.reshape(pos_probs.shape[0],1)
pos_probs_1 = np.hstack((pos_num,pos_probs_1))
pos_probs_1 = pos_probs_1.astype(float)

una_probs_1 = una_probs.reshape(una_probs.shape[0],1)
una_probs_1 = np.hstack((una_num,una_probs_1))
una_probs_1 = una_probs_1.astype(float)
for i in range(10):
logit_pred,logit_pred_prob,idx_pl,idx_dl,accuracy = singleIter(Ps=Ps, Ns=Ns, dat=train_mat, test=train_mat,cls_test = cls_test, pos_probs=pos_probs, una_probs=una_probs, classifier=classifier, sampleFactor=1)
pos = logit_pred_prob[:len(idx_pl),0]
una = logit_pred_prob[len(idx_pl):len(idx_pl)+len(idx_dl),1]

pos = pos.reshape(pos.shape[0],1)
idx_pl = idx_pl.reshape(idx_pl.shape[0],1)

# the probs of postive instances with index
pos_1 = np.hstack((idx_pl,pos))

una = una.reshape(una.shape[0],1)
idx_dl = idx_dl.reshape(idx_dl.shape[0],1)

# the probs of negative instances with index
una_1 = np.hstack((idx_dl,una))


#cover porbs obtained from logistic to pos_probs_1
pos_1 = pd.DataFrame(pos_1)
pos_probs_1 = pd.DataFrame(pos_probs_1)
for i in range(len(pos_1)):
if pos_1[0][i].tolist() in pos_probs_1[0].tolist():
c = pos_probs_1[pos_probs_1[0].isin([pos_1[0][i]])].index.tolist()
pos_probs_1[1][c] = pos_1[1][i]

pos_probs_1 = np.array(pos_probs_1)
pos_1 = np.array(pos_1)
#cover porbs obtained from logistic to una_probs_1
una_1 = pd.DataFrame(una_1)
una_probs_1 = pd.DataFrame(una_probs_1)
for i in range(len(una_1)):
if una_1[0][i].tolist() in una_probs_1[0].tolist():
c = una_probs_1[una_probs_1[0].isin([una_1[0][i]])].index.tolist()
una_probs_1[1][c] = una_1[1][i]

una_probs_1 = np.array(una_probs_1)
una_1 = np.array(una_1)

#slice pos_probs to obtain prob only
pos_probs = pos_probs_1[:,1]
pos_probs = pos_probs.reshape(pos_probs.shape[0],)

#slice una_probs to obtain prb only
una_probs = una_probs_1[:,1]
una_probs = una_probs.reshape(una_probs.shape[0],)
logit_pred,logit_pred_prob,idx_pl,idx_dl,accuracy = singleIter(Ps=Ps, Ns=Ns, dat=train_mat, test=test_mat,cls_test = cls_test,pos_probs=pos_probs, una_probs=una_probs, classifier=classifier,sampleFactor=1)
return logit_pred,logit_pred_prob,idx_pl,idx_dl,accuracy,pos_probs,una_probs

原文链接：https://blog.csdn.net/ryan_8992625a/article/details/83419993