Classifying a Dataset with Label Noise: AdaSampling in Python

The AdaSampling method is similar to AdaBoost. AdaBoost's central idea is to put more weight on the samples that the model handles poorly across rounds of training, but that strategy does not suit data with label noise: a model usually fits noisy samples poorly anyway, so repeatedly emphasising them only hurts it. AdaSampling does the opposite and lowers the probability of sampling the poorly-fit examples, which with high probability excludes mislabelled samples and improves model accuracy. Below is a hand-rolled implementation of AdaSampling in Python.
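
Before the full implementation, here is a minimal sketch of the core resampling step; the names clean_prob, weights, and rng are illustrative and do not appear in the implementation below. Each sample's estimated probability of being correctly labelled is normalised into a sampling weight, so likely-mislabelled samples are rarely drawn into the next training set.

import numpy as np

# illustrative only: clean_prob[i] is the model's current estimate that sample i
# carries a correct label; samples 2 and 4 look mislabelled here
clean_prob = np.array([0.90, 0.85, 0.05, 0.95, 0.10])
weights = clean_prob / clean_prob.sum()     # normalise into sampling probabilities

rng = np.random.default_rng(0)
# weighted bootstrap: low-weight (likely noisy) samples are rarely selected
idx = np.unique(rng.choice(len(clean_prob), size=len(clean_prob), replace=True, p=weights))
print(idx)                                  # indices that survive into the next training round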



import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


def singleIter(Ps, Ns, dat, test, cls_test, pos_probs, una_probs, classifier, sampleFactor):
    """
    input: 
    Ps and Ns: postive class and nagetive class index in the array
    dat: trainning dataset (not including the label)
    test: test datset (not including the label)
    cls_test: test label
    pos_probs and una_probs: probability of postive and nagetive sample been labelled corretly
    classifier: 'logistic' or 'SVM'
    sampleFactor: control the sample size for each iterate
    
    output:
    logit_pred: predicted test class
    logit_pred_prob: the probability of each sample been label correctly
    idx_pl and idx_dl: the index of positive class and nagetive class for next iteration.
    accuracy: accuracy of model
    """
    # determine the proper sample size for creating a balanced dataset
    sampleN = min(len(Ps), len(Ns))

    # bootstrap sampling to build the positive training set (labelled '0'), removing duplicates
    idx_pl = np.unique(np.random.choice(Ps, size=sampleFactor * sampleN, replace=True,
                                        p=pos_probs / np.sum(pos_probs)))
    positive_train = dat[idx_pl, :]
    positive_cls = np.zeros(len(positive_train), dtype=int)

    # bootstrap sampling to build the "unannotated" or "negative" training set (labelled '1')
    idx_dl = np.unique(np.random.choice(Ns, size=sampleFactor * sampleN, replace=True,
                                        p=una_probs / np.sum(una_probs)))
    unannotate_train = dat[idx_dl, :]
    unannotate_cls = np.ones(len(unannotate_train), dtype=int)

    # combine positive and negative samples into one balanced training set
    train_sample = np.vstack((positive_train, unannotate_train))
    cls = np.concatenate((positive_cls, unannotate_cls))
    cls_test = np.asarray(cls_test).flatten()
    # train the chosen classifier
    if classifier == 'logistic':
        model = LogisticRegression(C=0.001, penalty='l2')
    elif classifier == 'SVM':
        model = SVC(probability=True)
    else:
        raise ValueError("classifier must be 'logistic' or 'SVM'")
    model.fit(train_sample, cls)

    if len(test) == len(dat):
        # intermediate iterations pass the training matrix as `test`,
        # so score the resampled training set itself
        logit_pred_prob = model.predict_proba(train_sample)
        logit_pred = model.predict(train_sample)
        accuracy = accuracy_score(cls, logit_pred)
    else:
        # final iteration: predict and score on the held-out test set
        logit_pred_prob = model.predict_proba(test)
        logit_pred = model.predict(test)
        accuracy = accuracy_score(cls_test, logit_pred)

    return logit_pred, logit_pred_prob, idx_pl, idx_dl, accuracy
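
For reference, a hypothetical call to singleIter on toy data (assuming the imports above); the shapes and the 0/1 label encoding (0 = positive, 1 = negative/unannotated, as assigned inside the function) are illustrative assumptions, not from the original post.

rng = np.random.default_rng(1)
dat = rng.normal(size=(20, 3))            # 20 samples, 3 features
Ps = np.arange(0, 10)                     # indices of positive samples in dat
Ns = np.arange(10, 20)                    # indices of negative / unannotated samples
cls_test = np.array([0] * 10 + [1] * 10)  # 0 = positive, 1 = negative, matching singleIter

pred, pred_prob, idx_pl, idx_dl, acc = singleIter(
    Ps, Ns, dat, test=dat, cls_test=cls_test,
    pos_probs=np.ones(len(Ps)), una_probs=np.ones(len(Ns)),
    classifier='logistic', sampleFactor=1)
print(acc)                                # scored on the resampled training set, since test is dat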

def adaSample(Ps, Ns, train_mat, test_mat, cls_test, classifier, sampleFactor):
    """
    Iterate the singleIter function, updating the sampling probabilities after each round.
    """
    # start from uniform probabilities: every sample is initially assumed to be correctly labelled
    pos_probs = np.ones(len(Ps))
    una_probs = np.ones(len(Ns))

    pos_num = np.array(Ps).reshape(len(Ps), 1)
    una_num = np.array(Ns).reshape(len(Ns), 1)

    # lookup tables holding all probabilities: column 0 = sample index, column 1 = current probability
    pos_probs_1 = pos_probs.reshape(pos_probs.shape[0], 1)
    pos_probs_1 = np.hstack((pos_num, pos_probs_1))
    pos_probs_1 = pos_probs_1.astype(float)

    una_probs_1 = una_probs.reshape(una_probs.shape[0], 1)
    una_probs_1 = np.hstack((una_num, una_probs_1))
    una_probs_1 = una_probs_1.astype(float)
    for _ in range(10):
        # re-train on the training matrix itself so every sampled instance gets a fresh probability
        logit_pred, logit_pred_prob, idx_pl, idx_dl, accuracy = singleIter(
            Ps=Ps, Ns=Ns, dat=train_mat, test=train_mat, cls_test=cls_test,
            pos_probs=pos_probs, una_probs=una_probs,
            classifier=classifier, sampleFactor=sampleFactor)
        pos = logit_pred_prob[:len(idx_pl),0]
        una = logit_pred_prob[len(idx_pl):len(idx_pl)+len(idx_dl),1]
        
        pos = pos.reshape(pos.shape[0],1)
        idx_pl = idx_pl.reshape(idx_pl.shape[0],1)
    
        # the probs of positive instances with index
        pos_1 = np.hstack((idx_pl,pos))
        
        una = una.reshape(una.shape[0],1)
        idx_dl = idx_dl.reshape(idx_dl.shape[0],1)
        
        # the probs of negative instances with index
        una_1 = np.hstack((idx_dl,una))
        
        
        # write the probabilities predicted by the classifier back into pos_probs_1
        pos_1 = pd.DataFrame(pos_1)
        pos_probs_1 = pd.DataFrame(pos_probs_1)

        for j in range(len(pos_1)):
            if pos_1[0][j] in pos_probs_1[0].tolist():
                c = pos_probs_1[pos_probs_1[0].isin([pos_1[0][j]])].index.tolist()
                pos_probs_1.loc[c, 1] = pos_1[1][j]

        pos_probs_1 = np.array(pos_probs_1)
        pos_1 = np.array(pos_1)
        # write the probabilities predicted by the classifier back into una_probs_1
        una_1 = pd.DataFrame(una_1)
        una_probs_1 = pd.DataFrame(una_probs_1)

        for j in range(len(una_1)):
            if una_1[0][j] in una_probs_1[0].tolist():
                c = una_probs_1[una_probs_1[0].isin([una_1[0][j]])].index.tolist()
                una_probs_1.loc[c, 1] = una_1[1][j]

        una_probs_1 = np.array(una_probs_1)
        una_1 = np.array(una_1)
        
        # slice pos_probs_1 to keep the probability column only
        pos_probs = pos_probs_1[:, 1]
        pos_probs = pos_probs.reshape(pos_probs.shape[0],)

        # slice una_probs_1 to keep the probability column only
        una_probs = una_probs_1[:, 1]
        una_probs = una_probs.reshape(una_probs.shape[0],)

    # final pass: train with the learned probabilities and evaluate on the real test set
    logit_pred, logit_pred_prob, idx_pl, idx_dl, accuracy = singleIter(
        Ps=Ps, Ns=Ns, dat=train_mat, test=test_mat, cls_test=cls_test,
        pos_probs=pos_probs, una_probs=una_probs,
        classifier=classifier, sampleFactor=sampleFactor)

    return logit_pred, logit_pred_prob, idx_pl, idx_dl, accuracy, pos_probs, una_probs
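
A hypothetical end-to-end run on synthetic data with injected label noise; make_classification, the 10% flip rate, and the train/test split below are illustrative choices, not part of the original post (it also assumes the imports already shown above).

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# synthetic binary problem with roughly 10% of the labels flipped
X, y = make_classification(n_samples=400, n_features=10, flip_y=0.1, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# adaSample encodes the positive class as 0 and the negative/unannotated class as 1
Ps = np.where(y_train == 0)[0]
Ns = np.where(y_train == 1)[0]

pred, pred_prob, idx_pl, idx_dl, acc, pos_probs, una_probs = adaSample(
    Ps, Ns, train_mat=X_train, test_mat=X_test, cls_test=y_test,
    classifier='logistic', sampleFactor=1)
print('test accuracy:', acc)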


Copyright notice: this is an original article by ryan_8992625a, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.