(Machine Learning) Decision Trees and Their Application in sklearn

Uses:

  • Regression: house-price prediction, loan risk assessment
  • Classification: email classification, predicting which insurance products to promote to a customer, computer-aided medical diagnosis

Advantages:

  • Low computational complexity
  • Output is easy to interpret
  • Insensitive to missing intermediate values
  • Can handle irrelevant features

Disadvantages:

  • Prone to overfitting

Applicable data types:

  • Numerical
  • Nominal

Algorithm principle:
A decision tree turns originally messy, uncertain information into determinate, ordered information through a cascade of if-then decisions.
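
As a toy illustration of the if-then idea (the feature names and thresholds below are made up for illustration, not learned from data):

# A decision tree is a learned cascade of if-then questions; each leaf is a class.
# Illustrative thresholds only; a real tree learns them from data.
def toy_tree(petal_length, petal_width):
    if petal_length < 2.45:        # first question
        return 'setosa'
    if petal_width < 1.75:         # second question
        return 'versicolor'
    return 'virginica'

print(toy_tree(1.4, 0.2))  # -> 'setosa'
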
Measuring information uncertainty:

  • Entropy (Shannon entropy):
    $H(D) = -\sum_{i=1}^{k} p_i \log(p_i)$
    $D$: a dataset with $k$ classes
    $p_i$: the probability of class $i$ in $D$
  • Gini index:
    $G = 1 - \sum_{i=1}^{k} p_i^2$

*Entropy is slightly slower to compute than the Gini index; sklearn defaults to Gini, and in practice the two criteria give essentially the same results.

Measuring the change in uncertainty before and after a split:

  • Information gain: $g(D|A) = H(D) - H(D|A)$
    where
    $H(D) = -\sum_{i=1}^{k} p_i \log(p_i)$ is the entropy of the original dataset, and
    $H(D|A) = -\sum_{j=1}^{n} \frac{|D_j|}{|D|} \sum_{i=1}^{k} p_{ij} \log(p_{ij})$ is the conditional entropy of $D$ given feature $A$ ($D_j$ is the subset of $D$ taking the $j$-th value of $A$; $p_{ij}$ is the probability of class $i$ within $D_j$)

*The larger the information gain, the stronger feature A's discriminating power, so A should be chosen first when building the tree; a worked computation follows.
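
As a hedged illustration of these formulas, here is a worked computation of g(D|A) on the 5-row toy dataset used in Example 2 below (the helper H is my own, not from any library):

import numpy as np

def H(labels):
    # Shannon entropy of a label list (log base 2)
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

y = ['yes', 'yes', 'no', 'no', 'no']  # class labels
A = [1, 1, 1, 0, 0]                   # values of feature A ("no surfacing")

HD = H(y)  # H(D) ≈ 0.971
# conditional entropy H(D|A): entropy of each subset D_j, weighted by |D_j|/|D|
HDA = sum((A.count(v) / len(A)) * H([yi for ai, yi in zip(A, y) if ai == v])
          for v in set(A))
print('g(D|A) =', HD - HDA)  # ≈ 0.420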

Time complexity of a decision tree (for m samples and n features):

  • Training: $O(n \cdot m \log(m))$
  • Prediction: $O(\log(m))$

How to combat overfitting in a decision tree:

  • Pruning (see the sketch below)
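
A minimal pruning sketch with sklearn (parameter values are illustrative, not tuned; ccp_alpha requires scikit-learn >= 0.22):

# Pre-pruning (early stopping) vs. post-pruning (cost-complexity pruning).
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)

# Pre-pruning: cap the tree while it grows.
pre = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5).fit(X, y)

# Post-pruning: grow fully, then cut back with ccp_alpha
# (larger alpha => smaller tree); candidate alphas come from the pruning path.
path = DecisionTreeClassifier().cost_complexity_pruning_path(X, y)
post = DecisionTreeClassifier(ccp_alpha=path.ccp_alphas[-2]).fit(X, y)

print(pre.get_depth(), post.get_depth())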

Example 1: computing the Shannon entropy and the Gini index for a binary classification problem in Python

import numpy as np
import matplotlib.pyplot as plt

# Entropy of a binary classification problem, where x is the probability
# of the positive class (natural log here; use log2 for bits)
def entropy(x):
    a = -x*np.log(x)-(1-x)*np.log(1-x)
    return a

# Gini index of a binary classification problem
def gini(x):
    a = 1 - x**2 - (1-x)**2
    return a

a = 0.8
b = 0.5
print('Shannon entropy when a = 0.8: %.3f' % entropy(a))
print('Shannon entropy when b = 0.5: %.3f' % entropy(b))
print('Gini index when a = 0.8: %.3f' % gini(a))
print('Gini index when b = 0.5: %.3f' % gini(b))

x = np.linspace(0.001,0.999,100)
plt.plot(x,entropy(x),label = 'entropy')
plt.plot(x,gini(x),label = 'gini')
plt.legend()   
plt.show()

Result:
(figure: the entropy and Gini curves over p ∈ (0, 1); both peak at p = 0.5)

Example 2: a hand-written decision tree (ID3) for nominal datasets

# A hand-written ID3-style decision tree for nominal datasets
from math import log
import operator

# Entropy H(D) of a dataset; the class label is assumed to be the last column
def calcShannonEnt(dataSet):
    rowNum = len(dataSet)
    labelSet = {}
    result = 0.0
    for m in dataSet:
        label = m[-1]
        if label not in labelSet.keys():
            labelSet[label] = 1
        else:
            labelSet[label] += 1
    for n in labelSet:
        prob = float(labelSet[n]) / rowNum
        result += -prob*log(prob,2)
    return result

# Split the dataset: keep the rows where feature `axis` equals `val`, then drop that feature column
def splitDataSet(dataSet,axis,val):
    result = []
    for element in dataSet:
        if element[axis] == val:
            a = element[:axis]
            a.extend(element[axis+1:])
            result.append(a)
    return result

# Choose the best feature to split on: the one with the largest
# information gain g(D|A) = H(D) - H(D|A)
def chooseBestFeatureToSplit(dataSet):
    featNum = len(dataSet[0]) - 1  # the last column is the class label, not a feature
    baseShannonEnt = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(featNum):
        a = [col[i] for col in dataSet]  # values of the i-th feature
        uniVal = set(a)                  # distinct values of the i-th feature
        newEntropy = 0
        for b in uniVal:
            miniDataSet = splitDataSet(dataSet, i, b)  # subset where feature i equals b
            newEntropy += len(miniDataSet) * calcShannonEnt(miniDataSet) / len(dataSet)  # weighted entropy H(D|A)
        infoGain = baseShannonEnt - newEntropy  # information gain of feature i
        if infoGain > bestInfoGain:  # keep the feature with the largest gain
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

# Majority vote: return the most frequent class label in the list
def majorityCnt(classList):
    count = {}
    for val in classList:
        if val not in count.keys(): count[val] = 0
        count[val] += 1
    countSorted = sorted(count.items(),key = operator.itemgetter(1),reverse = True)
    return countSorted[0][0]

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):  # all samples share one class: stop splitting
        return classList[0]
    if len(dataSet[0]) == 1:  # all features used up: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so the recursive call does not mutate the original list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

dataSet = [  [1,1,'yes']
            ,[1,1,'yes']
            ,[1,0,'no']
            ,[0,1,'no']
            ,[0,1,'no']
          ]
labels = ['no surfacing','flippers']

print(createTree(dataSet, labels))

Result:
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
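
The nested dict above still needs a lookup routine before it can classify new samples. The classify helper below is my own addition (not part of the original post); note that createTree mutates labels via del, so pass it a fresh list:

# Hypothetical helper: walk the nested-dict tree to predict one sample.
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]    # feature tested at this node
    featIndex = featLabels.index(firstStr)  # feature name -> column index
    branch = inputTree[firstStr][testVec[featIndex]]
    if isinstance(branch, dict):            # internal node: keep descending
        return classify(branch, featLabels, testVec)
    return branch                           # leaf: class label

tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(classify(tree, ['no surfacing', 'flippers'], [1, 0]))  # -> 'no'
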
Example 3: decision tree classification with sklearn

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap  # only needed for an optional custom colormap

iris = datasets.load_iris()  # load the iris dataset
X = iris.data[:,2:]  # use only the last two features: petal length and petal width
y = iris.target      # target labels



# Plot the decision boundary of a fitted model over the given axis range
def plot_decision_boundary(model, axis):
    # build a grid of coordinates covering the axis range
    x0, x1 = np.meshgrid(
          np.linspace(axis[0],axis[1],int((axis[1]-axis[0])*100)).reshape(1,-1)
        , np.linspace(axis[2],axis[3],int((axis[3]-axis[2])*100)).reshape(1,-1)
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]  # np.ravel flattens a multi-dimensional array to 1-D
    y_predict = model.predict(X_new)       # predict a class for every grid point
    zz = y_predict.reshape(x0.shape)

    # draw the boundary: contour draws the class outlines, contourf would fill the regions
    # (optional styling: linewidths=5, cmap=custom_cmap)
    plt.contour(x0, x1, zz)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
    
# Pruning parameters: min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
# max_depth, max_leaf_nodes, max_features; see https://blog.csdn.net/cutwind/article/details/102902390
# Split criterion: criterion, 'gini' or 'entropy' (default='gini')
dt_clf = DecisionTreeClassifier(max_leaf_nodes=4, criterion='entropy')
dt_clf.fit(X,y)
# scatter plot of the three classes (commented out; plot_decision_boundary already draws the points)
# plt.scatter(X[y==0,0],X[y==0,1],color = 'red')
# plt.scatter(X[y==1,0],X[y==1,1],color = 'green')
# plt.scatter(X[y==2,0],X[y==2,1],color = 'pink')
# plt.show()
plot_decision_boundary(dt_clf, axis=[0,7,0,3])
plt.show()

Result:
(figure: decision boundary of the tree over the petal-length/petal-width plane, with the iris samples overlaid)
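
Besides plotting the boundary, sklearn can also render the fitted tree itself; a short sketch reusing the dt_clf trained above (export_text and plot_tree exist since scikit-learn 0.21):

# Inspect the fitted tree directly.
from sklearn.tree import export_text, plot_tree

print(export_text(dt_clf, feature_names=['petal length', 'petal width']))
plot_tree(dt_clf, feature_names=['petal length', 'petal width'],
          class_names=iris.target_names, filled=True)
plt.show()
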
Example 4: decision tree regression with sklearn

# Solving a regression problem with a decision tree
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

boston = datasets.load_boston()  # removed in scikit-learn >= 1.2; use datasets.fetch_california_housing() there
X = boston.data
y = boston.target

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25)
dtr = DecisionTreeRegressor(max_leaf_nodes = 15 )
dtr.fit(X_train,y_train)
train_score = dtr.score(X_train,y_train)  # R^2 (goodness of fit) on the training set
test_score = dtr.score(X_test,y_test)     # R^2 on the test set
y_predict = dtr.predict(X_test)
print("train_score = %.2f" % (train_score*100))  # R^2 as a percentage
print("test_score = %.2f" % (test_score*100))

Copyright notice: this is an original article by semine_shen, licensed under CC 4.0 BY-SA; please include a link to the original and this notice when reposting.