Selected Exercises from Chapter 4 of Machine Learning (the "Watermelon Book")

Decision Tree Code

An implementation of the algorithm of Figure 4.2, with information gain as the splitting criterion. ids_data is a list of sample indices, and values_of_attributes stores all possible values of each attribute column. You can test the code for bugs against Table 4.2 and Figure 4.5 of the book.

import math

def Setup(data):
    # Build the list of sample ids, the per-attribute visited flags,
    # and the observed value list of every attribute column.
    # Relies on the module-level attributes list defined below.
    ids_data = list(range(len(data)))
    is_visited_attributes = [False] * len(attributes)
    values_of_attributes = [set() for _ in range(len(attributes))]
    for id in ids_data:
        x = data[id]
        for i in range(len(x) - 1):  # the last column is the class label
            values_of_attributes[i].add(x[i])
    values_of_attributes = [list(e) for e in values_of_attributes]
    return ids_data, is_visited_attributes, values_of_attributes

def Ent(parameter, ids_data):
    # Information entropy of the class labels in ids_data (Eq. 4.1).
    [data, attributes] = parameter
    counts = {}
    for d in ids_data:
        key = data[d][-1]
        counts[key] = counts.get(key, 0) + 1
    res = 0.0
    for count in counts.values():
        p_k = count / float(len(ids_data))
        res -= p_k * math.log(p_k, 2)
    return res
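
# Example (Table 4.1 of the book): with 8 positive and 9 negative samples,
# Ent(D) = -(8/17)*log2(8/17) - (9/17)*log2(9/17) ≈ 0.998.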

def SplitDataWithAv(parameter, ids_data, id_attribute):
    # Group the sample ids by their value on the given attribute.
    [data, attributes] = parameter
    groups = {}
    for d in ids_data:
        key = data[d][id_attribute]
        groups.setdefault(key, []).append(d)
    return groups

def Gain(parameter, ids_data, id_attribute):
    # Information gain of splitting ids_data on the given attribute (Eq. 4.2).
    res = Ent(parameter, ids_data)
    split_dict = SplitDataWithAv(parameter, ids_data, id_attribute)
    for ids_data_v in split_dict.values():
        res -= float(len(ids_data_v)) / len(ids_data) * Ent(parameter, ids_data_v)
    return res
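
# For Table 4.1 of the book this yields, e.g., Gain(D, 纹理) ≈ 0.381, the
# largest gain among the six discrete attributes, so 纹理 is chosen first.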

def SelectBestAttribute(parameter, ids_data, is_visited_attributes):
    # Among the attributes not yet used on this path, pick the one
    # with the highest information gain.
    id_of_best = -1
    score = -1
    for i in range(len(attributes)):
        if not is_visited_attributes[i]:
            temp_score = Gain(parameter, ids_data, i)
            if score < temp_score:
                id_of_best = i
                score = temp_score
    return id_of_best

def SelectMostCategory(parameter, ids_data):
    # Majority class label among the samples in ids_data.
    [data, attributes] = parameter
    counts = {}
    for i in ids_data:
        key = data[i][-1]
        counts[key] = counts.get(key, 0) + 1
    res = None
    score = 0
    for key, count in counts.items():
        if score < count:
            score = count
            res = key
    return res

def TreeGenerate(parameter, ids_data, is_visited_attributes, values_of_attributes):
    [data, attributes] = parameter
    # Case (1): all samples in the subset share one class -> leaf.
    sample = set()
    for i in ids_data:
        sample.add(data[i][-1])
    if len(sample) == 1:
        return list(sample)[0]
    # Case (2): every attribute has already been used on this path,
    # or all samples take identical attribute values -> majority leaf.
    if all(is_visited_attributes):
        return SelectMostCategory(parameter, ids_data)
    s = set()
    for i in ids_data:
        s.add(str(data[i][:-1]))  # compare attribute values only, not labels
    if len(s) == 1:
        return SelectMostCategory(parameter, ids_data)
    # Split on the attribute with the highest information gain.
    id_attribute = SelectBestAttribute(parameter, ids_data, is_visited_attributes)
    is_visited_attributes[id_attribute] = True
    split_dict = SplitDataWithAv(parameter, ids_data, id_attribute)
    node = {}
    for a_v in values_of_attributes[id_attribute]:
        if a_v not in split_dict:
            # Case (3): empty subset -> leaf with the parent's majority class.
            node[a_v] = SelectMostCategory(parameter, ids_data)
        else:
            node[a_v] = TreeGenerate(parameter, split_dict[a_v], is_visited_attributes, values_of_attributes)
    node = {attributes[id_attribute]: node}
    is_visited_attributes[id_attribute] = False  # backtrack for sibling subtrees
    return node



data = [
    ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],
    ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],
    ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],
    ['青绿', '稍蜷', '沉闷', '清晰', '稍凹', '软粘', '是'],
    ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '是'],
    ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否'],
    ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否'],
    ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否'],
    ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否'],
    ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否'],
]

attributes = ['色泽','根蒂','敲声','纹理','脐部','触感']



ids_data, is_visited_attributes, values_of_attributes = Setup(data)
parameter = [data, attributes]

node = TreeGenerate(parameter, ids_data, is_visited_attributes, values_of_attributes)

print(node)
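
TreeGenerate returns a nested dict: each internal node is {attribute_name: {value: subtree}} and each leaf is a class label string. As a quick sanity check, the helper below (my own addition, not part of the book's pseudocode; the name Classify is arbitrary) walks the nested dict and verifies that the tree reproduces every training label, which is the zero-training-error property proved in Exercise 4.1 below.

def Classify(tree, attributes, x):
    # Descend the nested dict until a leaf (a plain label string) is reached.
    while isinstance(tree, dict):
        attribute_name = next(iter(tree))     # each internal node has one key
        children = tree[attribute_name]
        tree = children[x[attributes.index(attribute_name)]]
    return tree

# The data above contains no conflicting samples, so the training error
# should be 0 (cf. Exercise 4.1).
assert all(Classify(node, attributes, x) == x[-1] for x in data)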

Exercise 4.1

Prove that for any training set containing no conflicting data (i.e., no samples with identical feature vectors but different labels), there must exist a decision tree consistent with the training set (i.e., one with zero training error).

Recall Figure 4.2: the algorithm creates a leaf node when (1) all samples in the current subset belong to the same class; (2) the attribute set is exhausted, or all samples take the same values on the remaining attributes; or (3) the sample subset is empty.

All samples reaching the same leaf share the attribute values along that path. In case (3) the subset is empty, so it contributes no training error; in case (1) all samples in the subset have the same class, so it contributes none either. Only case (2) can produce training error. Suppose the training error were nonzero: then there would exist two training samples with identical attribute values but different class labels, contradicting the assumption that the training set contains no conflicting data. Hence the training error is 0.

Exercise 4.3

Write a program implementing a decision tree algorithm that uses information entropy for split selection, and generate a decision tree for the data in Table 4.3.

Code for a decision tree with continuous-valued attributes, adapted from the decision tree code above.

import math

def Setup(data):
    # As before, but each continuous column is converted into a sorted list
    # of candidate split points: midpoints of adjacent observed values.
    ids_data = list(range(len(data)))
    is_visited_attributes = [False] * len(attributes)
    values_of_attributes = [set() for _ in range(len(attributes))]
    for id in ids_data:
        x = data[id]
        for i in range(len(x) - 1):  # the last column is the class label
            values_of_attributes[i].add(x[i])
    values_of_attributes = [list(e) for e in values_of_attributes]
    for i in range(len(attributes)):
        if is_continuous_attributes[i]:
            values_of_attributes[i].sort()
            temp_list = []
            for j in range(len(values_of_attributes[i]) - 1):
                temp_list.append((values_of_attributes[i][j] + values_of_attributes[i][j + 1]) / 2.0)
            values_of_attributes[i] = temp_list
    return ids_data, is_visited_attributes, values_of_attributes

def Ent(parameter, ids_data):
    [data, attributes] = parameter
    counts = {}
    for d in ids_data:
        key = data[d][-1]
        counts[key] = counts.get(key, 0) + 1
    res = 0.0
    for count in counts.values():
        p_k = count / float(len(ids_data))
        res -= p_k * math.log(p_k, 2)
    return res

def SplitDataWithAv(parameter, ids_data, id_attribute):
    [data, attributes] = parameter
    groups = {}
    for d in ids_data:
        key = data[d][id_attribute]
        groups.setdefault(key, []).append(d)
    return groups

def SplitDataWithNum(parameter, ids_data, id_attribute, number):
    # Binary split of a continuous attribute at the threshold number:
    # 'yes' holds ids with value <= number, 'no' the rest.
    [data, attributes] = parameter
    groups = {'yes': [], 'no': []}
    for d in ids_data:
        if data[d][id_attribute] <= number:
            groups['yes'].append(d)
        else:
            groups['no'].append(d)
    return groups

def Gain(parameter, ids_data, id_attribute, is_continuous_attribute):
    if not is_continuous_attribute:
        # Discrete attribute: information gain as in Eq. 4.2.
        res = Ent(parameter, ids_data)
        split_dict = SplitDataWithAv(parameter, ids_data, id_attribute)
        for ids_data_v in split_dict.values():
            res -= float(len(ids_data_v)) / len(ids_data) * Ent(parameter, ids_data_v)
        return res, None
    else:
        # Continuous attribute: try every candidate split point from the
        # module-level values_of_attributes and keep the best one.
        res = 0
        number_point = None
        ent = Ent(parameter, ids_data)
        for number in values_of_attributes[id_attribute]:
            split_dict = SplitDataWithNum(parameter, ids_data, id_attribute, number)
            ids_data_left = split_dict['yes']
            ids_data_right = split_dict['no']
            temp_gain = ent - float(len(ids_data_left)) / len(ids_data) * Ent(parameter, ids_data_left)
            temp_gain -= float(len(ids_data_right)) / len(ids_data) * Ent(parameter, ids_data_right)
            if temp_gain > res:
                res = temp_gain
                number_point = number
        return res, number_point

def SelectBestAttribute(parameter, ids_data, is_visited_attributes):
    id_of_best = -1
    score = -1
    number_point = None
    for i in range(0, len(attributes)):
        if not is_visited_attributes[i]:
            temp_score, temp_number_point = Gain(parameter, ids_data, i, is_continuous_attributes[i])
            if score < temp_score:
                id_of_best = i
                score = temp_score
                number_point = temp_number_point
    return id_of_best, number_point

def SelectMostCategory(parameter, ids_data):
    [data, attributes] = parameter
    counts = {}
    for i in ids_data:
        key = data[i][-1]
        counts[key] = counts.get(key, 0) + 1
    res = None
    score = 0
    for key, count in counts.items():
        if score < count:
            score = count
            res = key
    return res

def TreeGenerate(parameter, ids_data, is_visited_attributes, values_of_attributes):
    [data, attributes] = parameter
    # Case (1): all samples in the subset share one class -> leaf.
    sample = set()
    for i in ids_data:
        sample.add(data[i][-1])
    if len(sample) == 1:
        return list(sample)[0]
    # Case (2): all discrete attributes used, or all samples take
    # identical attribute values -> majority leaf.
    if all(is_visited_attributes):
        return SelectMostCategory(parameter, ids_data)
    s = set()
    for i in ids_data:
        s.add(str(data[i][:-1]))  # compare attribute values only, not labels
    if len(s) == 1:
        return SelectMostCategory(parameter, ids_data)
    id_attribute, number_point = SelectBestAttribute(parameter, ids_data, is_visited_attributes)
    # A continuous attribute may be reused further down the tree,
    # so only discrete attributes are marked as visited.
    if not is_continuous_attributes[id_attribute]:
        is_visited_attributes[id_attribute] = True
        split_dict = SplitDataWithAv(parameter, ids_data, id_attribute)
    else:
        split_dict = SplitDataWithNum(parameter, ids_data, id_attribute, number_point)
    node = {}
    if not is_continuous_attributes[id_attribute]:
        for a_v in values_of_attributes[id_attribute]:
            if a_v not in split_dict:
                # Case (3): empty subset -> leaf with the parent's majority class.
                node[a_v] = SelectMostCategory(parameter, ids_data)
            else:
                node[a_v] = TreeGenerate(parameter, split_dict[a_v], is_visited_attributes, values_of_attributes)
        node = {attributes[id_attribute]: node}
        is_visited_attributes[id_attribute] = False  # backtrack for sibling subtrees
    else:
        label = attributes[id_attribute] + "<=" + str(number_point)
        node['yes'] = TreeGenerate(parameter, split_dict['yes'], is_visited_attributes, values_of_attributes)
        node['no'] = TreeGenerate(parameter, split_dict['no'], is_visited_attributes, values_of_attributes)
        node = {label: node}
    return node



data = [
    ['青绿','蜷缩','浊响','清晰','凹陷','硬滑',0.697,0.460,'是'],
    ['乌黑','蜷缩','沉闷','清晰','凹陷','硬滑',0.774,0.376,'是'],
    ['乌黑','蜷缩','浊响','清晰','凹陷','硬滑',0.634,0.264,'是'],
    ['青绿','蜷缩','沉闷','清晰','凹陷','硬滑',0.608,0.318,'是'],
    ['浅白','蜷缩','浊响','清晰','凹陷','硬滑',0.556,0.215,'是'],
    ['青绿','稍蜷','浊响','清晰','稍凹','软粘',0.403,0.237,'是'],
    ['乌黑','稍蜷','浊响','稍糊','稍凹','软粘',0.481,0.149,'是'],
    ['乌黑','稍蜷','浊响','清晰','稍凹','硬滑',0.437,0.211,'是'],
    ['乌黑','稍蜷','沉闷','稍糊','稍凹','硬滑',0.666,0.091,'否'],
    ['青绿','硬挺','清脆','清晰','平坦','软粘',0.243,0.267,'否'],
    ['浅白','硬挺','清脆','模糊','平坦','硬滑',0.245,0.057,'否'],
    ['浅白','蜷缩','浊响','模糊','平坦','软粘',0.343,0.099,'否'],
    ['青绿','稍蜷','浊响','稍糊','凹陷','硬滑',0.639,0.161,'否'],
    ['浅白','稍蜷','沉闷','稍糊','凹陷','硬滑',0.657,0.198,'否'],
    ['乌黑','稍蜷','浊响','清晰','稍凹','软粘',0.360,0.370,'否'],
    ['浅白','蜷缩','浊响','模糊','平坦','硬滑',0.593,0.042,'否'],
    ['青绿','蜷缩','沉闷','稍糊','稍凹','硬滑',0.719,0.103,'否'],
]

attributes = ['色泽','根蒂','敲声','纹理','脐部','触感','密度','含糖率']
is_continuous_attributes = [False, False, False, False, False, False, True, True]

ids_data, is_visited_attributes, values_of_attributes = Setup(data)
parameter = [data, attributes]

node = TreeGenerate(parameter, ids_data, is_visited_attributes, values_of_attributes)

print(node)

You can test the code for bugs against Table 4.5 and Figure 4.10 of the book.
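
For this version the internal-node keys are either a discrete attribute name or a string of the form 属性<=threshold, so a traversal helper has to parse the threshold back out. A minimal sketch (again my own addition, with an arbitrary name; the assertion assumes the fully grown tree fits the training set, which holds here because all density and sugar-rate values are distinct):

def Classify(tree, attributes, x):
    # Keys like '密度<=0.3815' encode numeric tests; any other key is
    # a discrete attribute name.
    while isinstance(tree, dict):
        label = next(iter(tree))
        children = tree[label]
        if '<=' in label:
            name, threshold = label.split('<=')
            branch = 'yes' if x[attributes.index(name)] <= float(threshold) else 'no'
        else:
            branch = x[attributes.index(label)]
        tree = children[branch]
    return tree

assert all(Classify(node, attributes, x) == x[-1] for x in data)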

Exercise 4.7

Figure 4.2 is a recursive algorithm. Faced with massive data the tree can grow very deep, and recursion easily overflows the call stack. Using a queue data structure, with a parameter MaxDepth controlling the maximum depth of the tree, write a non-recursive decision tree generation algorithm equivalent to Figure 4.2.

The exercise statement may be slightly off; the intent is presumably to rewrite the algorithm using an explicit data structure. The algorithm of Figure 4.2 is recursive and can overflow the call stack. It is a pre-order (depth-first) traversal of the tree, so it can be rewritten non-recursively with an explicit stack. Using a queue instead turns it into a level-order (breadth-first) traversal, which makes pre-pruning (level order) and post-pruning (reverse level order, using a double-ended queue) convenient to implement; the drawback is discussed in Exercise 4.8.
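
A minimal sketch of the queue-based rewrite is given below. Assumptions: it reuses the helper functions of the first, discrete-only script (SelectMostCategory, SelectBestAttribute, SplitDataWithAv; the continuous version's SelectBestAttribute returns a pair and is not interchangeable), and the name TreeGenerateBFS, the task-tuple layout, and the MaxDepth cutoff (subsets at the depth limit become majority-vote leaves) are my own choices, not the book's. Because the set of used attributes depends on the path, each queued task carries its own copy of the visited flags instead of sharing one list.

from collections import deque

def TreeGenerateBFS(parameter, ids_data, values_of_attributes, MaxDepth):
    [data, attributes] = parameter
    root = {}                                 # root['tree'] will hold the result
    # Each task: (parent dict, key in parent, sample ids, per-path visited flags, depth).
    queue = deque([(root, 'tree', ids_data, [False] * len(attributes), 1)])
    while queue:
        parent, key, ids, visited, depth = queue.popleft()
        labels = {data[i][-1] for i in ids}
        if len(labels) == 1:                  # case (1): pure subset
            parent[key] = labels.pop()
            continue
        if depth >= MaxDepth or all(visited) \
                or len({str(data[i][:-1]) for i in ids}) == 1:
            # depth cap reached, or case (2) -> majority-vote leaf
            parent[key] = SelectMostCategory(parameter, ids)
            continue
        id_attribute = SelectBestAttribute(parameter, ids, visited)
        split_dict = SplitDataWithAv(parameter, ids, id_attribute)
        children = {}
        parent[key] = {attributes[id_attribute]: children}
        child_visited = visited[:]            # visited flags are path-dependent
        child_visited[id_attribute] = True
        for a_v in values_of_attributes[id_attribute]:
            if a_v not in split_dict:         # case (3): empty subset
                children[a_v] = SelectMostCategory(parameter, ids)
            else:
                queue.append((children, a_v, split_dict[a_v], child_visited, depth + 1))
    return root['tree']

node_bfs = TreeGenerateBFS(parameter, ids_data, values_of_attributes, MaxDepth=4)
print(node_bfs)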

Exercise 4.8

Modify the depth-first search of decision tree generation into a breadth-first search, with a parameter MaxNode controlling the maximum number of nodes of the tree, rewriting the queue-based algorithm of Exercise 4.7. Compared with the algorithm of Exercise 4.7, analyze which approach makes it easier to keep the decision tree's storage from exceeding memory.

The exercise statement may be off; see Exercise 4.7. Suppose memory is O(n) and the tree has depth n, so level n holds 2^(n-1) nodes. A stack-based implementation of the Figure 4.2 algorithm needs O(n) space, while a queue-based implementation needs O(2^n) space, so the breadth-first strategy runs out of memory far more easily.
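
As a toy illustration of this space argument (my own sketch, not from the book), the code below expands a complete binary tree of depth 10 and records the peak number of pending nodes: roughly the depth under a stack (depth-first), but 2^(depth-1) under a queue (breadth-first).

from collections import deque

def max_frontier(depth, use_queue):
    # Expand a complete binary tree, tracking the largest number of
    # pending nodes; a node is represented by its depth alone.
    frontier = deque([1])
    peak = 1
    while frontier:
        d = frontier.popleft() if use_queue else frontier.pop()
        if d < depth:
            frontier.append(d + 1)   # left child
            frontier.append(d + 1)   # right child
        peak = max(peak, len(frontier))
    return peak

print(max_frontier(10, use_queue=False))   # stack / DFS: 10
print(max_frontier(10, use_queue=True))    # queue / BFS: 512 == 2**9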

