Decision Tree Code
This implements the algorithm of Figure 4.2 based on information gain. ids_data is a list of sample indices, and values_of_attributes stores all possible values of each attribute column. Table 4.2 and Figure 4.5 of the original book can be used to check the code for bugs.
import math
def Setup(data):
    ids_data = [i for i in range(len(data))]
    is_visited_attributes = [False for x in range(len(attributes))]
    values_of_attributes = [set() for i in range(len(attributes))]
    for id in ids_data:
        x = data[id]
        for i in range(len(x) - 1):
            values_of_attributes[i].add(x[i])
    values_of_attributes = [list(e) for e in values_of_attributes]
    return ids_data, is_visited_attributes, values_of_attributes
def Ent(parameter, ids_data):
    [data, attributes] = parameter
    res = 0
    dict = {}
    for d in ids_data:
        key = data[d][-1]
        if key in dict:
            dict[key] = dict[key] + 1
        else:
            dict[key] = 1
    for key in dict.keys():
        p_k = dict[key] / float(len(ids_data))
        res = res - p_k * math.log(p_k, 2)
    return res
def SplitDataWithAv(parameter, ids_data, id_attribute):
    [data, attributes] = parameter
    dict = {}
    for d in ids_data:
        key = data[d][id_attribute]
        if key in dict:
            dict[key].append(d)
        else:
            dict[key] = [d]
    return dict
def Gain(parameter, ids_data, id_attribute):
    res = Ent(parameter, ids_data)
    split_dict = SplitDataWithAv(parameter, ids_data, id_attribute)
    for key in split_dict.keys():
        ids_data_v = split_dict[key]
        res = res - float(len(ids_data_v)) / len(ids_data) * Ent(parameter, ids_data_v)
    return res
def SelectBestAttribute(parameter, ids_data, is_visited_attributes):
    idOfBest = -1
    score = -1
    for i in range(0, len(attributes)):
        if not is_visited_attributes[i]:
            tempScore = Gain(parameter, ids_data, i)
            if score < tempScore:
                idOfBest = i
                score = tempScore
    return idOfBest
def SelectMostCategory(parameter, ids_data):
    [data, attributes] = parameter
    res = None
    dict = {}
    for i in ids_data:
        key = data[i][-1]
        if key in dict:
            dict[key] = dict[key] + 1
        else:
            dict[key] = 1
    score = 0
    for key in dict:
        if score < dict[key]:
            score = dict[key]
            res = key
    return res
def TreeGenerate(parameter, ids_data, is_visited_attributes, values_of_attributes):
    [data, attributes] = parameter
    # case (1): all samples in ids_data belong to the same class
    sample = set()
    for i in ids_data:
        sample.add(data[i][-1])
    if len(sample) == 1:
        node = list(sample)[0]
        return node
    # case (2a): all attributes have been used
    flag = True
    for f in is_visited_attributes:
        flag = flag and f
    if flag:
        node = SelectMostCategory(parameter, ids_data)
        return node
    # case (2b): all samples take the same values on the attributes
    s = set()
    for i in ids_data:
        s.add(str(data[i][:-1]))
    if len(s) == 1:
        node = SelectMostCategory(parameter, ids_data)
        return node
    id_attribute = SelectBestAttribute(parameter, ids_data, is_visited_attributes)
    is_visited_attributes[id_attribute] = True
    dict = SplitDataWithAv(parameter, ids_data, id_attribute)
    node = {}
    for a_v in values_of_attributes[id_attribute]:
        if a_v not in dict:
            # case (3): empty subset, use the majority class of the parent
            node[a_v] = SelectMostCategory(parameter, ids_data)
        else:
            node[a_v] = TreeGenerate(parameter, dict[a_v], is_visited_attributes, values_of_attributes)
    node = {attributes[id_attribute] : node}
    is_visited_attributes[id_attribute] = False
    return node
data = [
['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],
['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],
['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],
['青绿', '稍蜷', '沉闷', '清晰', '稍凹', '软粘', '是'],
['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '是'],
['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否'],
['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否'],
['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否'],
['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否'],
['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否'],
]
attributes = ['色泽','根蒂','敲声','纹理','脐部','触感']
ids_data, is_visited_attributes, values_of_attributes = Setup(data)
parameter = [data, attributes]
node = TreeGenerate(parameter, ids_data, is_visited_attributes, values_of_attributes)
print(node)
Exercise 4.1
Prove that for a training set containing no conflicting data (i.e., no samples whose feature vectors are identical but whose labels differ), there must exist a decision tree consistent with the training set (i.e., with zero training error).
Recall Figure 4.2: a decision tree creates a leaf node under one of three conditions: (1) all samples in the current subset belong to the same class; (2) the attribute set A has been exhausted, or all samples take the same values on A; (3) the sample subset is empty.
The samples that reach a given leaf share the same attribute values along its path. In case (3) the subset is empty, so it contributes no training error; in case (1) all samples in the subset have the same class, so it contributes no training error either. Training error could therefore arise only in case (2). Suppose the training error were nonzero: then there would exist two training samples with identical attribute values but different class labels, contradicting the assumption that the training set contains no conflicting data. Hence the training error is 0.
Exercise 4.3
Implement a decision-tree algorithm that selects splits based on information entropy, and generate a decision tree for the data in Table 4.3.
The code for a decision tree with continuous-valued attributes can be obtained by modifying the decision-tree code above.
import math
def Setup(data):
    ids_data = [i for i in range(len(data))]
    is_visited_attributes = [False for x in range(len(attributes))]
    values_of_attributes = [set() for i in range(len(attributes))]
    for id in ids_data:
        x = data[id]
        for i in range(len(x) - 1):
            values_of_attributes[i].add(x[i])
    values_of_attributes = [list(e) for e in values_of_attributes]
    # for continuous attributes, replace the observed values with the
    # candidate split points (midpoints of adjacent sorted values)
    for i in range(len(attributes)):
        if is_continuous_attributes[i]:
            values_of_attributes[i].sort()
            temp_list = list()
            for j in range(len(values_of_attributes[i]) - 1):
                temp_list.append(float(values_of_attributes[i][j] + values_of_attributes[i][j + 1]) / 2)
            values_of_attributes[i] = temp_list
    return ids_data, is_visited_attributes, values_of_attributes
def Ent(parameter, ids_data):
    [data, attributes] = parameter
    res = 0
    dict = {}
    for d in ids_data:
        key = data[d][-1]
        if key in dict:
            dict[key] = dict[key] + 1
        else:
            dict[key] = 1
    for key in dict.keys():
        p_k = dict[key] / float(len(ids_data))
        res = res - p_k * math.log(p_k, 2)
    return res
def SplitDataWithAv(parameter, ids_data, id_attribute):
    [data, attributes] = parameter
    dict = {}
    for d in ids_data:
        key = data[d][id_attribute]
        if key in dict:
            dict[key].append(d)
        else:
            dict[key] = [d]
    return dict
def SplitDataWithNum(parameter, ids_data, id_attribute, number):
    [data, attributes] = parameter
    dict = {'yes': list(), 'no': list()}
    for d in ids_data:
        if data[d][id_attribute] <= number:
            dict['yes'].append(d)
        else:
            dict['no'].append(d)
    return dict
def Gain(parameter, ids_data, id_attribute, is_continuous_attribute):
    if not is_continuous_attribute:
        res = Ent(parameter, ids_data)
        split_dict = SplitDataWithAv(parameter, ids_data, id_attribute)
        for key in split_dict.keys():
            ids_data_v = split_dict[key]
            res = res - float(len(ids_data_v)) / len(ids_data) * Ent(parameter, ids_data_v)
        return res, None
    else:
        res = 0
        number_point = None
        ent = Ent(parameter, ids_data)
        for number in values_of_attributes[id_attribute]:
            split_dict = SplitDataWithNum(parameter, ids_data, id_attribute, number)
            ids_data_left = split_dict['yes']
            ids_data_right = split_dict['no']
            temp_gain = ent - float(len(ids_data_left)) / len(ids_data) * Ent(parameter, ids_data_left)
            temp_gain = temp_gain - float(len(ids_data_right)) / len(ids_data) * Ent(parameter, ids_data_right)
            if temp_gain > res:
                res = temp_gain
                number_point = number
        return res, number_point
def SelectBestAttribute(parameter, ids_data, is_visited_attributes):
    id_of_best = -1
    score = -1
    number_point = None
    for i in range(0, len(attributes)):
        if not is_visited_attributes[i]:
            temp_score, temp_number_point = Gain(parameter, ids_data, i, is_continuous_attributes[i])
            if score < temp_score:
                id_of_best = i
                score = temp_score
                number_point = temp_number_point
    return id_of_best, number_point
def SelectMostCategory(parameter, ids_data):
    [data, attributes] = parameter
    res = None
    dict = {}
    for i in ids_data:
        key = data[i][-1]
        if key in dict:
            dict[key] = dict[key] + 1
        else:
            dict[key] = 1
    score = 0
    for key in dict:
        if score < dict[key]:
            score = dict[key]
            res = key
    return res
def TreeGenerate(parameter, ids_data, is_visited_attributes, values_of_attributes):
    [data, attributes] = parameter
    # case (1): all samples belong to the same class
    sample = set()
    for i in ids_data:
        sample.add(data[i][-1])
    if len(sample) == 1:
        node = list(sample)[0]
        return node
    # case (2a): all discrete attributes have been used
    flag = True
    for f in is_visited_attributes:
        flag = flag and f
    if flag:
        node = SelectMostCategory(parameter, ids_data)
        return node
    # case (2b): all samples take the same values on the attributes
    s = set()
    for i in ids_data:
        s.add(str(data[i][:-1]))
    if len(s) == 1:
        node = SelectMostCategory(parameter, ids_data)
        return node
    id_attribute, number_point = SelectBestAttribute(parameter, ids_data, is_visited_attributes)
    # a continuous attribute may be reused in descendants, so only mark discrete ones as visited
    if not is_continuous_attributes[id_attribute]:
        is_visited_attributes[id_attribute] = True
    dict = None
    if not is_continuous_attributes[id_attribute]:
        dict = SplitDataWithAv(parameter, ids_data, id_attribute)
    else:
        dict = SplitDataWithNum(parameter, ids_data, id_attribute, number_point)
    node = {}
    if not is_continuous_attributes[id_attribute]:
        for a_v in values_of_attributes[id_attribute]:
            if a_v not in dict:
                # case (3): empty subset, use the majority class of the parent
                node[a_v] = SelectMostCategory(parameter, ids_data)
            else:
                node[a_v] = TreeGenerate(parameter, dict[a_v], is_visited_attributes, values_of_attributes)
        node = {attributes[id_attribute] : node}
    else:
        label = attributes[id_attribute] + "<=" + str(number_point)
        node['yes'] = TreeGenerate(parameter, dict['yes'], is_visited_attributes, values_of_attributes)
        node['no'] = TreeGenerate(parameter, dict['no'], is_visited_attributes, values_of_attributes)
        node = {label : node}
    if not is_continuous_attributes[id_attribute]:
        is_visited_attributes[id_attribute] = False
    return node
data = [
['青绿','蜷缩','浊响','清晰','凹陷','硬滑',0.697,0.460,'是'],
['乌黑','蜷缩','沉闷','清晰','凹陷','硬滑',0.774,0.376,'是'],
['乌黑','蜷缩','浊响','清晰','凹陷','硬滑',0.634,0.264,'是'],
['青绿','蜷缩','沉闷','清晰','凹陷','硬滑',0.608,0.318,'是'],
['浅白','蜷缩','浊响','清晰','凹陷','硬滑',0.556,0.215,'是'],
['青绿','稍蜷','浊响','清晰','稍凹','软粘',0.403,0.237,'是'],
['乌黑','稍蜷','浊响','稍糊','稍凹','软粘',0.481,0.149,'是'],
['乌黑','稍蜷','浊响','清晰','稍凹','硬滑',0.437,0.211,'是'],
['乌黑','稍蜷','沉闷','稍糊','稍凹','硬滑',0.666,0.091,'否'],
['青绿','硬挺','清脆','清晰','平坦','软粘',0.243,0.267,'否'],
['浅白','硬挺','清脆','模糊','平坦','硬滑',0.245,0.057,'否'],
['浅白','蜷缩','浊响','模糊','平坦','软粘',0.343,0.099,'否'],
['青绿','稍蜷','浊响','稍糊','凹陷','硬滑',0.639,0.161,'否'],
['浅白','稍蜷','沉闷','稍糊','凹陷','硬滑',0.657,0.198,'否'],
['乌黑','稍蜷','浊响','清晰','稍凹','软粘',0.360,0.370,'否'],
['浅白','蜷缩','浊响','模糊','平坦','硬滑',0.593,0.042,'否'],
['青绿','蜷缩','沉闷','稍糊','稍凹','硬滑',0.719,0.103,'否'],
]
attributes = ['色泽','根蒂','敲声','纹理','脐部','触感','密度','含糖率']
is_continuous_attributes = [False, False, False, False, False, False, True, True]
ids_data, is_visited_attributes, values_of_attributes = Setup(data)
parameter = [data, attributes]
node = TreeGenerate(parameter, ids_data, is_visited_attributes, values_of_attributes)
print(node)
Table 4.5 and Figure 4.10 of the original book can be used to check the code for bugs.
Exercise 4.7
Figure 4.2 is a recursive algorithm. With massive data the tree can grow very deep, and recursion can easily cause a stack overflow. Using a queue data structure, with a parameter MaxDepth controlling the maximum depth of the tree, write a decision-tree generation algorithm that is equivalent to Figure 4.2 but does not use recursion.
The problem statement may be in error; the intended meaning is probably to rewrite the algorithm with a stack data structure. The algorithm of Figure 4.2 is recursive and can easily overflow the call stack. On analysis, Figure 4.2 performs a pre-order (depth-first) traversal of the tree, so a stack can be used to turn the recursion into a non-recursive method. If a queue is used instead, the algorithm becomes a level-order (breadth-first) traversal; the benefit is that pre-pruning (level-order) and post-pruning (reverse level-order, for which a double-ended queue can be used) become convenient, while the drawback is discussed in Exercise 4.8. A sketch of the stack-based rewrite is given below.
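Below is a minimal sketch of the non-recursive, stack-based (pre-order / depth-first) version, assuming the discrete-attribute helpers from the decision-tree code at the top of this page (SelectBestAttribute, SplitDataWithAv, SelectMostCategory) and the same parameter list; the function name TreeGenerateIterative and the parameter max_depth are hypothetical names introduced here. Nodes that reach max_depth are closed off as majority-class leaves.

def TreeGenerateIterative(parameter, ids_data, values_of_attributes, max_depth):
    [data, attributes] = parameter
    root = {}                            # placeholder; the finished subtree ends up under the key 'root'
    # each stack entry: (sample ids, indices of attributes already used on this path,
    #                    current depth, parent dict, key in the parent dict)
    stack = [(ids_data, set(), 1, root, 'root')]
    while stack:
        ids, used, depth, parent, key = stack.pop()
        labels = set(data[i][-1] for i in ids)
        if len(labels) == 1:                                   # pure node -> leaf
            parent[key] = list(labels)[0]
            continue
        if len(used) == len(attributes) or depth >= max_depth:
            parent[key] = SelectMostCategory(parameter, ids)   # attributes exhausted or depth limit -> majority leaf
            continue
        is_visited = [i in used for i in range(len(attributes))]
        id_attribute = SelectBestAttribute(parameter, ids, is_visited)
        split = SplitDataWithAv(parameter, ids, id_attribute)
        child = {}
        parent[key] = {attributes[id_attribute]: child}
        for a_v in values_of_attributes[id_attribute]:
            if a_v not in split:                               # empty subset -> majority class of the parent
                child[a_v] = SelectMostCategory(parameter, ids)
            else:
                stack.append((split[a_v], used | {id_attribute}, depth + 1, child, a_v))
    return root['root']

For example, node = TreeGenerateIterative(parameter, ids_data, values_of_attributes, max_depth=4) builds the same kind of nested-dict tree as the recursive version. Replacing the stack's pop() with popping from the front of a collections.deque turns the same skeleton into the level-order (breadth-first) variant discussed in Exercise 4.8.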
Exercise 4.8
Modify the depth-first search process of decision-tree generation into a breadth-first search, with a parameter MaxNode controlling the maximum number of nodes, and rewrite the queue-based algorithm of Exercise 4.7 accordingly. Compared with the algorithm in Exercise 4.7, analyze which approach makes it easier to keep the storage required by the decision tree from exceeding memory.
The problem statement may be in error; see Exercise 4.7. Suppose the available memory is O(n) and the tree has depth n. Level n contains 2^(n-1) nodes, so a stack-based implementation of the Figure 4.2 algorithm needs O(n) space, whereas a queue-based implementation needs O(2^n) space; clearly the breadth-first strategy is the one more likely to run out of memory. A sketch of the queue-based variant with a node budget follows.
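The following is a minimal sketch of the breadth-first variant under the same assumptions as the sketch in Exercise 4.7; TreeGenerateBFS and max_node are hypothetical names. Once max_node entries have been taken off the queue, no further splits are made and every remaining entry becomes a majority-class leaf.

from collections import deque

def TreeGenerateBFS(parameter, ids_data, values_of_attributes, max_node):
    [data, attributes] = parameter
    root = {}
    # each queue entry: (sample ids, indices of attributes already used on this path,
    #                    parent dict, key in the parent dict)
    queue = deque([(ids_data, set(), root, 'root')])
    node_count = 0                       # nodes generated so far
    while queue:
        ids, used, parent, key = queue.popleft()
        node_count += 1
        labels = set(data[i][-1] for i in ids)
        if len(labels) == 1:                                   # pure node -> leaf
            parent[key] = list(labels)[0]
            continue
        if len(used) == len(attributes) or node_count >= max_node:
            parent[key] = SelectMostCategory(parameter, ids)   # no attributes left or node budget reached
            continue
        is_visited = [i in used for i in range(len(attributes))]
        id_attribute = SelectBestAttribute(parameter, ids, is_visited)
        split = SplitDataWithAv(parameter, ids, id_attribute)
        child = {}
        parent[key] = {attributes[id_attribute]: child}
        for a_v in values_of_attributes[id_attribute]:
            if a_v not in split:
                child[a_v] = SelectMostCategory(parameter, ids)
            else:
                queue.append((split[a_v], used | {id_attribute}, child, a_v))
    return root['root']

Because the whole frontier of a level sits in the queue at once, the queue can hold on the order of 2^(n-1) entries, which is exactly the memory behaviour analyzed above; the stack of the depth-first version only holds roughly one root-to-leaf path (plus pending siblings) at a time.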