import math
from collections import Counter


def createDataSet():
    """Return the loan-application toy dataset from Li Hang's
    "Statistical Learning Methods" (Table 5.1).

    :return: ``(dataset, labels)`` — each row of ``dataset`` is
        ``[age, has_job, owns_house, credit, class]`` (values are the
        original Chinese strings); ``labels`` names the 4 feature columns.
    """
    dataset = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否']]
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return dataset, labels


def calcEntropy(dataset):
    """Compute the Shannon entropy (base 2) of the class labels.

    :param dataset: list of rows; the last element of each row is the
        class label.
    :return: ``-sum(p * log2(p))`` over the class frequencies (float).
    """
    num = len(dataset)
    if num == 0:  # guard: an empty split carries zero entropy
        return 0.0
    counts = Counter(row[-1] for row in dataset)
    return -sum((c / num) * math.log2(c / num) for c in counts.values())


def splitDataSet(dataset, axis, value):
    """Select the rows where column ``axis`` equals ``value`` and drop
    that column from each selected row.

    :param dataset: list of rows.
    :param axis: index of the feature column to filter on.
    :param value: feature value to keep.
    :return: new list of reduced rows (input is not modified).
    """
    return [row[:axis] + row[axis + 1:] for row in dataset if row[axis] == value]


def chooseBestFeature(dataset):
    """Pick the feature with the largest information gain (ID3 criterion).

    :param dataset: rows of feature values with the class label last.
    :return: index of the best feature (0 if no feature improves on the
        base entropy).
    """
    numFeatures = len(dataset[0]) - 1
    baseEntropy = calcEntropy(dataset)
    bestInfoGain = 0.0
    bestFeature = 0
    for i in range(numFeatures):
        # Conditional entropy of the class given feature i.
        newEntropy = 0.0
        for value in {row[i] for row in dataset}:
            subDataSet = splitDataSet(dataset, i, value)
            prob = len(subDataSet) / len(dataset)
            newEntropy += prob * calcEntropy(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def _majorityClass(classList):
    """Most frequent class label in ``classList`` (ties broken by first
    occurrence, per ``Counter.most_common``)."""
    return Counter(classList).most_common(1)[0][0]


def createTree(dataset, labels):
    """Recursively build an ID3 decision tree.

    Fixes two defects of the original version:
    * when no features remain, the majority class is returned (the
      original returned the whole ``classList`` list);
    * the caller's ``labels`` list is no longer mutated (the original
      ``del labels[bestFeat]`` destroyed the caller's list).

    :param dataset: rows of feature values with the class label last.
    :param labels: feature names aligned with the feature columns.
    :return: nested dict ``{feature_name: {feature_value: subtree}}``,
        or a class-label string at a leaf.
    """
    classList = [row[-1] for row in dataset]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node -> leaf
    if len(dataset[0]) == 1:
        return _majorityClass(classList)  # no features left to split on
    bestFeat = chooseBestFeature(dataset)
    bestFeatLabel = labels[bestFeat]
    # Remove the used feature name without touching the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in {row[bestFeat] for row in dataset}:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataset, bestFeat, value), subLabels)
    return myTree


if __name__ == '__main__':
    dataset, labels = createDataSet()
    print(createTree(dataset, labels))
相关知识
西瓜书+花书圣经+统计学习方法+南瓜书|四大人工智能名著分享
周志华西瓜书+花书圣经+李航统计学习方法+南瓜书|四大人工智能名著分享
决策树的过拟合问题及解决方案
分类算法3:决策树及R语言实现
基于决策树构建鸢尾花数据的分类模型并绘制决策树模型
基于决策树的水稻病虫害发生程度预测模型——以芜湖市为例
《统计学习方法》第 2 章“感知机”学习笔记
决策树模型——鸢尾花分类 剪枝前后正确率
决策树模型
python利用c4.5决策树对鸢尾花卉数据集进行分类(iris)
网址: 统计学习方法 —— 决策树模型 https://m.huajiangbk.com/newsview1911817.html
上一篇: 提升树算法详解与应用 |
下一篇: 鸢尾花数据集的决策树模型构建与优 |