import numpy as np
from sklearn import datasets  # sklearn's bundled toy datasets, including iris
from sklearn.neighbors import KNeighborsClassifier

# Iris flowers: growing conditions differ, so the species can be told apart
# from simple measurements — a classic classification benchmark.
iris = datasets.load_iris()  # load the iris data set
X = iris['data']    # features: 150 samples x 4 attributes (sepal length/width, petal length/width)
y = iris['target']  # labels: species encoded as 0 / 1 / 2
X.shape             # -> (150, 4): 150 samples, 4 attributes
(150, 4)
# Split the data in two: one part for training, the other for testing.
# Build an index array first so the sample order can be shuffled
# (the raw iris data is sorted by class).
index = np.arange(150)
index
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149])
# Shuffle the indices in place: the first 100 will select the training
# samples, the remaining 50 the test samples.
# NOTE(review): no fixed seed, so the split differs on every run —
# consider np.random.seed(...) for reproducibility.
np.random.shuffle(index)
index
array([1, 21, 38, 118, 72, 114, 83, 29, 53, 122, 25, 149, 35, 50, 140, 143, 56, 0, 138, 17, 11, 10, 33, 61, 144, 19, 8, 88, 48, 85, 70, 112, 128, 127, 96, 137, 28, 107, 82, 77, 78, 79, 15, 146, 49, 12, 135, 145, 55, 2, 47, 80, 3, 141, 30, 111, 43, 133, 66, 13, 91, 54, 110, 124, 104, 106, 32, 52, 7, 120, 148, 123, 97, 18, 39, 126, 92, 60, 74, 46, 4, 23, 57, 58, 36, 113, 41, 44, 27, 142, 134, 98, 131, 64, 59, 94, 115, 42, 103, 5, 130, 102, 108, 117, 100, 105, 75, 65, 40, 139, 125, 84, 22, 109, 51, 63, 62, 99, 101, 73, 6, 9, 69, 67, 121, 20, 87, 147, 71, 14, 81, 16, 68, 86, 37, 95, 90, 34, 129, 136, 116, 93, 26, 132, 119, 45, 24, 89, 76, 31])
# 150 samples: the first 100 shuffled indices form the training set, the
# last 50 the test set (used to verify the algorithm generalises).
# In production the trained model would then classify fresh real-world data.
train_idx, test_idx = index[:100], index[100:]
X_train, X_test = X[train_idx], X[test_idx]
# Use the exact same index split for the labels so every sample stays
# aligned with its label. (The original used index[-50:] here — equivalent
# for 150 samples, but inconsistent with the X split and fragile.)
y_train, y_test = y[train_idx], y[test_idx]
# The data are simple — only four attributes.
# p=1 -> Manhattan distance; p=2 -> Euclidean distance.
# Rule of thumb: k should not exceed the square ROOT of the sample count
# (100 ** 0.5 = 10). Fewer neighbours is not better: k=1 is too brittle,
# a single noisy neighbour decides the class.
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', p=1, n_jobs=4)
knn.fit(X_train, y_train)   # train the model
y_ = knn.predict(X_test)    # predict on the held-out samples
knn.score(X_test, y_test)   # mean accuracy on the test set
1.0
# Inspect the predicted labels.
y_
array([2, 2, 2, 2, 2, 2, 1, 1, 0, 2, 2, 1, 0, 2, 1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 2, 0, 1, 2, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 0, 0, 1, 1, 0])
# predict_proba returns, for each test sample, the probability of each class.
# Trailing-underscore names (proba_, y_) follow the sklearn convention for
# values produced/derived by a fitted estimator.
proba_ = knn.predict_proba(X_test)
proba_
array([[ 0. , 0. , 1. ], [ 0. , 0. , 1. ], [ 0. , 0. , 1. ], [ 0. , 0. , 1. ], [ 0. , 0. , 1. ], [ 0. , 0. , 1. ], [ 0. , 1. , 0. ], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ], [ 0. , 0. , 1. ], [ 0. , 0. , 1. ], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ], [ 0. , 0. , 1. ], [ 0. , 1. , 0. ], [ 0. , 0.6, 0.4], [ 0. , 1. , 0. ], [ 0. , 1. , 0. ], [ 0. , 0.2, 0.8], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ], [ 1. , 0. , 0. ], [ 0. , 1. , 0. ], [ 0. , 1. , 0. ], [ 0. , 0. , 1. ], [ 1. , 0. , 0. ], [ 0. , 1. , 0. ], [ 0. , 0. , 1. ], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ], [ 0. , 0.6, 0.4], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ], [ 0. , 1. , 0. ], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ], [ 0. , 0. , 1. ], [ 0. , 0. , 1. ], [ 0. , 0. , 1. ], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ], [ 0. , 0. , 1. ], [ 0. , 0.4, 0.6], [ 1. , 0. , 0. ], [ 1. , 0. , 0. ], [ 0. , 1. , 0. ], [ 0. , 1. , 0. ], [ 1. , 0. , 0. ]])
# Map the numeric labels back to species names.
iris.target_names
# 'setosa'     -> label 0
# 'versicolor' -> label 1
# 'virginica'  -> label 2
array(['setosa', 'versicolor', 'virginica'], dtype='<U10') 12
# Index of the maximum probability per row == the predicted class label
# (matches knn.predict for these data).
proba_.argmax(axis=1)
array([2, 2, 2, 2, 2, 2, 1, 1, 0, 2, 2, 1, 0, 2, 1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 2, 0, 1, 2, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 0, 0, 1, 1, 0], dtype=int64)
# score() wraps the accuracy computation (predict + compare against y_test).
knn.score(X_test, y_test)
1.0
# Eyeball-compare the predictions with the ground truth:
# mostly matching -> the model is OK; mostly wrong -> it is not.
print(y_)
print('-----------')
print(y_test)
[2 2 2 2 2 2 1 1 0 2 2 1 0 2 1 1 1 1 2 1 0 0 1 1 2 0 1 2 1 0 1 0 1 1 0 1 1 0 2 2 2 1 0 2 2 0 0 1 1 0]
-----------
[2 2 2 2 2 2 1 1 0 2 2 1 0 2 1 1 1 1 2 1 0 0 1 1 2 0 1 2 1 0 1 0 1 1 0 1 1 0 2 2 2 1 0 2 2 0 0 1 1 0]
# Accuracy computed by hand: the fraction of test samples predicted correctly.
# Use len(y_test) instead of a hard-coded 50 so the split size can change.
(y_ == y_test).sum() / len(y_test)
1.0
相关知识
机器学习案例:鸢尾花分类——基于Scikit
花了1个月时间,把Python库全部整理出来了,覆盖所有,建议收藏
python鸢尾花数据集的分类问题 -- 逻辑回归问题研究
CLIP(下)
网址: scikit https://m.huajiangbk.com/newsview387256.html
上一篇: 数据分析之鸢尾花简单分析 |
下一篇: 《机器学习》分析鸢尾花数据集 |