机器学习Sklearn实战——KNN算法

KNN鸢尾花分类

import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
import numpy as np
# Load the iris data as (features, labels). return_X_y is passed by keyword
# because current scikit-learn releases reject it as a positional argument.
X, y = datasets.load_iris(return_X_y=True)
X = X[:, :2]  # keep only the first two features so the points can be drawn in 2-D
plt.scatter(X[:, 0], X[:, 1], c=y)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
x1 = np.linspace(4, 8, 100)  # 100 horizontal-axis values from 4 to 8
y1 = np.linspace(2, 4.5, 80)  # 80 vertical-axis values from 2 to 4.5
X1, Y1 = np.meshgrid(x1, y1)
X1 = X1.reshape(-1, 1)
Y1 = Y1.reshape(-1, 1)
X_test = np.concatenate([X1, Y1], axis=1)  # shape (8000, 2): one row per grid point

from matplotlib.colors import ListedColormap
lc1 = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])  # light colours for the predicted regions
lc2 = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])  # saturated colours for the training points
# Predict a class for every grid point, then draw the grid as the background
# and overlay the real samples on top of it.
y_ = knn.predict(X_test)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap=lc1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=lc2)

机器学习Sklearn实战——KNN算法

KNN参数的筛选

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import cross_val_score

# return_X_y is passed by keyword because current scikit-learn releases
# reject it as a positional argument.
X, y = datasets.load_iris(return_X_y=True)
knn = KNeighborsClassifier()
# Per-fold accuracy of the default classifier under 6-fold cross-validation.
score = cross_val_score(knn, X, y, scoring="accuracy", cv=6)
print(score)

# Scan k = 1..13 and record the mean cross-validated error rate for each.
# The upper bound comes from sqrt(150) ~= 12.2, rounded up to 13.
erros = []
for i in range(1, 14):
    knn = KNeighborsClassifier(n_neighbors=i)
    score = cross_val_score(knn, X, y, scoring="accuracy", cv=6).mean()
    erros.append(1 - score)  # error rate = 1 - mean accuracy
import matplotlib.pyplot as plt
# Error rate against k; the lowest point marks the best neighbour count.
plt.plot(np.arange(1, 14), erros)

机器学习Sklearn实战——KNN算法

# Compare the two neighbour-weighting schemes at k = 12 by their mean
# 6-fold cross-validated accuracy.
weights = ["uniform", "distance"]
for w in weights:
    knn = KNeighborsClassifier(n_neighbors=12, weights=w)
    fold_scores = cross_val_score(knn, X, y, scoring="accuracy", cv=6)
    print(fold_scores.mean())
0.98
0.9733333333333333
# Evaluate every (k, weighting) pair and store the mean 6-fold accuracy
# under a key such as "uniform12".
result = {}
for k in range(1, 14):
    for w in weights:
        knn = KNeighborsClassifier(n_neighbors=k, weights=w)
        fold_scores = cross_val_score(knn, X, y, scoring="accuracy", cv=6)
        sm = fold_scores.mean()
        result[f"{w}{k}"] = sm
result
{'uniform1': 0.96,
 'distance1': 0.96,
 'uniform2': 0.94,
 'distance2': 0.96,
 'uniform3': 0.9666666666666667,
 'distance3': 0.9666666666666667,
 'uniform4': 0.9666666666666667,
 'distance4': 0.9666666666666667,
 'uniform5': 0.9666666666666667,
 'distance5': 0.9666666666666667,
 'uniform6': 0.9666666666666667,
 'distance6': 0.96,
 'uniform7': 0.9733333333333333,
 'distance7': 0.9733333333333333,
 'uniform8': 0.9666666666666667,
 'distance8': 0.9666666666666667,
 'uniform9': 0.9733333333333333,
 'distance9': 0.9733333333333333,
 'uniform10': 0.96,
 'distance10': 0.96,
 'uniform11': 0.9733333333333333,
 'distance11': 0.9733333333333333,
 'uniform12': 0.98,
 'distance12': 0.9733333333333333,
 'uniform13': 0.9733333333333333,
 'distance13': 0.9733333333333333}
# Position of the highest mean accuracy among the stored results
# (dicts preserve insertion order, so positions line up with the keys).
best_index = np.array(list(result.values())).argmax()
best_index
# Look the key up through the computed index instead of a hard-coded number,
# so the lookup stays correct if the scores change on another run.
list(result)[best_index]
22
'uniform12'

KNN癌症诊断

import numpy as np 
import pandas as pd
from pandas import Series,DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Read the tab-separated cancer data and discard the identifier column.
cancer = pd.read_csv("/Users/zhucan/Desktop/cancer.csv", sep="\t")
cancer.drop(columns="ID", inplace=True)

# Features are every column after the first; the label is the "Diagnosis" column.
# NOTE(review): presumably "Diagnosis" is the first remaining column - confirm against the file.
y = cancer["Diagnosis"]
X = cancer.iloc[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Exhaustive search over neighbour count, weighting scheme and Minkowski power p,
# scored by accuracy with 6-fold cross-validation.
knn = KNeighborsClassifier()
params = {
    "n_neighbors": list(range(1, 30)),
    "weights": ["distance", "uniform"],
    "p": [1, 2],
}
gcv = GridSearchCV(knn, params, scoring="accuracy", cv=6)
gcv.fit(X_train, y_train)
gcv.best_estimator_
gcv.best_score_
gcv.best_params_
y_ = gcv.predict(X_test)
# Predicting and scoring through gcv delegates to gcv.best_estimator_.
gcv.score(X_test, y_test)
# Confusion matrix: true labels as rows, predicted labels as columns.
pd.crosstab(index=y_test, columns=y_, rownames=["True"], colnames=["Predict"])
KNeighborsClassifier(n_neighbors=4, p=1, weights='distance')
0.9516666666666667
{'n_neighbors': 4, 'p': 1, 'weights': 'distance'}
0.9385964912280702
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# confusion_matrix expects (y_true, y_pred); passing the predictions first
# would transpose the matrix (rows must be the true labels).
confusion_matrix(y_test, y_)
# Per-class precision / recall / F1, using the same (y_true, y_pred) order.
print(classification_report(y_test, y_, target_names=["B", "M"]))

78/(78+5) = 0.94      78/(78+2) = 0.97

29/(29+2) = 0.94     29/(29+5) = 0.85

模型识别健康样本的效果比识别患病样本的效果更好

KNN数据归一化操作 

# Min-max normalisation: rescale each column by its own minimum and range.
col_min = X.min()
col_range = X.max() - col_min
X_norm1 = (X - col_min) / col_range

# Repeat the same grid search on the normalised features.
X_train, X_test, y_train, y_test = train_test_split(X_norm1, y, test_size=0.2)
knn = KNeighborsClassifier()
params = {
    "n_neighbors": list(range(1, 30)),
    "weights": ["distance", "uniform"],
    "p": [1, 2],
}
gcv = GridSearchCV(knn, params, scoring="accuracy", cv=6)
gcv.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
# Accuracy of the best model found by the search on the held-out split.
y_ = gcv.predict(X_test)
accuracy_score(y_test, y_)
0.9649122807017544
# Another approach: standardisation (subtract the column mean, divide by the
# column standard deviation).
col_mean = X.mean()
col_std = X.std()
X_norm2 = (X - col_mean) / col_std

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# MinMaxScaler applies the same min-max formula as the manual X_norm1 computation.
mms = MinMaxScaler()
mms.fit(X)
X2 = mms.transform(X)

# StandardScaler performs the same kind of standardisation as X_norm2.
# NOTE(review): pandas .std() defaults to the sample std (ddof=1) while
# StandardScaler uses the population std, so values differ slightly.
ss = StandardScaler()
X3 = ss.fit_transform(X)
X3

sklearn中数据拆分

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.model_selection import KFold,StratifiedKFold #KFold、StratifiedKFold将数据分成多少份

# Toy data: 8 samples with 2 random integer features and a binary target.
data = np.random.randint(0, 10, size=(8, 2))
target = np.array([0, 0, 1, 0, 1, 1, 1, 0])
# train and test are index arrays; the indices are enough to fetch the data.
# The splitter is bound to its own name so the KFold class is not overwritten
# by an instance and stays callable afterwards.
kf = KFold(n_splits=4)
for train, test in kf.split(data, target):
    print(target[train], target[test])
[1 0 1 1 1 0] [0 0]
[0 0 1 1 1 0] [1 0]
[0 0 1 0 1 0] [1 1]
[0 0 1 0 1 1] [1 0]
# Split into 4 folds; each fold keeps the class proportions of the full target.
sKFold = StratifiedKFold(n_splits=4)
for train_idx, test_idx in sKFold.split(data, target):
    train_labels = target[train_idx]
    test_labels = target[test_idx]
    print(train_labels, test_labels)
[0 0 1 1 1 0] [0 1]
[0 1 0 1 1 0] [0 1]
[0 0 1 1 1 0] [0 1]
[0 0 1 0 1 1] [1 0]

#train_test_split,KFold,StratifiedKFold作用都是将数据拆分 

str类型数据的转变与训练预测

# Load the salary data and drop the columns that are not used as features.
data = pd.read_csv("/Users/zhucan/Desktop/salary.txt")
data.drop(labels=["final_weight", "education", "capital_gain", "capital_loss"], axis=1, inplace=True)
# Every column except the last is a feature. An explicit copy is taken so the
# column assignments made later do not raise SettingWithCopyWarning.
X = data.iloc[:, 0:-1].copy()
# Select the label column by name: .iloc only accepts integer positions,
# so data.iloc["salary"] raises instead of returning the column.
y = data["salary"]
# The string-valued columns must be converted to int/float before the
# algorithm can compute with them.
# Options for the conversion: map, apply, transform.
u = X["workclass"].unique()
u
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)
# Position of 'Local-gov' inside the unique-value array u.
np.argwhere(u == 'Local-gov')[0][0]
4
def convert(x):
    """Return the position of ``x`` in the module-level unique-value array ``u``."""
    # ``u`` is resolved at call time, so rebinding ``u`` changes the lookup table.
    hits = np.argwhere(u == x)
    return hits[0, 0]


# Replace each workclass string by its index in u.
X["workclass"] = X["workclass"].map(convert)

cols = ['marital_status', 'occupation','relationship', 'race', 'sex','native_country']
for col in cols:
    # Rebinding the global ``u`` is sufficient: ``convert`` reads it on every
    # call, so it now encodes against this column's unique values.
    u = X[col].unique()
    X[col] = X[col].map(convert)
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.model_selection import KFold,StratifiedKFold

# Load the salary data, peek at the first rows, and drop the unused columns.
data = pd.read_csv("/Users/zhucan/Desktop/salary.txt")
data.head()
data.drop(labels=["final_weight", "education", "capital_gain", "capital_loss"], axis=1, inplace=True)

# Every column except the last is a feature. An explicit copy is taken so the
# column assignments made later do not raise SettingWithCopyWarning.
X = data.iloc[:, 0:-1].copy()
# The label is the "salary" column.
y = data["salary"]

# Encode the workclass strings as their position among the column's unique values.
u = X["workclass"].unique()


def convert(x):
    """Return the index at which ``x`` occurs in the unique-value array ``u``."""
    matches = np.argwhere(u == x)
    return matches[0, 0]


X["workclass"] = X["workclass"].map(convert)

# Apply the same index encoding to the remaining string-valued columns.
cols = ['marital_status','occupation','relationship','race','sex','native_country']
for col in cols:
    u = X[col].unique()

    def convert(x):
        """Return the index at which ``x`` occurs in this column's unique values."""
        matches = np.argwhere(u == x)
        return matches[0, 0]

    X[col] = X[col].map(convert)

# Evaluate a default KNN classifier with 10-fold cross-validation and print
# the mean accuracy over the folds.
n_splits = 10
kFold = KFold(n_splits)
knn = KNeighborsClassifier()
accuracy = 0
for train, test in kFold.split(X, y):
    # KFold yields positional indices, so rows are selected with .iloc;
    # label-based selection is only correct for a default RangeIndex.
    knn.fit(X.iloc[train], y.iloc[train])
    acc = knn.score(X.iloc[test], y.iloc[test])
    # Each fold contributes 1/n_splits of the mean accuracy.
    accuracy += acc / n_splits
print(accuracy)
0.7973345728987424
上一篇:machinefinal


下一篇:KNN算法