sklearn机器学习（八）

2023-11-14 20:48:16

Task08
本次学习参照Datawhale开源学习：https://github.com/datawhalechina/machine-learning-toy-code/tree/main/ml-with-sklearn
内容安排如下，主要是一些代码实现和部分原理介绍。

8. 可视化

机器学习可视化有助于我们分析模型效果、理解模型原理、对比模型优劣。本章将针对不同的机器学习任务，分别对回归、分类、聚类给出可视化实例。

8.1 回归

直接用matplotlib画出线性回归直线：

import numpy as np

def true_fun(X):
    """Return the noise-free linear target ``1.5 * X + 0.2``.

    The arithmetic is applied as-is, so ``X`` may be a plain number or a
    NumPy array (in which case the result is computed element-wise).
    """
    slope, intercept = 1.5, 0.2
    return slope * X + intercept

# Fix the RNG seed so the sampled training set is reproducible.
np.random.seed(0)
n_samples = 30  # number of training points to draw

# Sorted random inputs; the targets are the true line plus Gaussian noise
# scaled by 0.05, stored as a column vector of shape (n_samples, 1).
X_train = np.sort(np.random.rand(n_samples))
noise = 0.05 * np.random.randn(n_samples)
y_train = (true_fun(X_train) + noise).reshape(-1, 1)

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# scikit-learn estimators expect a 2-D feature matrix, hence the reshape
# of the 1-D inputs into a single-column array.
model = LinearRegression()
model.fit(X_train.reshape(-1, 1), y_train)

# Evaluate the fitted line and the true function on an even grid over [0, 1].
X_test = np.linspace(0, 1, 100)
y_fit = model.predict(X_test.reshape(-1, 1))

plt.plot(X_test, y_fit, label="Model")
plt.plot(X_test, true_fun(X_test), label="True function")
plt.scatter(X_train, y_train)  # training points
plt.legend(loc="best")
plt.show()

8.2 分类

matplotlib.pyplot 的 contourf() 函数可以用来绘制决策边界的填充等高线：

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# Twelve 2-D sample points: the first six are labelled 1, the last six 0.
data = np.array([
    [0.1, 0.7], [0.3, 0.6], [0.4, 0.1], [0.5, 0.4], [0.8, 0.04], [0.42, 0.6],
    [0.9, 0.4], [0.6, 0.5], [0.7, 0.2], [0.7, 0.67], [0.27, 0.8], [0.5, 0.72],
])
label = [1] * 6 + [0] * 6

# Bounding box of the data, padded by 0.2 on every side, sampled on a dense
# grid (step 0.002) so the predicted regions can be drawn as filled contours.
margin = 0.2
step = 0.002
x_min, y_min = data.min(axis=0) - margin
x_max, y_max = data.max(axis=0) + margin
xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                     np.arange(y_min, y_max, step))

# Train a linear-kernel SVM on the twelve points.
model_linear = svm.SVC(kernel='linear', C=0.001)
model_linear.fit(data, label)

# Predict a class for every grid node, then fold the flat prediction vector
# back into the grid's 2-D shape for contourf.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = model_linear.predict(grid_points).reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.ocean, alpha=0.6)  # filled decision regions

# Overlay the samples: red circles for class 1, black crosses for class 0.
class_one, class_zero = data[:6], data[6:]
plt.scatter(class_one[:, 0], class_one[:, 1], marker='o', color='r', s=100, lw=3)
plt.scatter(class_zero[:, 0], class_zero[:, 1], marker='x', color='k', s=100, lw=3)
plt.title('Linear SVM')
plt.show()

sklearn.tree.plot_tree() 可以用来对决策树进行可视化：

'''对鸢尾花数据集进行分类任务'''
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

# Load the iris data set into a DataFrame, one column per feature.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Turn the integer class codes into species names.
# Each name is looked up by its integer code, so the mapping never depends
# on the alphabetical order of the names.
df['Species'] = data.target
target = np.unique(data.target)
target_names = data.target_names
targets = {int(code): str(target_names[code]) for code in target}
df['Species'] = df['Species'].map(targets)

# Split the frame into the feature matrix and the label column.
x = df.drop(columns="Species")
y = df["Species"]
feature_names = x.columns
labels = y.unique()

# Hold out 40% of the rows as a test set (fixed seed for reproducibility).
X_train, test_x, y_train, test_lab = train_test_split(
    x, y, test_size=0.4, random_state=42
)

# Fit a shallow decision tree on the training split.
model = DecisionTreeClassifier(max_depth=3, random_state=42)
model.fit(X_train, y_train)

# Draw the fitted tree.
# class_names must follow the estimator's own class order (model.classes_),
# not the order in which labels first appear in the data.
# Plain lists are passed because some scikit-learn releases reject a pandas
# Index / ndarray for these parameters (NOTE(review): seen on 1.2.x — confirm).
plt.figure(figsize=(30, 10), facecolor='g')
tree.plot_tree(
    model,
    feature_names=list(feature_names),
    class_names=list(model.classes_),
    rounded=True,
    filled=True,
    fontsize=14,
)
plt.show()

8.3 聚类

画出聚类质心：

'''生成数据'''
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# Synthetic data: 1000 two-dimensional samples generated around 5 centers.
X, y = make_blobs(n_samples=1000,
                  n_features=2,
                  centers=5,
                  random_state=42)

'''聚类'''
from sklearn.cluster import KMeans
n_clusters = 5
# Fit once and reuse the labels stored on the fitted estimator; calling
# fit_predict afterwards would simply run the whole clustering a second time.
cluster = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
y_pred = cluster.labels_

centroid = cluster.cluster_centers_  # one row per cluster center
inertia = cluster.inertia_  # inertia value reported by the fitted model

'''画出聚类质心'''
# A single figure: one scatter call per cluster (so each gets its own color
# from the default cycle), then the centroids on top as black crosses.
fig, ax = plt.subplots(1)

for i in range(n_clusters):
    members = X[y_pred == i]
    ax.scatter(members[:, 0], members[:, 1],
               marker='o',
               s=8)
ax.scatter(centroid[:, 0], centroid[:, 1], marker='x', s=100, c='black')
plt.show()

码农公寓

8. 可视化

8.1 回归

8.2 分类

8.3 聚类

相关文章