Task08
本次学习参照Datawhale开源学习:https://github.com/datawhalechina/machine-learning-toy-code/tree/main/ml-with-sklearn
内容安排如下,主要是一些代码实现和部分原理介绍。
8. 可视化
机器学习可视化有助于我们分析模型效果、理解模型原理、对比模型优劣。本章将介个不同的机器学习任务分别对回归、分类、聚类给出可视化实例。
8.1 回归
直接用matplotlib画出线性回归直线:
import numpy as np
def true_fun(X):
return 1.5*X + 0.2
np.random.seed(0) # 设置随机种子
n_samples = 30 # 设置采样数据点的个数
X_train = np.sort(np.random.rand(n_samples))
y_train = (true_fun(X_train) + np.random.randn(n_samples) * 0.05).reshape(n_samples,1)
from sklearn.linear_model import LinearRegression # 导入线性回归模型
model = LinearRegression() # 定义模型
model.fit(X_train[:,np.newaxis], y_train) # 训练模型
import matplotlib.pyplot as plt
X_test = np.linspace(0, 1, 100)
plt.plot(X_test, model.predict(X_test[:, np.newaxis]), label="Model")
plt.plot(X_test, true_fun(X_test), label="True function")
plt.scatter(X_train,y_train) # 画出训练集的点
plt.legend(loc="best")
plt.show()
8.2 分类
matplotlib.pyplot contourf()函数可以用来画决策边界填充轮廓线:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
data = np.array([
[0.1, 0.7],
[0.3, 0.6],
[0.4, 0.1],
[0.5, 0.4],
[0.8, 0.04],
[0.42, 0.6],
[0.9, 0.4],
[0.6, 0.5],
[0.7, 0.2],
[0.7, 0.67],
[0.27, 0.8],
[0.5, 0.72]
])
label = [1] * 6 + [0] * 6
x_min, x_max = data[:, 0].min() - 0.2, data[:, 0].max() + 0.2
y_min, y_max = data[:, 1].min() - 0.2, data[:, 1].max() + 0.2
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.002),
np.arange(y_min, y_max, 0.002)) # meshgrid如何生成网格
model_linear = svm.SVC(kernel='linear', C = 0.001)
model_linear.fit(data, label) # 训练
Z = model_linear.predict(np.c_[xx.ravel(), yy.ravel()]) # 预测
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap = plt.cm.ocean, alpha=0.6) # 决策填充轮廓线
plt.scatter(data[:6, 0], data[:6, 1], marker='o', color='r', s=100, lw=3)
plt.scatter(data[6:, 0], data[6:, 1], marker='x', color='k', s=100, lw=3)
plt.title('Linear SVM')
plt.show()
sklearn.tree.plot_tree()可以用来决策树可视化:
'''对鸢尾花数据集进行分类任务'''
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
# 导入数据集
data = load_iris()
df = pd.DataFrame(data.data, columns = data.feature_names)
# 分类标签
df['Species'] = data.target
target = np.unique(data.target)
target_names = np.unique(data.target_names)
targets = dict(zip(target, target_names))
df['Species'] = df['Species'].replace(targets)
# 划分特征和标签
x = df.drop(columns="Species")
y = df["Species"]
feature_names = x.columns
labels = y.unique()
# 划分训练集和测试集
X_train, test_x, y_train, test_lab = train_test_split(x,y,test_size = 0.4,random_state = 42)
# 使用决策树训练模型
model = DecisionTreeClassifier(max_depth =3, random_state = 42)
model.fit(X_train, y_train)
# 决策树可视化
plt.figure(figsize=(30,10), facecolor ='g')
a = tree.plot_tree(model,feature_names = feature_names,class_names = labels,rounded = True,filled = True,fontsize=14)
plt.show()
8.3 聚类
画出聚类质心:
'''生成数据'''
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
X, y = make_blobs(n_samples=1000, # 1000个样本
n_features=2, # 每个样本2个特征(2维数据)
centers=5, # 5个簇中心
random_state=42)
fig, ax=plt.subplots(1)
'''聚类'''
from sklearn.cluster import KMeans
n_clusters=5
cluster = KMeans(n_clusters=5,random_state=0).fit(X)
y_pred = cluster.fit_predict(X)
centroid=cluster.cluster_centers_ # 聚类质心
inertia=cluster.inertia_
'''画出聚类质心'''
fig, ax=plt.subplots(1)
for i in range(n_clusters):
ax.scatter(X[y_pred==i, 0], X[y_pred==i, 1],
marker='o',
s=8)
ax.scatter(centroid[:,0],centroid[:,1],marker='x',s=100,c='black')