ECG心电信号的分类(MIT-BIH数据库)
基于随机森林和基于决策树的分类
最终结果分为6类,前期对MIT-BIH数据库中数据的提取和处理已完成。这里主要讲分类。
1. 决策树
from sklearn.tree import DecisionTreeClassifier # 导入DecisionTreeClassifier函数
from sklearn.model_selection import train_test_split
from sklearn import metrics # 分类结果评价函数
import os
import matplotlib.pyplot as plt
import numpy as np
# Load the data and labels first (placeholders left by the author).
#data=....
#labels=....
# Shuffle samples and labels with one shared random permutation.
sample_count = data.shape[0]
order = np.arange(sample_count)
np.random.shuffle(order)
data, labels = data[order], labels[order]
# Manual 80/20 train/test split (no separate validation set is created).
# x_* hold the samples, y_* hold the labels.
x_cut = int(0.8 * len(data))
x_train, x_test = data[:x_cut], data[x_cut:]
assert len(x_train) + len(x_test) == len(data)
y_cut = int(0.8 * len(labels))
y_train, y_test = labels[:y_cut], labels[y_cut:]
# Compared against len(data) on purpose: this also checks that data and
# labels have the same number of rows.
assert len(y_train) + len(y_test) == len(data)
# Entropy-criterion decision tree with a fixed seed.
model = DecisionTreeClassifier(random_state=42, criterion="entropy")
model.fit(x_train, y_train)  # train on the training split
print(model)  # show the fitted estimator and its parameters
# Evaluate on the held-out test split.
expected = y_test  # ground-truth labels of the test samples
predicted = model.predict(x_test)  # predicted labels of the test samples
# Per-class precision, recall and F1 score, then the confusion matrix.
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# Overall accuracy on the test split, printed as a percentage.
accuracy = metrics.accuracy_score(y_test, predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
代码运行结果:Accuracy: 95.03%
2. 随机森林
通过集成学习的思想将多棵树集成的算法,基本单元是决策树。
Bagging+决策树=随机森林
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics # 分类结果评价函数
from sklearn.metrics import plot_confusion_matrix
#import scipy.io as scio
import os
import matplotlib.pyplot as plt
import numpy as np
# Load the ECG data and labels from the txt files (placeholders left by the author).
#data=.....
#labels=.....
# Shuffle samples and labels with one shared random permutation.
sample_count = data.shape[0]
order = np.arange(sample_count)
np.random.shuffle(order)
data, labels = data[order], labels[order]
# Display names for the six classes on the confusion-matrix axes.
target_name = [str(class_id) for class_id in range(1, 7)]
# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, random_state=1, test_size=0.3
)
# Random forest of 120 trees with a fixed seed, trained on the training split.
randomforest = RandomForestClassifier(n_estimators=120, random_state=42)
randomforest.fit(x_train, y_train)
expected = y_test  # ground-truth labels of the test samples
predicted = randomforest.predict(x_test)  # predicted labels of the test samples
# Overall accuracy on the test split, printed as a percentage.
accuracy = metrics.accuracy_score(y_test, predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Draw the row-normalized confusion matrix of the trained model.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator with the same arguments.
plot_confusion_matrix(
    randomforest,
    x_test,
    y_test,
    normalize="true",
    cmap="Blues",
    display_labels=target_name,
)
plt.title("ECG classification results")
plt.show()
代码运行结果:Accuracy: 98.08%
3. k折交叉验证法
k折交叉验证用于模型调优,所有的数据都被用来训练会导致过拟合,k折交叉验证会缓解过拟合。
将数据分为k组,每次取其中一组作为验证集,其余k-1组作为训练集;重复k次,最终结果取k次验证结果的平均值。
from sklearn.tree import DecisionTreeClassifier # 导入DecisionTreeClassifier函数
from sklearn.model_selection import train_test_split
from sklearn import metrics # 分类结果评价函数
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
# Load the data and labels first (placeholders left by the author).
#data=....
#labels=....
n_splits = 10  # number of folds (K)
# Shuffle before folding, as the other experiments in this file do, so that
# data stored in a fixed order is not cut into contiguous folds. The fixed
# seed keeps the folds reproducible.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_accuracies = []  # one test accuracy per fold
for train_index, test_index in kf.split(data):
    # Each fold serves once as the held-out part; the rest is used for training.
    x_train, x_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    # A fresh entropy-criterion decision tree is trained for every fold.
    model = DecisionTreeClassifier(criterion="entropy", random_state=42)
    model.fit(x_train, y_train)
    print(model)  # show the fitted estimator and its parameters
    # Evaluate on the held-out fold.
    expected = y_test  # ground-truth labels of the held-out samples
    predicted = model.predict(x_test)  # predicted labels of the held-out samples
    # Per-class precision, recall and F1 score, then the confusion matrix.
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    accuracy = metrics.accuracy_score(y_test, predicted)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    fold_accuracies.append(accuracy)
# Average over the folds actually run instead of a hard-coded 10.
avg_accuracy = sum(fold_accuracies) / len(fold_accuracies)
print("K fold average accuracy: {}".format(avg_accuracy))
代码运行结果:K fold average accuracy: 0.9476829658037712
4.数据降维:主成分分析(PCA)
在很多情形下,变量之间存在相关性,从而增加了问题分析的复杂性;如果分别对每一个指标单独进行分析,就不能完全利用数据中的信息。
主成分分析是一种使用最广泛的数据降维算法(非监督)。 其主要用于降维。主要思想是将n维特征映射到k维上。
随机森林+PCA
from sklearn.decomposition import PCA
from typing import Any
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics # 分类结果评价函数
from matplotlib import pyplot as plt
from sklearn.metrics import plot_confusion_matrix
#import scipy.io as scio
import os
import numpy as np
#from sklearn.model_selection import KFold
# Load the ECG data and labels from the txt files (placeholders left by the author).
#data=.....
#labels=.....
# Shuffle samples and labels with one shared random permutation.
num_example = data.shape[0]
arr = np.arange(num_example)
np.random.shuffle(arr)
data = data[arr]
labels = labels[arr]
# Display names for the six classes on the confusion-matrix axes.
target_name = ['1', '2', '3', '4', '5', '6']
# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=1)
# Reduce the features to 16 principal components. The original snippet never
# applied PCA although this experiment is "random forest + PCA".
# The projection is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into the features.
pca = PCA(n_components=16)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
# Random forest of 120 trees with a fixed seed, trained on the reduced features.
randomforest = RandomForestClassifier(random_state=42, n_estimators=120)
randomforest.fit(x_train, y_train)
expected = y_test  # ground-truth labels of the test samples
predicted = randomforest.predict(x_test)  # predicted labels of the test samples
# Draw the row-normalized confusion matrix of the trained model.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator with the same arguments.
plot_confusion_matrix(
    randomforest,
    x_test,
    y_test,
    display_labels=target_name,
    cmap="Blues",
    normalize="true",
)
# Overall accuracy on the test split, printed as a percentage.
accuracy = metrics.accuracy_score(y_test, predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
plt.title("ECG classification results")
plt.show()
降维后得到16维的数据
代码运行结果:Accuracy: 98.77%
决策树+PCA
# Shuffle samples and labels with one shared random permutation.
num_example = data.shape[0]
arr = np.arange(num_example)
np.random.shuffle(arr)
data = data[arr]
labels = labels[arr]
# 70/30 train/test split with a fixed seed. The split is done before PCA so
# the projection can be fitted on the training part only.
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=0)
# Reduce the features to 16 principal components. A single fit_transform
# replaces the original fit() followed by fit_transform(), which fitted twice.
# Fitting on the training split only keeps test-set statistics out of the
# features.
pca = PCA(n_components=16)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
print(pca.explained_variance_ratio_)  # variance ratio explained by each component
# Entropy-criterion decision tree with a fixed seed, trained on the reduced features.
model = DecisionTreeClassifier(criterion="entropy", random_state=42)
model.fit(x_train, y_train)
print(model)  # show the fitted estimator and its parameters
# Evaluate on the held-out test split.
expected = y_test  # ground-truth labels of the test samples
predicted = model.predict(x_test)  # predicted labels of the test samples
# Per-class precision, recall and F1 score, then the confusion matrix.
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# Overall accuracy on the test split, printed as a percentage.
accuracy = metrics.accuracy_score(y_test, predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
代码运行结果:Accuracy: 95.67%
从上面的结果得出,决策树的效果稍弱于随机森林;用PCA对数据进行降维后,再对其进行分类,得到的效果会更好。