文章目录
学习资料
kaggle
数据集、源文件等
一、导包
# -*- coding: utf-8 -*
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
# 设置中文字体
myfont = matplotlib.font_manager.FontProperties(fname="./DroidSansFallback.ttf")
plt.rcParams['axes.unicode_minus'] = False
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
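# sklearn.preprocessing: scale() standardizes data (centers to zero mean, scales to unit variance); StandardScaler learns that transform on the training set so the same transform can then be applied to the test set.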
from sklearn.preprocessing import scale, StandardScaler
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.neighbors import KNeighborsClassifier # k近邻
from sklearn.svm import SVC # SVM
from sklearn.neural_network import MLPClassifier # 前馈神经网络
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.naive_bayes import GaussianNB #
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
import warnings
warnings.filterwarnings('ignore')
二、数据探索
# Load the raw stroke dataset; `data` is kept as read and `df` is the working copy.
data = pd.read_csv(
    "data/healthcare-dataset-stroke-data.csv",
    encoding='UTF-8',
)
df = data.copy()
df.head()
# The `id` column is not used as a feature, so remove it from the working copy.
df.drop(columns=['id'], inplace=True)
df.info()  # author's note: the bmi column has some missing values
df.describe()
# Author's note: the stroke rate looks low, only about 0.048.
三、数据可视化
# Figure 1: distribution of the stroke label, gender and age.
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha property

# Stroke label counts (1 = stroke). The title uses "中风" (stroke).
plt.subplot2grid((2, 9), (0, 0), colspan=2)
df.stroke.value_counts().plot(kind='bar')
plt.title('中风情况(1为中风)', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

# Gender counts.
plt.subplot2grid((2, 9), (0, 3), colspan=2)
df.gender.value_counts().plot(kind='bar')
plt.title('性别情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

# Age distribution as a kernel density plot.
plt.subplot2grid((2, 9), (0, 6), colspan=2)
df.age.plot(kind='kde')
plt.title('年龄情况', fontproperties=myfont)
plt.ylabel('年龄', fontproperties=myfont)
# Figure 2: hypertension, heart disease and marital status counts.
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha property

# Hypertension counts (1 = hypertension).
plt.subplot2grid((2, 9), (0, 0), colspan=2)
df.hypertension.value_counts().plot(kind='bar')
plt.title('高血压情况(1为高血压)', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

# Heart disease counts: this panel must read the heart-disease column, not
# hypertension, otherwise it duplicates the first panel.
# Assumes the CSV column is named `heart_disease` - confirm against the header.
plt.subplot2grid((2, 9), (0, 3), colspan=2)
df.heart_disease.value_counts().plot(kind='bar')
plt.title('心脏病情况', fontproperties=myfont)
plt.ylabel('人数密度', fontproperties=myfont)

# Marital status counts.
plt.subplot2grid((2, 9), (0, 6), colspan=2)
df.ever_married.value_counts().plot(kind='bar')
plt.title('结婚情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)
# Figure 3: work type, residence type, average glucose level and smoking status.
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha property

grid_shape = (2, 12)
# One entry per panel: (column, plot kind, title, y label, grid column offset).
panels = [
    ('work_type', 'bar', '工作类型情况', '人数', 0),
    ('Residence_type', 'bar', '住宅类型情况', '人数', 3),
    ('avg_glucose_level', 'kde', '平均血糖', '血糖浓度', 6),
    ('smoking_status', 'bar', '抽烟情况', '人数', 9),
]
for col_name, plot_kind, title_text, y_text, col_offset in panels:
    plt.subplot2grid(grid_shape, (0, col_offset), colspan=2)
    series = df[col_name]
    # Bar panels show category counts; the kde panel plots the raw values.
    if plot_kind == 'bar':
        series = series.value_counts()
    series.plot(kind=plot_kind)
    plt.title(title_text, fontproperties=myfont)
    plt.ylabel(y_text, fontproperties=myfont)
四、其他属性与中风关系可视化
# For each remaining attribute, plot its value counts among stroke rows only
# (stroke == 1). avg_glucose_level, age, bmi and the label itself are excluded.
tmp = df.drop(['avg_glucose_level', 'age', 'bmi', 'stroke'], axis=1)
stroke_rows = df.stroke == 1
for column in tmp.columns:
    # Single .loc selection instead of chained indexing.
    tmp.loc[stroke_rows, column].value_counts().plot(kind='bar')
    plt.title(column + '=》中风', fontproperties=myfont)
    plt.ylabel('人数', fontproperties=myfont)
    plt.xlabel(column + '情况', fontproperties=myfont)
    # Show inside the loop so every attribute gets its own figure.
    plt.show()
五、数据预处理
# Data preprocessing: turn the string-typed columns into numeric ones.

# List the string columns that have more than two distinct values.
# Iterating the frame returned by select_dtypes yields its column names.
for column in df.select_dtypes(include=['object']):
    if len(np.unique(df[column])) > 2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())  # show the categories and their counts

# Author's note: gender has a single 'Other' row; removing it makes gender binary.
# Drop every matching row by label: this is a no-op when there is none (instead
# of raising IndexError on index[0]) and still leaves gender binary when there
# are several.
df.drop(df.index[df['gender'] == 'Other'], axis=0, inplace=True)

# One-hot encode the remaining multi-valued columns.
df = pd.get_dummies(df, columns=['work_type'])
df = pd.get_dummies(df, columns=['smoking_status'])

# List the string columns that have at most two distinct values.
for column in df.select_dtypes(include=['object']):
    if len(np.unique(df[column])) <= 2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())

# Encode the two-valued columns as integers with LabelEncoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.gender = le.fit_transform(df.gender)
df.ever_married = le.fit_transform(df.ever_married)
df.Residence_type = le.fit_transform(df.Residence_type)
六、缺失值处理
# Inspect the number of missing values per column.
df.isna().sum()
var_names = list(df)

# Fill missing values with a k-nearest-neighbours imputer (k = 3).
# NOTE(review): the imputer is fitted on every column of df, including the
# `stroke` label, and before any train/test split - confirm this is acceptable.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
dff = imputer.fit_transform(df)  # returns a NumPy array

# Rebuild the frame on df's own index. Rows were dropped from df during
# preprocessing, so its index has gaps; a default 0..n-1 index would make the
# assignment below align bmi values to the wrong rows and leave a NaN behind.
dff = pd.DataFrame(dff, columns=var_names, index=df.index)
df.bmi = dff.bmi

# Safety net: drop any row that still contains a missing value.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()
七、模型预测
# Split into features and label, standardize, then compare several classifiers
# by accuracy on the held-out 20% test split.
y = df['stroke']
x = df.drop(['stroke'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11, shuffle=True)

# Fit the scaler on the training split only and reuse it for the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Classifier classes to compare; each is instantiated with default parameters.
algorithms = [
    LogisticRegression,
    KNeighborsClassifier,
    MLPClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GaussianNB,
    AdaBoostClassifier,
    BaggingClassifier,
]

# Collect one row per model and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, so rows are gathered in a list.
# NOTE(review): the author notes a stroke rate of about 0.048, so accuracy
# alone may look high for a model that rarely predicts stroke.
score_rows = []
for algorithm in algorithms:
    score = {"算法": algorithm.__name__}
    model = algorithm().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    score["accuracy_score"] = accuracy_score(y_test, y_pred) * 100
    score_rows.append(score)
score_df = pd.DataFrame(score_rows)
score_df.sort_values(by="accuracy_score", ascending=False, inplace=True)
score_df

# Bar chart of the models ranked by accuracy (percent).
score_df.plot(x='算法', y='accuracy_score', kind='bar')
plt.title('算法排名', fontproperties=myfont)
plt.ylabel('准确率', fontproperties=myfont)
plt.xlabel('算法', fontproperties=myfont)
plt.show()
完整代码
# -*- coding: utf-8 -*
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import matplotlib
# 设置中文字体
myfont = matplotlib.font_manager.FontProperties(fname="./DroidSansFallback.ttf")
plt.rcParams['axes.unicode_minus'] = False
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
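# sklearn.preprocessing: scale() standardizes data (centers to zero mean, scales to unit variance); StandardScaler learns that transform on the training set so the same transform can then be applied to the test set.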
from sklearn.preprocessing import scale, StandardScaler
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.neighbors import KNeighborsClassifier # k近邻
from sklearn.svm import SVC # SVM
from sklearn.neural_network import MLPClassifier # 前馈神经网络
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.naive_bayes import GaussianNB #
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
import warnings
warnings.filterwarnings('ignore')
# Load the raw stroke dataset; `data` is kept as read and `df` is the working copy.
data = pd.read_csv(
    "data/healthcare-dataset-stroke-data.csv",
    encoding='UTF-8',
)
df = data.copy()
df.head()
# The `id` column is not used as a feature, so remove it from the working copy.
df.drop(columns=['id'], inplace=True)
df.info()  # author's note: the bmi column has some missing values
df.describe()
# Author's note: the stroke rate looks low, only about 0.048.
# Figure 1: distribution of the stroke label, gender and age.
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha property

# Stroke label counts (1 = stroke). The title uses "中风" (stroke).
plt.subplot2grid((2, 9), (0, 0), colspan=2)
df.stroke.value_counts().plot(kind='bar')
plt.title('中风情况(1为中风)', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

# Gender counts.
plt.subplot2grid((2, 9), (0, 3), colspan=2)
df.gender.value_counts().plot(kind='bar')
plt.title('性别情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

# Age distribution as a kernel density plot.
plt.subplot2grid((2, 9), (0, 6), colspan=2)
df.age.plot(kind='kde')
plt.title('年龄情况', fontproperties=myfont)
plt.ylabel('年龄', fontproperties=myfont)
# Figure 2: hypertension, heart disease and marital status counts.
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha property

# Hypertension counts (1 = hypertension).
plt.subplot2grid((2, 9), (0, 0), colspan=2)
df.hypertension.value_counts().plot(kind='bar')
plt.title('高血压情况(1为高血压)', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)

# Heart disease counts: this panel must read the heart-disease column, not
# hypertension, otherwise it duplicates the first panel.
# Assumes the CSV column is named `heart_disease` - confirm against the header.
plt.subplot2grid((2, 9), (0, 3), colspan=2)
df.heart_disease.value_counts().plot(kind='bar')
plt.title('心脏病情况', fontproperties=myfont)
plt.ylabel('人数密度', fontproperties=myfont)

# Marital status counts.
plt.subplot2grid((2, 9), (0, 6), colspan=2)
df.ever_married.value_counts().plot(kind='bar')
plt.title('结婚情况', fontproperties=myfont)
plt.ylabel('人数', fontproperties=myfont)
# Figure 3: work type, residence type, average glucose level and smoking status.
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha property

grid_shape = (2, 12)
# One entry per panel: (column, plot kind, title, y label, grid column offset).
panels = [
    ('work_type', 'bar', '工作类型情况', '人数', 0),
    ('Residence_type', 'bar', '住宅类型情况', '人数', 3),
    ('avg_glucose_level', 'kde', '平均血糖', '血糖浓度', 6),
    ('smoking_status', 'bar', '抽烟情况', '人数', 9),
]
for col_name, plot_kind, title_text, y_text, col_offset in panels:
    plt.subplot2grid(grid_shape, (0, col_offset), colspan=2)
    series = df[col_name]
    # Bar panels show category counts; the kde panel plots the raw values.
    if plot_kind == 'bar':
        series = series.value_counts()
    series.plot(kind=plot_kind)
    plt.title(title_text, fontproperties=myfont)
    plt.ylabel(y_text, fontproperties=myfont)
# For each remaining attribute, plot its value counts among stroke rows only
# (stroke == 1). avg_glucose_level, age, bmi and the label itself are excluded.
tmp = df.drop(['avg_glucose_level', 'age', 'bmi', 'stroke'], axis=1)
stroke_rows = df.stroke == 1
for column in tmp.columns:
    # Single .loc selection instead of chained indexing.
    tmp.loc[stroke_rows, column].value_counts().plot(kind='bar')
    plt.title(column + '=》中风', fontproperties=myfont)
    plt.ylabel('人数', fontproperties=myfont)
    plt.xlabel(column + '情况', fontproperties=myfont)
    # Show inside the loop so every attribute gets its own figure.
    plt.show()
# Data preprocessing: turn the string-typed columns into numeric ones.

# List the string columns that have more than two distinct values.
# Iterating the frame returned by select_dtypes yields its column names.
for column in df.select_dtypes(include=['object']):
    if len(np.unique(df[column])) > 2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())  # show the categories and their counts

# Author's note: gender has a single 'Other' row; removing it makes gender binary.
# Drop every matching row by label: this is a no-op when there is none (instead
# of raising IndexError on index[0]) and still leaves gender binary when there
# are several.
df.drop(df.index[df['gender'] == 'Other'], axis=0, inplace=True)

# One-hot encode the remaining multi-valued columns.
df = pd.get_dummies(df, columns=['work_type'])
df = pd.get_dummies(df, columns=['smoking_status'])
df.info()

# List the string columns that have at most two distinct values.
for column in df.select_dtypes(include=['object']):
    if len(np.unique(df[column])) <= 2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())

# Encode the two-valued columns as integers with LabelEncoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.gender = le.fit_transform(df.gender)
df.ever_married = le.fit_transform(df.ever_married)
df.Residence_type = le.fit_transform(df.Residence_type)
# Handle missing values.
# Inspect the number of missing values per column.
df.isna().sum()
var_names = list(df)

# Fill missing values with a k-nearest-neighbours imputer (k = 3).
# NOTE(review): the imputer is fitted on every column of df, including the
# `stroke` label, and before any train/test split - confirm this is acceptable.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
dff = imputer.fit_transform(df)  # returns a NumPy array

# Rebuild the frame on df's own index. Rows were dropped from df during
# preprocessing, so its index has gaps; a default 0..n-1 index would make the
# assignment below align bmi values to the wrong rows and leave a NaN behind.
dff = pd.DataFrame(dff, columns=var_names, index=df.index)
df.bmi = dff.bmi

# Safety net: drop any row that still contains a missing value.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()
# Split into features and label, standardize, then compare several classifiers
# by accuracy on the held-out 20% test split.
y = df['stroke']
x = df.drop(['stroke'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11, shuffle=True)

# Fit the scaler on the training split only and reuse it for the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Classifier classes to compare; each is instantiated with default parameters.
algorithms = [
    LogisticRegression,
    KNeighborsClassifier,
    MLPClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GaussianNB,
    AdaBoostClassifier,
    BaggingClassifier,
]

# Collect one row per model and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, so rows are gathered in a list.
# NOTE(review): the author notes a stroke rate of about 0.048, so accuracy
# alone may look high for a model that rarely predicts stroke.
score_rows = []
for algorithm in algorithms:
    score = {"算法": algorithm.__name__}
    model = algorithm().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    score["accuracy_score"] = accuracy_score(y_test, y_pred) * 100
    score_rows.append(score)
score_df = pd.DataFrame(score_rows)
score_df.sort_values(by="accuracy_score", ascending=False, inplace=True)

# Bar chart of the models ranked by accuracy (percent).
score_df.plot(x='算法', y='accuracy_score', kind='bar')
plt.title('算法排名', fontproperties=myfont)
plt.ylabel('准确率', fontproperties=myfont)
plt.xlabel('算法', fontproperties=myfont)
plt.show()