【kaggle】中风预测

文章目录

学习资料

kaggle
数据集、源文件等

一、导包

# -*- coding: utf-8 -*

import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline 
# Font with CJK glyphs, loaded from a local .ttf and passed to every Chinese title/label below.
myfont = matplotlib.font_manager.FontProperties(fname="./DroidSansFallback.ttf")
# Draw negative tick labels with an ASCII hyphen rather than the Unicode minus sign.
plt.rcParams.update({'axes.unicode_minus': False})


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# preprocessing是预处理模块 scale函数实现标准化(去均值的中心化(均值为0)和方差的规模化(方差为1));StandarScaler可以在训练数据集上做了标准转换操作之后,把相同的转换应用到测试训练集中。
from sklearn.preprocessing import scale, StandardScaler 
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.neighbors import KNeighborsClassifier # k近邻
from sklearn.svm import SVC # SVM
from sklearn.neural_network import MLPClassifier # 前馈神经网络
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.ensemble import RandomForestClassifier #随机森林 
from sklearn.naive_bayes import GaussianNB # 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

import warnings
warnings.filterwarnings('ignore')

二、数据探索

# Read the raw dataset; `data` stays untouched and `df` is the working copy.
data = pd.read_csv("data/healthcare-dataset-stroke-data.csv", encoding='UTF-8')
df = data.copy()
df.head()

# The `id` column is only a row identifier, so it is removed.
df.drop(columns=['id'], inplace=True)
df.info()  # author's observation: `bmi` has some missing values

# Summary statistics; the author noted a stroke rate of only about 0.048.
df.describe()

三、数据可视化

# Figure 1: distribution of the target (stroke), gender and age.
fig = plt.figure()
fig.set(alpha=0.2)  # alpha of the figure artist

# Stroke outcome counts (1 = stroke). The title previously said "中分", a typo for "中风" (stroke).
plt.subplot2grid((2,9),(0,0),colspan=2)
df.stroke.value_counts().plot(kind='bar')
plt.title('中风情况(1为中风)',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Gender counts
plt.subplot2grid((2,9),(0,3),colspan=2)
df.gender.value_counts().plot(kind='bar')
plt.title('性别情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Age distribution as a kernel density estimate
plt.subplot2grid((2,9),(0,6),colspan=2)
df.age.plot(kind='kde')
plt.title('年龄情况',fontproperties=myfont)
# On a KDE plot the x-axis carries the variable (age) and the y-axis the density.
plt.xlabel('年龄',fontproperties=myfont)
plt.ylabel('密度',fontproperties=myfont)

# Figure 2: hypertension, heart disease and marital status counts.
fig = plt.figure()
fig.set(alpha=0.2)  # alpha of the figure artist

# Hypertension counts (1 = hypertension)
plt.subplot2grid((2,9),(0,0),colspan=2)
df.hypertension.value_counts().plot(kind='bar')
plt.title('高血压情况(1为高血压)',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Heart disease counts. This panel previously plotted `hypertension` a second
# time; it must use the `heart_disease` column to match its title.
plt.subplot2grid((2,9),(0,3),colspan=2)
df.heart_disease.value_counts().plot(kind='bar')
plt.title('心脏病情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)  # bar heights are counts, not densities

# Marital status counts
plt.subplot2grid((2,9),(0,6),colspan=2)
df.ever_married.value_counts().plot(kind='bar')
plt.title('结婚情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Figure 3: work type, residence type, average glucose level and smoking status.
fig = plt.figure()
fig.set(alpha=0.2)  # alpha of the figure artist

# Work type counts
plt.subplot2grid((2,12),(0,0),colspan=2)
df.work_type.value_counts().plot(kind='bar')
plt.title('工作类型情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Residence type counts
plt.subplot2grid((2,12),(0,3),colspan=2)
df.Residence_type.value_counts().plot(kind='bar')
plt.title('住宅类型情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Average glucose level as a kernel density estimate
plt.subplot2grid((2,12),(0,6),colspan=2)
df.avg_glucose_level.plot(kind='kde')
plt.title('平均血糖',fontproperties=myfont)
# On a KDE plot the x-axis carries the variable (glucose level) and the y-axis the density.
plt.xlabel('血糖浓度',fontproperties=myfont)
plt.ylabel('密度',fontproperties=myfont)

# Smoking status counts
plt.subplot2grid((2,12),(0,9),colspan=2)
df.smoking_status.value_counts().plot(kind='bar')
plt.title('抽烟情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

四、其他属性与中风关系可视化

# For every remaining attribute, plot how the stroke-positive rows (stroke == 1)
# are distributed over its values. The continuous columns and the target itself
# are left out.
tmp = df.drop(['avg_glucose_level','age','bmi','stroke'],axis=1)
stroke_mask = df.stroke == 1  # loop-invariant, so computed once
for column in tmp.columns:
    # A single .loc selection replaces the chained tmp[column][mask] indexing.
    tmp.loc[stroke_mask, column].value_counts().plot(kind='bar')
    # The title previously said "中分", a typo for "中风" (stroke).
    plt.title(column+'=》中风',fontproperties=myfont)
    plt.ylabel('人数',fontproperties=myfont)
    plt.xlabel(column+'情况',fontproperties=myfont)
    plt.show()

五、数据预处理

# Pre-processing of the string (object dtype) columns.
# Print the value distribution of every string column with more than two distinct values.
for column in df.select_dtypes(include=['object']):  # iterating the selected frame yields column names
    if len(np.unique(df[column]))>2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())

# `gender` has a rare 'Other' category (a single row according to the author);
# removing it makes gender a two-valued attribute. Dropping by the whole matching
# index (instead of index[0]) removes every such row and is a harmless no-op when
# there is none, where index[0] would raise IndexError.
df.drop(df[df['gender']=='Other'].index,axis=0,inplace=True)

# One-hot encode the remaining multi-valued attributes. The dummy columns are
# appended in the order the source columns are listed.
df=pd.get_dummies(df,columns=['work_type','smoking_status'])

# Report the string columns that take at most two distinct values.
for column in df.select_dtypes(include=['object']):
    if np.unique(df[column]).size > 2:
        continue
    print(f"==== [COLUMNS: {column}] ====")
    print(df[column].value_counts())

# Replace each two-valued string column with the integer codes produced by
# sklearn's LabelEncoder (the same encoder object is refitted per column).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for binary_column in ('gender', 'ever_married', 'Residence_type'):
    df[binary_column] = le.fit_transform(df[binary_column])

六、缺失值处理

# Count missing values per column.
df.isna().sum()

var_names = list(df)
# Fill missing values with a 3-nearest-neighbours imputer.
# NOTE(review): the imputer is fitted on every column, including the target
# `stroke`, before the train/test split. Consider fitting on training features only.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
dff = imputer.fit_transform(df)  # returns a NumPy array, so the row index is lost
# Re-attach df's own index. Rows were dropped from df earlier, so a default
# RangeIndex would shift the imputed values of all later rows on assignment and
# leave one bmi as NaN (the "one remaining missing value" seen previously).
dff = pd.DataFrame(dff,columns=var_names,index=df.index)
df.bmi=dff.bmi


# Safety net: drop any row that still has a missing value (none expected once the index is aligned).
df.dropna(axis=0,how='any',inplace=True)
df.isna().sum()

七、模型预测

# Separate the features from the `stroke` target.
x = df.drop(columns=['stroke'])
y = df['stroke']
# Hold out 20% of the rows as the test split, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11, shuffle=True
)

# Learn the scaling statistics from the training split only, then apply the
# same transform to the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)



# Classifier classes to compare; each is instantiated with its default hyper-parameters.
algorithms=[
    LogisticRegression,
    KNeighborsClassifier,
    MLPClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GaussianNB,
    AdaBoostClassifier,
    BaggingClassifier
]


# Fit every classifier on the training split and record its test accuracy in percent.
# The rows are collected in a list and turned into a DataFrame once, because
# DataFrame.append was removed in pandas 2.0.
# NOTE(review): the author observed a stroke rate of about 0.048, so accuracy alone
# is probably a weak metric here. Consider recall/F1/ROC-AUC as well.
scores = []
for algorithm in algorithms:
    model=algorithm().fit(x_train,y_train)
    y_pred=model.predict(x_test)
    scores.append({
        "算法": algorithm.__name__,
        "accuracy_score": accuracy_score(y_test,y_pred)*100,
    })
score_df = pd.DataFrame(scores)


# Best model first.
score_df.sort_values(by="accuracy_score",ascending = False,inplace=True)
score_df

# Bar chart of the ranking.
score_df.plot(x='算法',y='accuracy_score',kind='bar')
plt.title('算法排名',fontproperties=myfont)
plt.ylabel('准确率',fontproperties=myfont)
plt.xlabel('算法',fontproperties=myfont)
plt.show()

完整代码

# -*- coding: utf-8 -*

import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import matplotlib
# Font with CJK glyphs, loaded from a local .ttf and passed to every Chinese title/label below.
myfont = matplotlib.font_manager.FontProperties(fname="./DroidSansFallback.ttf")
# Draw negative tick labels with an ASCII hyphen rather than the Unicode minus sign.
plt.rcParams.update({'axes.unicode_minus': False})


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# preprocessing是预处理模块 scale函数实现标准化(去均值的中心化(均值为0)和方差的规模化(方差为1));StandarScaler可以在训练数据集上做了标准转换操作之后,把相同的转换应用到测试训练集中。
from sklearn.preprocessing import scale, StandardScaler 
from sklearn.linear_model import LogisticRegression # 逻辑回归
from sklearn.neighbors import KNeighborsClassifier # k近邻
from sklearn.svm import SVC # SVM
from sklearn.neural_network import MLPClassifier # 前馈神经网络
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.ensemble import RandomForestClassifier #随机森林 
from sklearn.naive_bayes import GaussianNB # 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

import warnings
warnings.filterwarnings('ignore')

# Read the raw dataset; `data` stays untouched and `df` is the working copy.
data = pd.read_csv("data/healthcare-dataset-stroke-data.csv", encoding='UTF-8')
df = data.copy()
df.head()

# The `id` column is only a row identifier, so it is removed.
df.drop(columns=['id'], inplace=True)
df.info()  # author's observation: `bmi` has some missing values

# Summary statistics; the author noted a stroke rate of only about 0.048.
df.describe()

# Figure 1: distribution of the target (stroke), gender and age.
fig = plt.figure()
fig.set(alpha=0.2)  # alpha of the figure artist

# Stroke outcome counts (1 = stroke). The title previously said "中分", a typo for "中风" (stroke).
plt.subplot2grid((2,9),(0,0),colspan=2)
df.stroke.value_counts().plot(kind='bar')
plt.title('中风情况(1为中风)',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Gender counts
plt.subplot2grid((2,9),(0,3),colspan=2)
df.gender.value_counts().plot(kind='bar')
plt.title('性别情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Age distribution as a kernel density estimate
plt.subplot2grid((2,9),(0,6),colspan=2)
df.age.plot(kind='kde')
plt.title('年龄情况',fontproperties=myfont)
# On a KDE plot the x-axis carries the variable (age) and the y-axis the density.
plt.xlabel('年龄',fontproperties=myfont)
plt.ylabel('密度',fontproperties=myfont)



# Figure 2: hypertension, heart disease and marital status counts.
fig = plt.figure()
fig.set(alpha=0.2)  # alpha of the figure artist

# Hypertension counts (1 = hypertension)
plt.subplot2grid((2,9),(0,0),colspan=2)
df.hypertension.value_counts().plot(kind='bar')
plt.title('高血压情况(1为高血压)',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Heart disease counts. This panel previously plotted `hypertension` a second
# time; it must use the `heart_disease` column to match its title.
plt.subplot2grid((2,9),(0,3),colspan=2)
df.heart_disease.value_counts().plot(kind='bar')
plt.title('心脏病情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)  # bar heights are counts, not densities

# Marital status counts
plt.subplot2grid((2,9),(0,6),colspan=2)
df.ever_married.value_counts().plot(kind='bar')
plt.title('结婚情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Figure 3: work type, residence type, average glucose level and smoking status.
fig = plt.figure()
fig.set(alpha=0.2)  # alpha of the figure artist

# Work type counts
plt.subplot2grid((2,12),(0,0),colspan=2)
df.work_type.value_counts().plot(kind='bar')
plt.title('工作类型情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Residence type counts
plt.subplot2grid((2,12),(0,3),colspan=2)
df.Residence_type.value_counts().plot(kind='bar')
plt.title('住宅类型情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# Average glucose level as a kernel density estimate
plt.subplot2grid((2,12),(0,6),colspan=2)
df.avg_glucose_level.plot(kind='kde')
plt.title('平均血糖',fontproperties=myfont)
# On a KDE plot the x-axis carries the variable (glucose level) and the y-axis the density.
plt.xlabel('血糖浓度',fontproperties=myfont)
plt.ylabel('密度',fontproperties=myfont)

# Smoking status counts
plt.subplot2grid((2,12),(0,9),colspan=2)
df.smoking_status.value_counts().plot(kind='bar')
plt.title('抽烟情况',fontproperties=myfont)
plt.ylabel('人数',fontproperties=myfont)

# For every remaining attribute, plot how the stroke-positive rows (stroke == 1)
# are distributed over its values. The continuous columns and the target itself
# are left out.
tmp = df.drop(['avg_glucose_level','age','bmi','stroke'],axis=1)
stroke_mask = df.stroke == 1  # loop-invariant, so computed once
for column in tmp.columns:
    # A single .loc selection replaces the chained tmp[column][mask] indexing.
    tmp.loc[stroke_mask, column].value_counts().plot(kind='bar')
    # The title previously said "中分", a typo for "中风" (stroke).
    plt.title(column+'=》中风',fontproperties=myfont)
    plt.ylabel('人数',fontproperties=myfont)
    plt.xlabel(column+'情况',fontproperties=myfont)
    plt.show()
    
# Pre-processing of the string (object dtype) columns.
# Print the value distribution of every string column with more than two distinct values.
for column in df.select_dtypes(include=['object']):  # iterating the selected frame yields column names
    if len(np.unique(df[column]))>2:
        print(f"==== [COLUMNS: {column}] ====")
        print(df[column].value_counts())


# `gender` has a rare 'Other' category (a single row according to the author);
# removing it makes gender a two-valued attribute. Dropping by the whole matching
# index (instead of index[0]) removes every such row and is a harmless no-op when
# there is none, where index[0] would raise IndexError.
df.drop(df[df['gender']=='Other'].index,axis=0,inplace=True)


# One-hot encode the remaining multi-valued attributes. The dummy columns are
# appended in the order the source columns are listed.
df=pd.get_dummies(df,columns=['work_type','smoking_status'])


df.info()


# Report the string columns that take at most two distinct values.
for column in df.select_dtypes(include=['object']):
    if np.unique(df[column]).size > 2:
        continue
    print(f"==== [COLUMNS: {column}] ====")
    print(df[column].value_counts())


# Replace each two-valued string column with the integer codes produced by
# sklearn's LabelEncoder (the same encoder object is refitted per column).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for binary_column in ('gender', 'ever_married', 'Residence_type'):
    df[binary_column] = le.fit_transform(df[binary_column])


# Missing-value handling


# Count missing values per column.
df.isna().sum()


var_names = list(df)
# Fill missing values with a 3-nearest-neighbours imputer.
# NOTE(review): the imputer is fitted on every column, including the target
# `stroke`, before the train/test split. Consider fitting on training features only.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
dff = imputer.fit_transform(df)  # returns a NumPy array, so the row index is lost
# Re-attach df's own index. Rows were dropped from df earlier, so a default
# RangeIndex would shift the imputed values of all later rows on assignment and
# leave one bmi as NaN (the "one remaining missing value" seen previously).
dff = pd.DataFrame(dff,columns=var_names,index=df.index)
df.bmi=dff.bmi


# Safety net: drop any row that still has a missing value (none expected once the index is aligned).
df.dropna(axis=0,how='any',inplace=True)
df.isna().sum()

# Separate the features from the `stroke` target.
x = df.drop(columns=['stroke'])
y = df['stroke']
# Hold out 20% of the rows as the test split, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11, shuffle=True
)

# Learn the scaling statistics from the training split only, then apply the
# same transform to the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Classifier classes to compare; each is instantiated with its default hyper-parameters.
algorithms=[
    LogisticRegression,
    KNeighborsClassifier,
    MLPClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GaussianNB,
    AdaBoostClassifier,
    BaggingClassifier
]


# Fit every classifier on the training split and record its test accuracy in percent.
# The rows are collected in a list and turned into a DataFrame once, because
# DataFrame.append was removed in pandas 2.0.
# NOTE(review): the author observed a stroke rate of about 0.048, so accuracy alone
# is probably a weak metric here. Consider recall/F1/ROC-AUC as well.
scores = []
for algorithm in algorithms:
    model=algorithm().fit(x_train,y_train)
    y_pred=model.predict(x_test)
    scores.append({
        "算法": algorithm.__name__,
        "accuracy_score": accuracy_score(y_test,y_pred)*100,
    })
score_df = pd.DataFrame(scores)


# Best model first.
score_df.sort_values(by="accuracy_score",ascending = False,inplace=True)

# Bar chart of the ranking.
score_df.plot(x='算法',y='accuracy_score',kind='bar')
plt.title('算法排名',fontproperties=myfont)
plt.ylabel('准确率',fontproperties=myfont)
plt.xlabel('算法',fontproperties=myfont)
plt.show()


上一篇:Matplotlib设置中文


下一篇:山东省技能兴鲁大数据赛项