下载数据集到本地
分析数据
- 先知晓各个数据特征的含义,观察一下
- 找到有用的,也就是可以影响到预测标签的数据
- 没有用的数据不用管
处理数据
- 这些有用的数据中,有些可能是空值
  - 如果该列的空值较多,就用平均值或中位数填充(下面的代码对年龄用的是中位数)
  - 如果空值极少,可以直接删掉对应的行
- 有些有用的数据是字符串,不是数值,需要转化为数值
把各类模型跑一遍,找到分最高的
- 使用交叉验证,对比各个分数
- 这里只列举部分模型,还可以继续添加模型继续对比,或者调参,选出更好的
"""
# @Time : 2020/9/1
# @Author : Jimou Chen
"""
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Load the training set; the file is expected in the current working directory.
data = pd.read_csv('train.csv')

# --- Fill the gaps first ---
# Missing ages are replaced with the column's median age (median, not mean).
data['Age'] = data['Age'].fillna(data['Age'].median())

# Encode sex numerically: male -> 1, female -> 0.
# Series.map builds a fresh numeric column in one step instead of overwriting
# the text column's cells with integers one boolean mask at a time.
# NOTE: any value missing from the mapping becomes NaN.
data['Sex'] = data['Sex'].map({'male': 1, 'female': 0})

# Embarked is the port of embarkation. The original author observed that 'S'
# is the most frequent value, so missing entries are filled with 'S' before
# the ports are encoded as S -> 0, C -> 1, Q -> 2.
data['Embarked'] = data['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

# --- Select the features used for prediction ---
feature = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
x_data = data[feature]
y_data = data['Survived']  # label to predict

# Standardize the selected features with StandardScaler.
scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)
# With the data prepared, run a range of algorithms and compare their scores.
from sklearn.model_selection import cross_val_score  # cross-validated scoring


def _report_cv(model):
    """Cross-validate `model` on the prepared data and print the mean score.

    Runs 3-fold cross-validation on the module-level ``x_data`` / ``y_data``
    and returns the per-fold scores.
    """
    fold_scores = cross_val_score(model, x_data, y_data, cv=3)
    print(fold_scores.mean())  # average over the folds
    return fold_scores


# Logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
scores = _report_cv(lr)

# Neural network (multi-layer perceptron)
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=2000)
scores = _report_cv(mlp)

# k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier

kNN = KNeighborsClassifier(n_neighbors=21)
scores = _report_cv(kNN)

# Decision tree
from sklearn.tree import DecisionTreeClassifier

# min_samples_split=4: a node holding fewer than 4 samples is not split further.
d_tree = DecisionTreeClassifier(max_depth=3, min_samples_split=4)
scores = _report_cv(d_tree)

# ---- Ensemble methods from here on ----

# Random forest
from sklearn.ensemble import RandomForestClassifier

rf1 = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2)
scores = _report_cv(rf1)

# A forest made of 100 decision trees
rf2 = RandomForestClassifier(n_estimators=100, min_samples_split=4)
scores = _report_cv(rf2)

# Bagging
from sklearn.ensemble import BaggingClassifier

# Bags rf2 over 20 resamples drawn with replacement. Because rf2 is itself an
# ensemble, this step takes a while to run.
bg = BaggingClassifier(rf2, n_estimators=20)
scores = _report_cv(bg)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

adb = AdaBoostClassifier(rf2, n_estimators=20)
scores = _report_cv(adb)

# Stacking
from mlxtend.classifier import StackingClassifier

stacking = StackingClassifier(
    classifiers=[bg, mlp, lr],
    meta_classifier=LogisticRegression(),
)
scores = _report_cv(stacking)

# Voting
from sklearn.ensemble import VotingClassifier

voting_members = [
    ('ado', adb),
    ('mlp', mlp),
    ('LR', lr),
    ('kNN', kNN),
    ('d_tree', d_tree),
]
voting = VotingClassifier(voting_members)
scores = _report_cv(voting)
- 结果
0.7901234567901234
0.8024691358024691
0.8125701459034792
0.8103254769921436
0.8013468013468014
0.819304152637486
0.8204264870931538
0.7991021324354658
0.819304152637486
0.8170594837261503
Process finished with exit code 0
- 可以发现Bagging集成随机森林的效果相对不错
- 接下来就用它来试试
使用该较好模型进行预测
"""
# @Time : 2020/9/2
# @Author : Jimou Chen
"""
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
# Feature columns fed to the model; shared by the training and test preparation.
_FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


def _prepare_features(frame):
    """Return the model's feature columns of `frame` as an all-numeric copy.

    `frame` itself is left untouched. Missing ``Age`` and ``Fare`` values are
    replaced by that column's median within `frame`, missing ``Embarked``
    values by ``'S'``, and the two text columns are mapped to integer codes.
    Values outside the mappings below become NaN.
    """
    # Work on a copy of just the needed columns so the caller's frame is
    # never modified.
    features = frame[_FEATURE_COLUMNS].copy()

    # Fill numeric gaps with the median (not the mean) of the same column.
    for column in ('Age', 'Fare'):
        features[column] = features[column].fillna(features[column].median())

    # Sex: male -> 1, female -> 0.
    features['Sex'] = features['Sex'].map({'male': 1, 'female': 0})

    # Embarked (port of embarkation): the original author observed that 'S'
    # is the most frequent value, so it is used as the fill value before the
    # ports are encoded as S -> 0, C -> 1, Q -> 2.
    features['Embarked'] = (
        features['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})
    )
    return features


# Prepare the training set.
def deal_train(train_data, scaler=None):
    """Turn the raw training frame into standardized features and labels.

    Args:
        train_data: DataFrame holding the feature columns plus ``Survived``.
            It is not modified.
        scaler: Optional scaler object. It is fitted on the training features
            here, so the caller can hand the same instance to ``deal_test``.
            When omitted, a fresh ``StandardScaler`` is used.

    Returns:
        ``(x_data, y_data)``: the scaled feature matrix and the ``Survived``
        column.
    """
    x_data = _prepare_features(train_data)
    y_data = train_data['Survived']  # label to predict

    if scaler is None:
        scaler = StandardScaler()
    x_data = scaler.fit_transform(x_data)
    return x_data, y_data


# Prepare the test set.
def deal_test(test_data, label_data, scaler=None):
    """Turn the raw test frame into standardized features and labels.

    Args:
        test_data: DataFrame holding the feature columns. It is not modified.
        label_data: DataFrame whose ``Survived`` column is returned as the
            labels. Assumes its rows are in the same order as `test_data`;
            nothing here checks that.
        scaler: Optional scaler already fitted on the training features. When
            given, it is only applied (``transform``), so the test features
            are scaled exactly like the training ones. When omitted, a new
            ``StandardScaler`` is fitted on the test features, which is the
            previous behaviour and puts the two sets on different scales.

    Returns:
        ``(x_data, y_data)``: the scaled feature matrix and the ``Survived``
        column of `label_data`.
    """
    x_data = _prepare_features(test_data)
    y_data = label_data['Survived']

    if scaler is None:
        x_data = StandardScaler().fit_transform(x_data)
    else:
        x_data = scaler.transform(x_data)
    return x_data, y_data


if __name__ == '__main__':
    # Read the training set, the test set and the labels used for scoring.
    train_data = pd.read_csv('data/train.csv')
    test_data = pd.read_csv('data/test.csv')
    # NOTE(review): in the Kaggle Titanic competition gender_submission.csv is
    # the sample submission, not ground truth. If that is the file used here,
    # the scores below measure agreement with that baseline. Please confirm.
    real_label_data = pd.read_csv('data/gender_submission.csv')

    # Preprocess both sets with one scaler: fitted on the training features,
    # then only applied to the test features.
    scaler = StandardScaler()
    x_train, y_train = deal_train(train_data, scaler=scaler)
    x_test, y_test = deal_test(test_data, real_label_data, scaler=scaler)

    # Build the model: bagging over a small random forest.
    rf = RandomForestClassifier(n_estimators=10, max_depth=3, min_samples_split=4)
    bagging = BaggingClassifier(rf, n_estimators=12)
    bagging.fit(x_train, y_train)

    # Predict.
    prediction = bagging.predict(x_test)

    # Evaluate. classification_report expects (y_true, y_pred); passing them
    # the other way round swaps precision and recall in the printed table.
    print(bagging.score(x_test, y_test))
    print(classification_report(y_test, prediction))

    # Save the predictions as a CSV file.
    submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": prediction
    })
    submission.to_csv('predict.csv', index=False)
- 结果
0.9282296650717703
precision recall f1-score support
0 0.99 0.91 0.95 290
1 0.82 0.98 0.89 128
accuracy 0.93 418
macro avg 0.91 0.94 0.92 418
weighted avg 0.94 0.93 0.93 418
Process finished with exit code 0
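- 在迭代测试(反复调整参数重新运行)时发现,最好的一次 score 能达到 0.98 以上
- 注意:这里的 score 是拿预测结果与 gender_submission.csv 比较得到的,而该文件只是 Kaggle 提供的提交示例(按性别给出的基准预测),并不是测试集的真实标签,所以它不等于提交后的真实得分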
提交
- 有满意的效果就提交看看
- 注意提交的格式
- 直接把生成的csv预测结果文件拖到提交页面
- 然后平台就会给出分数和排名
总结
- 要想让上传的预测结果拿到高分,可以通过不断迭代,找到接近最优的参数
- 也可以使用更加好的算法和模型拿高分