1. 加载数据(以tsv格式,从本地文件导入)
# -*- coding: utf-8 -*-
import numpy as np
import sys
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
print("载入数据")
# A .tsv file is read the same way as a .csv file; only the separator ('\t') differs.
raw_frame = pd.read_csv('data.tsv', sep='\t')
# round() controls how many decimal places are kept in the numeric columns.
dataset = raw_frame.round(decimals=4)
# Show every column together with its dtype.
print(dataset.dtypes)
# Columns used as model inputs.
feature_columns = [
    'gender',
    '7d_open_dt',
    '7d_open_cnt',
    'today_active',
    'is_new_user',
    'frd_cnt_f',
    'agree_rate_f',
    'agreed_rate_f',
    '7d_show_cnt',
]
# Feature matrix and target vector as plain NumPy arrays.
X = dataset[feature_columns].values
y = dataset['is_agree'].values
2. 训练集测试集划分与数据归一化
使用训练集训练一个scaler,然后对测试集进行转化
# Hold out 30% of the samples for testing; stratify on y so both parts keep
# the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)
# Feature scaling: learn the statistics on the training set only, then apply
# the same transformation to both the training and the test set.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
# Print the fitted scaler state, in order: per-feature scale, mean, variance,
# and the number of samples seen during fitting.
for fitted_stat in (sc.scale_, sc.mean_, sc.var_, sc.n_samples_seen_):
    print(fitted_stat)
3. 模型训练
# Train a logistic-regression classifier on the (scaled) training set.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
# Report the learned weights and bias term.
coefficients = lr.coef_
intercept = lr.intercept_
print("Coefficients:%s, intercept %s" % (coefficients, intercept))
4. 模型预测与结果处理
4.1 predict_proba() 模型预测概率
# Predicted probabilities on the test set.
# predict_proba() returns one column per class, ordered as lr.classes_.
# Index 1 selects the SECOND column, which is the positive class when the
# labels are 0/1 (NOTE(review): confirm 'is_agree' is encoded as 0/1).
y_pred_prob = lr.predict_proba(X_test)[:, 1]

# Decision threshold applied to the positive-class probability.
threshold = 0.9
# Threshold into a separate array so y_pred_prob keeps the raw probabilities
# instead of being overwritten in place. Labels stay float (1.0 / 0.0) so the
# printed and written output keeps its previous format.
y_pred_label = np.where(y_pred_prob >= threshold, 1.0, 0.0)

result = pd.DataFrame({
    'lable': y_test,             # true label (column name kept unchanged for existing consumers)
    'pred_prob': y_pred_prob,    # predicted probability of the positive class
    'lable_pred': y_pred_label,  # predicted label at `threshold`
})
print(result.head())

# Sort by predicted probability, highest first.
# The previous sort key also listed 'uid', but `result` has no such column
# (only the three columns built above), so sort_values raised KeyError.
# To group by user, a uid column would have to be carried through the
# train/test split and added to `result` first.
result = result.sort_values(by='pred_prob', ascending=False)

# Write the table as a tab-separated file without the index column.
result.to_csv("./output.csv", sep='\t', index=False, encoding='gbk')
4.2 predict() 模型预测标签(默认阈值0.5)
# Predict hard class labels (not probabilities) for the scaled test set.
# These labels are what the evaluation metrics further down are computed from.
y_pred = lr.predict(X_test)
5. 模型评测
# Confusion matrix of true vs. predicted labels on the test set.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('准确率Accuracy(Test): %.4g' % metrics.accuracy_score(y_test, y_pred))
# ROC AUC measures ranking quality, so it must be scored on continuous
# scores rather than thresholded labels. Passing hard labels collapses the
# ROC curve to a single operating point and misreports the AUC.
# The positive-class probabilities are computed here directly from the model
# so this section does not depend on variables set earlier in the script.
y_score = lr.predict_proba(X_test)[:, 1]
print('AUC Score (Test): %f' % metrics.roc_auc_score(y_test, y_score))
# Precision and recall are threshold-dependent, so they use the hard labels.
print('精确率Precision(Test):', metrics.precision_score(y_test, y_pred))
print('召回率Recall(Test):', metrics.recall_score(y_test, y_pred))