Python 实现LogisticRegression小Demo

2024-03-22 11:52:04

1. 加载数据（从本地tsv文件导入，格式与csv类似，分隔符为制表符）

# -*- coding: utf-8 -*-

import numpy as np
import sys
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load the tab-separated input file from disk.
print("载入数据")
raw_frame = pd.read_csv('data.tsv', sep='\t')
# Keep four decimal places.
dataset = raw_frame.round(decimals=4)
# Show the dtype of every column as a quick sanity check.
print(dataset.dtypes)

# Columns used as model inputs, in the order they are passed to the model.
feature_columns = [
    'gender', '7d_open_dt', '7d_open_cnt', 'today_active', 'is_new_user',
    'frd_cnt_f', 'agree_rate_f', 'agreed_rate_f', '7d_show_cnt',
]
# Column used as the prediction target.
label_column = 'is_agree'

X = dataset[feature_columns].values
y = dataset[label_column].values

2. 训练集测试集划分与数据标准化

使用训练集训练一个scaler,然后对测试集进行转化

# Hold out 30% of the rows as the test split; the split is stratified on y
# and seeded so it is reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Feature scaling: fit the scaler on the training split only, then apply
# the same fitted transform to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Print the fitted scaler statistics in order:
# scale, mean, variance, number of samples seen.
for fitted_stat in (sc.scale_, sc.mean_, sc.var_, sc.n_samples_seen_):
    print(fitted_stat)

3. 模型训练

# Train a logistic-regression classifier on the scaled training split.
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
# Report the learned weights and bias term.
coefficients, intercept = lr.coef_, lr.intercept_
print("Coefficients:%s, intercept %s" % (coefficients, intercept))

4. 模型预测与结果处理

4.1 predict_proba() 模型预测概率

# Predicted probabilities from the fitted model.
# predict_proba() returns one column per class; column index 1 (the second
# column) is taken as the positive-class probability.
# NOTE(review): this assumes the labels are 0/1 so that lr.classes_[1] is
# the positive class -- confirm against the data.
y_pred_prob = lr.predict_proba(X_test)[:, 1]
threshold = 0.9

# Apply the custom threshold into a separate array so the probabilities
# themselves are left intact (the previous in-place loop overwrote them
# with 0/1 values).
y_pred_label = np.where(y_pred_prob >= threshold, 1, 0)

# Column names (including the 'lable' spelling) are kept as-is because they
# define the schema of the exported file.
result = pd.DataFrame()
result['lable'] = y_test              # true label
result['pred_prob'] = y_pred_prob     # predicted probability
result['lable_pred'] = y_pred_label   # predicted label at `threshold`
print(result.head())

# Sort by predicted probability, highest first. `result` has no 'uid'
# column, so sorting by ['uid', 'pred_prob'] raised KeyError. To group by
# user, a uid column would first have to be carried through the split.
result = result.sort_values(by='pred_prob', ascending=False)

# Write the table to a tab-separated file without the index column.
result.to_csv("./output.csv", sep='\t', index=False, encoding='gbk')

4.2 predict() 模型预测标签（默认阈值0.5）

# Predict class labels for the test split with the fitted model.
# predict() returns hard labels using the model's own decision rule.
y_pred = lr.predict(X_test)

5. 模型评测

# Confusion matrix of true vs. predicted labels on the test split.
# `metrics` is already imported at the top of the file, so no extra import
# is needed here.
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

# ROC AUC is a ranking metric and must be computed from continuous scores.
# Passing hard 0/1 predictions collapses the ROC curve to one operating
# point and misreports the AUC, so fresh probabilities are used here.
# NOTE(review): column index 1 is assumed to be the positive class (0/1
# labels) -- confirm against lr.classes_.
y_score = lr.predict_proba(X_test)[:, 1]

print('准确率Accuracy(Test): %.4g' % metrics.accuracy_score(y_test, y_pred))
print('AUC Score (Test): %f' % metrics.roc_auc_score(y_test, y_score))
print('精确率Precision(Test):', metrics.precision_score(y_test, y_pred))
print('召回率Recall(Test):', metrics.recall_score(y_test, y_pred))

码农公寓

相关文章