Data Visualization - part

Distribution of a continuous variable

A thin, dense bar chart works well here: one bar for each distinct value.

import matplotlib.pyplot as plt
import pandas as pd

rawdata = pd.read_csv(r"..\Data\train_set.csv")
X = rawdata.iloc[:, 0:-1]

def plot_attr(dataset, attr):
    # count occurrences of each value, then sort by the value itself
    data = dataset[attr].value_counts()
    data.sort_index(inplace=True)
    plt.bar(x=data.index, height=data.values, color="#5d8ca8")
    plt.xlabel(attr)
    plt.ylabel("counts")
    plt.show()

plot_attr(X, "age")
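
When the variable takes very many distinct values, one bar per value becomes unreadable; a minimal alternative sketch using plt.hist with explicit binning (the plot_attr_hist helper and the bin count of 50 are illustrative choices, not from the original post):

def plot_attr_hist(dataset, attr, bins=50):
    # group nearby values into bins instead of drawing one bar per value
    plt.hist(dataset[attr], bins=bins, color="#5d8ca8")
    plt.xlabel(attr)
    plt.ylabel("counts")
    plt.show()

plot_attr_hist(X, "age")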


Correlation between attributes

A heatmap is the first choice. Here is a heatmap that also includes y:

from scipy.stats import pearsonr
import seaborn
import matplotlib.pyplot as plt
import pandas as pd

def draw_heatmap(dataset):
    # the last column is assumed to be the label y; the rest are features
    xlen = len(dataset.columns) - 1
    df = pd.DataFrame()
    # build one heatmap column per dataset column (all features plus y)
    for i in range(xlen + 1):
        corrs = []
        # Pearson correlation of every feature with column i
        for j in range(xlen):
            corr, p_value = pearsonr(dataset.iloc[:, j], dataset.iloc[:, i])
            corrs.append(corr)
        df_corr = pd.DataFrame({dataset.columns[i]: corrs})
        df = pd.concat([df, df_corr], axis=1)
    df.index = dataset.columns[:-1]
    # take absolute values so the colour scale reflects correlation strength only
    df = df.abs()
    seaborn.heatmap(df, cmap="Reds", linewidths=1)
    plt.show()

draw_heatmap(rawdata)
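
For a quicker equivalent, pandas' built-in DataFrame.corr (Pearson by default) can replace the manual loops; a minimal sketch, assuming every column of rawdata is numeric:

corr = rawdata.corr().abs().iloc[:-1, :]   # rows: features only, columns: features plus y
seaborn.heatmap(corr, cmap="Reds", linewidths=1)
plt.show()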


ROC curve

This example is adapted from the official scikit-learn documentation; the model is logistic regression.

from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import plot_roc_curve, auc
from sklearn.model_selection import StratifiedKFold

lr = LogisticRegression(max_iter=1000)

def plot_roc(tprs, aucs, ax, pic_name):

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
    ax.legend(loc="lower right")
    plt.savefig("../output/"+pic_name+".png")
    plt.show()

def train_lr(estimator, X, Y, pic_name):
    kf = StratifiedKFold(n_splits=5)
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots()
    ax.set(title="ROC_Curve -- LogisticRegression")
    df = pd.DataFrame({"attr": X.columns})

    for i,(train, test) in enumerate(kf.split(X,Y)):
        estimator = estimator.fit(X.iloc[train], Y.iloc[train])
        df["importance{}".format(i+1)] = estimator.coef_[0]
        viz = plot_roc_curve(estimator, X.iloc[test], Y.iloc[test],
                             name='ROC fold {}'.format(i+1),
                             alpha=0.3, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    plot_roc(tprs, aucs, ax, pic_name)
    df["ave"] = df.iloc[:, 1:].mean(axis=1)
    df["sort_helper"] = df["ave"].abs()
    df = df.sort_values(by="sort_helper", ascending=False)
    return df

Y = rawdata.iloc[:, -1]   # label: the last column of the training set, as in the sections above
train_lr(lr, X, Y, "lr")

Cross-validation: the ROC curve of each fold is overlaid and the variance is shown as a band; finally the AUC of each fold is output in the legend and the figure is saved.
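
Note that plot_roc_curve was removed in newer scikit-learn releases (1.2+); RocCurveDisplay.from_estimator is the replacement and exposes the same fpr, tpr and roc_auc attributes. A minimal sketch of the changed call inside the cross-validation loop:

from sklearn.metrics import RocCurveDisplay

# inside the fold loop, replacing the plot_roc_curve call:
viz = RocCurveDisplay.from_estimator(estimator, X.iloc[test], Y.iloc[test],
                                     name='ROC fold {}'.format(i + 1),
                                     alpha=0.3, lw=1, ax=ax)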


Honestly, the charts Excel draws look very nice. Unless the dataset is large enough to need Python, just use Excel.
