# -*- coding: utf-8 -*-
"""
Credit scorecard walkthrough built with scorecardpy on the German credit data.

Pipeline: variable filtering -> train/test split -> WOE binning ->
bin adjustment -> WOE transformation -> logistic regression ->
performance evaluation -> scorecard scaling -> PSI stability check.

Created on Fri Oct 9 13:34:59 2020
@author: Administrator
"""
import scorecardpy as sc
from sklearn.linear_model import LogisticRegression

# Load the germancredit data set.
dat = sc.germancredit()

# 1. Variable filtering ------
# Screens variables by the given criteria (IV, missing rate, identical rate).
dt_s = sc.var_filter(
    dt=dat,
    y="creditability",
    x=None,
    iv_limit=0.02,
    missing_limit=0.95,
    identical_limit=0.95,
    var_rm=None,             # names of variables to force-remove
    var_kp=None,             # names of variables to force-keep
    return_rm_reason=False,  # whether to return why each variable was removed
    positive='bad|1',        # label of the bad samples
)

# 2. Train/test split ------
train, test = sc.split_df(
    dt=dt_s,
    y='creditability',
    ratio=0.7,   # 7:3 split between train and test
    seed=186,
).values()

# 3. WOE binning (returns a dict) ------
# The original script first called sc.woebin(dt_s, y="creditability") and then
# immediately overwrote the result with the fully spelled-out call below; the
# redundant first pass has been removed so binning only runs once.
bins = sc.woebin(
    dt=dt_s,                    # data
    y="creditability",          # target
    x=None,
    var_skip=None,              # variables that should not be binned
    breaks_list=None,           # custom break points, used when adjusting bins
    special_values=None,        # values that get a bin of their own
    stop_limit=0.1,             # stop splitting when the IV gain is below
                                # stop_limit or the chi-square is below
                                # qchisq(1 - stop_limit, 1)
    count_distr_limit=0.05,     # minimum share of samples per bin
    bin_num_limit=8,            # maximum number of bins
    positive="bad|1",
    no_cores=None,
    print_step=0,
    method="tree",              # binning method: "tree" or "chimerge"
    ignore_const_cols=True,     # whether to ignore constant columns
    ignore_datetime_cols=True,
    check_cate_num=True,        # check whether a categorical variable has
                                # more than 50 categories
    replace_blank=True,         # replace blank values with None
    save_breaks_list=None,
)

# Plot the binning result.
sc.woebin_plot(bins)

# 4. Bin adjustment ------
# NOTE(review): woebin_adj appears to be an interactive step that returns the
# adjusted break points - confirm before running this script unattended.
breaks_adj = sc.woebin_adj(dt_s, "creditability", bins)
bins_adj = sc.woebin(dt_s, y="creditability", breaks_list=breaks_adj)

# 5. WOE transformation ------
train_woe = sc.woebin_ply(train, bins_adj)
test_woe = sc.woebin_ply(test, bins_adj)

# 6. Model fitting ------
y_train = train_woe.loc[:, 'creditability']
X_train = train_woe.loc[:, train_woe.columns != 'creditability']
y_test = test_woe.loc[:, 'creditability']
# Build the mask from test_woe's own columns; the original used
# train_woe.columns here, which is only right when both frames happen to have
# identical column order and length.
X_test = test_woe.loc[:, test_woe.columns != 'creditability']

# Logistic regression with an L1 penalty.
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.fit(X_train, y_train)
# Inspect lr.coef_ / lr.intercept_ to see the fitted parameters.

# Predicted probability of the positive class (second predict_proba column).
train_pred = lr.predict_proba(X_train)[:, 1]
test_pred = lr.predict_proba(X_test)[:, 1]

# 7. Model evaluation ------
train_perf = sc.perf_eva(y_train, train_pred, title="train")
test_perf = sc.perf_eva(y_test, test_pred, title="test")

# 8. Score scaling ------
card = sc.scorecard(bins_adj, lr, X_train.columns)

# Credit scores for both samples.
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)

# 9. Score stability check (PSI) ------
sc.perf_psi(
    score={'train': train_score, 'test': test_score},
    label={'train': y_train, 'test': y_test},
)