# Notebook-style pipeline: load the merged order data, drop incomplete rows,
# one-hot encode two categorical columns, oversample with ADASYN, and evaluate
# a set of classifiers through train_model.

import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb

# Evaluation metrics
from sklearn import metrics
# Ensemble models
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
# Baseline models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    roc_auc_score,
)
# Model selection and preprocessing helpers
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# `%matplotlib inline` is IPython-only syntax; issue it through the IPython API
# when a shell is present so the file is still valid Python as a plain script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Not running under IPython/Jupyter: there is no inline backend to enable.
    pass

data = pd.read_csv(
    r"E:\Excersise\ML\Trip\order_train_merage.csv",
    parse_dates=["orderdate", "arrival", "etd"],
)
data.head()

# Per column: number of missing values and their share of the column.
data.apply(lambda x: [x.isnull().sum(), x.isnull().sum() / x.size], axis=0)

# Rows containing any missing value are removed in place.
data.dropna(inplace=True)
data.label.value_counts()
data.duplicated().sum()
data.describe(include="object")

# One-hot encode the two categorical columns and append the indicator columns.
dummies = pd.get_dummies(data.hotelbelongto, prefix='hotelbelongto')
dummies_1 = pd.get_dummies(data.supplierchannel, prefix='supplierchannel')
data = pd.concat([data, dummies, dummies_1], axis=1)
data.head()

# ADASYN adaptive oversampling.
from imblearn.over_sampling import ADASYN

sample = ADASYN()
# The features and the label are passed to fit_resample as plain arrays.
# NOTE(review): the feature matrix still contains the original `hotelbelongto`
# and `supplierchannel` columns and the parsed date columns; if any of them is
# non-numeric, fit_resample will presumably reject it - confirm and drop or
# convert them before resampling.
X_resampled, y_resampled = sample.fit_resample(
    data.loc[:, data.columns != "label"].values,
    data.label.values,
)

# Candidate classifiers keyed by a short display name.
model_name_param_dict = {
    'LR': LogisticRegression(),
    'DT': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'GBDT': GradientBoostingClassifier(),
    'RF': RandomForestClassifier(),
    # Only `import xgboost as xgb` is in scope, so the class must be reached
    # through the module; a bare `XGBClassifier()` raises NameError.
    'XGBoost': xgb.XGBClassifier(),
}

# NOTE(review): X_train, y_train, X_test, y_test and train_model are not
# defined anywhere in the visible script, and X_resampled / y_resampled are
# never used. Presumably a train/test split and the train_model definition
# live in a part of the notebook that is not shown - confirm. If the split is
# meant to be derived from the resampled arrays, consider splitting before
# oversampling so synthetic samples cannot leak into the test set.
result = {}
for model_name, model in model_name_param_dict.items():
    result[model_name] = train_model(
        X_train, y_train, X_test, y_test, model, model_name
    )