数据预处理

将值为0的数据替换为空值

data = data.replace(0.0000, np.nan)

统计某一列空值的数量

data['one_column'].isnull().sum()

统计缺失值大于某一阈值的列的名字

data_null = []
for data_col in data.columns.values.tolist():
    if data[data_col].isnull().sum() >= 16:
        data_null.append(data_col)
        print(data_col, data[data_col].isnull().sum())
data_null

删除某一列或批量删除某几列

data = data.drop('one_column',axis=1)
data = data.drop(data_null,axis=1)

用均值填充缺失值

avg = data[null_colums].mean()
data[null_colums] = data[null_colums].fillna(avg)

缺失值分析

def missing_values(df):
    alldata_na = pd.DataFrame(df.isnull().sum(), columns={'missingNum'})
    alldata_na['existNum'] = len(df) - alldata_na['missingNum']
    alldata_na['sum'] = len(df)
    alldata_na['missingRatio'] = alldata_na['missingNum']/len(df)*100
    alldata_na['dtype'] = df.dtypes
    #ascending:默认True升序排列;False降序排列
    alldata_na = alldata_na[alldata_na['missingNum']>0].reset_index().sort_values(by=['missingNum','index'],ascending=[False,True])
    alldata_na.set_index('index',inplace=True)
    return alldata_na

missing_values(data_train)

是否有单调特征列(单调的特征列很大可能是时间)

#是否有单调特征列(单调的特征列很大可能是时间)
def incresing(vals):
    cnt = 0
    len_ = len(vals)
    for i in range(len_-1):
        if vals[i+1] > vals[i]:
            cnt += 1
    return cnt

fea_cols = [col for col in data_train.columns]
for col in fea_cols:
    cnt = incresing(data_train[col].values)
    if cnt / data_train.shape[0] >= 0.55:
        print('单调特征:',col)
        print('单调特征值个数:', cnt)
        print('单调特征值比例:', cnt / data_train.shape[0])

特征nunique分布

# 特征nunique分布
for feature in categorical_feas:
    print(feature + "的特征分布如下:")
    print(data_train[feature].value_counts())
    plt.hist(data_all[feature], bins=3)
    plt.show()

统计特征值出现频次大于100的特征

# 统计特征值出现频次大于100的特征
for feature in categorical_feas:
    df_value_counts = pd.DataFrame(data_train[feature].value_counts())
    df_value_counts = df_value_counts.reset_index()
    df_value_counts.columns = [feature, 'counts'] # change column names
    print(df_value_counts[df_value_counts['counts'] >= 100])

Labe 分布

# Labe 分布
fig,axes = plt.subplots(2,3,figsize=(20,5))
fig.set_size_inches(20,12)
sns.distplot(data_train['tradeMoney'],ax=axes[0][0])
sns.distplot(data_train[(data_train['tradeMoney']<=20000)]['tradeMoney'],ax=axes[0][1])
sns.distplot(data_train[(data_train['tradeMoney']>20000)&(data_train['tradeMoney']<=50000)]['tradeMoney'],ax=axes[0][2])
sns.distplot(data_train[(data_train['tradeMoney']>50000)&(data_train['tradeMoney']<=100000)]['tradeMoney'],ax=axes[1][0])
sns.distplot(data_train[(data_train['tradeMoney']>100000)]['tradeMoney'],ax=axes[1][1])

转换object类型数据为LabelCode

# 转换object类型数据为LabelCode
columns = ['rentType','communityName','houseType', 'houseFloor', 'houseToward', 'houseDecoration',  'region', 'plate']
for feature in columns:
    data[feature] = LabelEncoder().fit_transform(data[feature])

时间格式预处理

#将update_date从例如2019-02-20的str变为datetime格式,并提取处year、month、day
data["year"] = pd.to_datetime(data["update_date"]).dt.year 
data["month"] = pd.to_datetime(data["update_date"]).dt.month
data["day"] = pd.to_datetime(data["update_date"]).dt.day
data['week'] = pd.to_datetime(data["update_date"]).dt.week
data['weekday'] = pd.to_datetime(data["update_date"]).dt.weekday

找出 year 中2019年以后的数据,并将其他数据删除

data = data[data["year"] >= 2019] #找出 year 中2019年以后的数据,并将其他数据删除

聚合时间数据

# 聚合时间数据
total_balance = data.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum().reset_index()

将时间字符串转换为10位时间戳

import time
#将时间字符串转换为10位时间戳,时间字符串默认为2017-10-01 13:37:04格式
def date_to_timestamp(date, format_string="%Y/%m/%d %H:%M:%S.%f."):
    time_array = time.strptime(date, format_string)
    time_stamp = int(time.mktime(time_array))
    return time_stamp
上一篇:我的 9 年开源之路:395 Patch、20+Feature,背后只有努力与热爱


下一篇:Mac 安装 Homebrew