将值为0的数据替换为空值
data = data.replace(0.0000, np.nan)
统计某一列空值的数量
data['one_column'].isnull().sum()
统计缺失值大于某一阈值的列的名字
data_null = []
for data_col in data.columns.values.tolist():
if data[data_col].isnull().sum() >= 16:
data_null.append(data_col)
print(data_col, data[data_col].isnull().sum())
data_null
删除某一列或批量删除某几列
data = data.drop('one_column',axis=1)
data = data.drop(data_null,axis=1)
用均值填充缺失值
avg = data[null_colums].mean()
data[null_colums] = data[null_colums].fillna(avg)
缺失值分析
def missing_values(df):
alldata_na = pd.DataFrame(df.isnull().sum(), columns={'missingNum'})
alldata_na['existNum'] = len(df) - alldata_na['missingNum']
alldata_na['sum'] = len(df)
alldata_na['missingRatio'] = alldata_na['missingNum']/len(df)*100
alldata_na['dtype'] = df.dtypes
#ascending:默认True升序排列;False降序排列
alldata_na = alldata_na[alldata_na['missingNum']>0].reset_index().sort_values(by=['missingNum','index'],ascending=[False,True])
alldata_na.set_index('index',inplace=True)
return alldata_na
missing_values(data_train)
是否有单调特征列(单调的特征列很大可能是时间)
#是否有单调特征列(单调的特征列很大可能是时间)
def incresing(vals):
cnt = 0
len_ = len(vals)
for i in range(len_-1):
if vals[i+1] > vals[i]:
cnt += 1
return cnt
fea_cols = [col for col in data_train.columns]
for col in fea_cols:
cnt = incresing(data_train[col].values)
if cnt / data_train.shape[0] >= 0.55:
print('单调特征:',col)
print('单调特征值个数:', cnt)
print('单调特征值比例:', cnt / data_train.shape[0])
特征nunique分布
# 特征nunique分布
for feature in categorical_feas:
print(feature + "的特征分布如下:")
print(data_train[feature].value_counts())
plt.hist(data_all[feature], bins=3)
plt.show()
统计特征值出现频次大于100的特征
# 统计特征值出现频次大于100的特征
for feature in categorical_feas:
df_value_counts = pd.DataFrame(data_train[feature].value_counts())
df_value_counts = df_value_counts.reset_index()
df_value_counts.columns = [feature, 'counts'] # change column names
print(df_value_counts[df_value_counts['counts'] >= 100])
Labe 分布
# Labe 分布
fig,axes = plt.subplots(2,3,figsize=(20,5))
fig.set_size_inches(20,12)
sns.distplot(data_train['tradeMoney'],ax=axes[0][0])
sns.distplot(data_train[(data_train['tradeMoney']<=20000)]['tradeMoney'],ax=axes[0][1])
sns.distplot(data_train[(data_train['tradeMoney']>20000)&(data_train['tradeMoney']<=50000)]['tradeMoney'],ax=axes[0][2])
sns.distplot(data_train[(data_train['tradeMoney']>50000)&(data_train['tradeMoney']<=100000)]['tradeMoney'],ax=axes[1][0])
sns.distplot(data_train[(data_train['tradeMoney']>100000)]['tradeMoney'],ax=axes[1][1])
转换object类型数据为LabelCode
# 转换object类型数据为LabelCode
columns = ['rentType','communityName','houseType', 'houseFloor', 'houseToward', 'houseDecoration', 'region', 'plate']
for feature in columns:
data[feature] = LabelEncoder().fit_transform(data[feature])
时间格式预处理
#将update_date从例如2019-02-20的str变为datetime格式,并提取处year、month、day
data["year"] = pd.to_datetime(data["update_date"]).dt.year
data["month"] = pd.to_datetime(data["update_date"]).dt.month
data["day"] = pd.to_datetime(data["update_date"]).dt.day
data['week'] = pd.to_datetime(data["update_date"]).dt.week
data['weekday'] = pd.to_datetime(data["update_date"]).dt.weekday
找出 year 中2019年以后的数据,并将其他数据删除
data = data[data["year"] >= 2019] #找出 year 中2019年以后的数据,并将其他数据删除
聚合时间数据
# 聚合时间数据
total_balance = data.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum().reset_index()
将时间字符串转换为10位时间戳
import time
#将时间字符串转换为10位时间戳,时间字符串默认为2017-10-01 13:37:04格式
def date_to_timestamp(date, format_string="%Y/%m/%d %H:%M:%S.%f."):
time_array = time.strptime(date, format_string)
time_stamp = int(time.mktime(time_array))
return time_stamp