山东数据大赛-供水管网压力预测-排名106-6.45分

任务
通过某新区供水管网的历史压力数据、天气数据和供水管网互通图,预测未来某时间点的压力数据。

数据
主办方提供某新区供水管网数据,数据划分如下:
训练集:2018至2019年的30个压力监测点近两年的压力数据、2018年至2019年的天气数据,以及标明了30个压力监测点位置的供水管网互通图。
测试集:以下4段时间的每小时的压力数据、每天的天气数据,需要分别去预测对应日期每小时的压力数据。
山东数据大赛-供水管网压力预测-排名106-6.45分
具体数据字段描述如下:

(1)压力数据
山东数据大赛-供水管网压力预测-排名106-6.45分
(2)气象数据
山东数据大赛-供水管网压力预测-排名106-6.45分
山东数据大赛-供水管网压力预测-排名106-6.45分

总体思路如下:

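  1. 把原本作为列名的小时(H0–H23)转换为 Hour 字段,使每一行对应一个监测点在某一天某一小时的压力值
  2. 处理异常数据:将 -99999 以及小于 0.1 或大于 0.5 的压力值视为异常并剔除
  3. 从日期和时间中提取特征(日 Day、小时 Hour)
  4. 按月份划分训练集和测试集:用 2020 年某个月的数据训练,预测下一个月中的目标时间段
  5. 最终使用监测点编号(MeasName)和日期、小时特征去预测每个监测点的压力值
  6. 使用 LightGBM(lgb)进行预测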

代码如下:

import gc

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Load every input table with the pure-Python CSV parser, in a fixed order:
# two years of training pressure data, the 2020 pressure data, the rows to
# predict, and the submission template.
train2018, train2019, test2020, test, submit = (
    pd.read_csv(csv_name, engine='python')
    for csv_name in (
        'train_水压数据_2018.csv',
        'train_水压数据_2019.csv',
        'test_水压数据_2020.csv',
        'to_predict.csv',
        'submit.csv',
    )
)

'''
通过某新区供水管网的历史压力数据、天气数据和供水管网互通图,预测未来某时间点的压力数据。
训练集:2018至2019年的30个压力监测点近两年的压力数据、2018年至2019年的天气数据,以及标明了30个压力监测点位置的供水管网互通图。
测试集:以下4段时间的每小时的压力数据、每天的天气数据,需要分别去预测对应日期每小时的压力数据。
相邻的管道结合起来建模
'''


# Turn the per-hour columns (H0..H23) into a single "Hour" field so that each
# row holds one reading for one station, one day and one hour.
def reshape_data(df1):
    """Convert a wide table with columns H0..H23 into a long table.

    Args:
        df1: DataFrame with "Time", "MeasName" and the 24 columns "H0".."H23".

    Returns:
        A new DataFrame with columns "Time", "MeasName", "Hour" (the original
        column label, e.g. "H5") and "pressure", sorted by "Time" then
        "MeasName" and carrying a fresh 0..n-1 index.
    """
    day_values = df1["Time"].values
    station_values = df1["MeasName"].values

    # One long-format frame per hour column; the scalar hour label is
    # broadcast down the whole frame.
    hourly_frames = [
        pd.DataFrame({
            "Time": day_values,
            "MeasName": station_values,
            "Hour": hour_label,
            "pressure": df1[hour_label].values,
        })
        for hour_label in ('H' + str(hour_no) for hour_no in range(24))
    ]

    stacked = pd.concat(hourly_frames)
    return stacked.sort_values(by=['Time', 'MeasName']).reset_index(drop=True)


# Reshape the three pressure tables from wide (H0..H23) to long format.
train2018, train2019, test2020 = (
    reshape_data(wide_table) for wide_table in (train2018, train2019, test2020)
)

# Add a parsed datetime copy of the "Time" column to every table that is
# later filtered by date range.
for _dated_table in (train2018, train2019, test2020, test):
    _dated_table['Time_time'] = pd.to_datetime(_dated_table['Time'])


def abnormal(df):
    """Blank out implausible pressure readings and drop incomplete rows.

    A reading is treated as abnormal when it is below 0.1 or above 0.5. The
    -99999 missing-value sentinel is far below 0.1, so the range check covers
    it without a separate pass.

    The mask is applied positionally in one vectorised assignment. This avoids
    a slow Python-level loop over row labels, and it also avoids blanking
    unrelated rows that merely share an index label with an abnormal row when
    the index is not unique.

    Args:
        df: DataFrame with a numeric 'pressure' column. Its 'pressure' column
            is modified in place (abnormal readings become NaN).

    Returns:
        A new DataFrame without any row that contains a NaN in any column.
    """
    pressure = df['pressure']
    # Plain boolean array -> positional selection, independent of index labels.
    out_of_range = ((pressure < 0.1) | (pressure > 0.5)).values
    df.loc[out_of_range, 'pressure'] = np.nan
    return df.dropna()


# Remove abnormal pressure readings from every table that carries measurements.
train2018, train2019, test2020 = (
    abnormal(measured) for measured in (train2018, train2019, test2020)
)


def feature1(df):
    """Derive integer features from the string columns, in place.

    Adds 'Day' (the integer after the last '-' of 'Time') and overwrites
    'Hour' ('H7' -> 7) and 'MeasName' ('站点12' -> 12) with integers.

    Args:
        df: DataFrame with string columns 'Time', 'Hour' and 'MeasName'.

    Returns:
        The same DataFrame object that was passed in.
    """
    def day_of_month(stamp):
        return int(stamp.split('-')[-1])

    def hour_number(label):
        return int(label.replace('H', ''))

    def station_number(name):
        return int(name.replace('站点', ''))

    # (target column, source column, parser), applied in this order.
    conversions = (
        ('Day', 'Time', day_of_month),
        ('Hour', 'Hour', hour_number),
        ('MeasName', 'MeasName', station_number),
    )
    for target, source, parse in conversions:
        df[target] = df[source].apply(parse)
    return df


# Derive the integer Day / Hour / MeasName features on every table,
# including the rows to predict.
train2018, train2019, test2020, test = (
    feature1(table) for table in (train2018, train2019, test2020, test)
)

# ---- Segment 1: train on 2020-01, predict 2020-02-03 .. 2020-02-16 ----
# Month-over-month shift seen in 2019: mean pressure in February minus mean
# pressure in January (both windows end on the 28th). It is added to the
# model output below as a constant correction.
train2019Mon2 = train2019[(train2019['Time_time'] >= '2019-2-1') & (train2019['Time_time'] <= '2019-2-28')]
train2019Mon1 = train2019[(train2019['Time_time'] >= '2019-1-1') & (train2019['Time_time'] <= '2019-1-28')]
Mon_2_1_2019 = train2019Mon2['pressure'].mean() - train2019Mon1['pressure'].mean()

train1 = test2020[(test2020['Time_time'] >= '2020-1-1') & (test2020['Time_time'] <= '2020-1-31')]
# .copy(): this frame receives a new 'pressure' column below, so it must be an
# independent object rather than a filtered slice of `test`.
test1 = test[(test['Time_time'] >= '2020-2-3') & (test['Time_time'] <= '2020-2-16')].copy()

# Every column except the identifier, the target and the raw/parsed date.
used_feat = [f for f in train1.columns if f not in ['id', 'pressure', 'Time', 'Time_time']]
print('feat nums ', len(used_feat), used_feat)

train_x = train1[used_feat]
train_y = train1['pressure']
test_x = test1[used_feat]
print(train_x.shape, test_x.shape)

scores = []

# LightGBM settings shared by all four segments (L1 objective, MAE metric).
params = {'learning_rate': 0.05,
          'boosting_type': 'gbdt',
          'objective': 'regression_l1',
          'metric': 'mae',
          'min_child_samples': 46,
          'min_child_weight': 0.01,
          'feature_fraction': 0.8,
          'bagging_fraction': 0.8,
          'bagging_freq': 2,
          'num_leaves': 26,
          'max_depth': 9,
          'seed': 2019,
          'verbosity': -1,
          }

# Out-of-fold and test predictions are averaged over folds * seeds models.
oof_train = np.zeros(len(train_x))
preds = np.zeros(len(test_x))
folds = 5
seeds = [2048, 1997]
for seed in seeds:
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        print('fold ', fold + 1)
        x_trn, y_trn = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect the
        # equivalent callbacks instead -- confirm against the installed version.
        model = lgb.train(params, train_set, num_boost_round=5000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=90,
                          verbose_eval=50)
        # Each seed contributes 1/len(seeds) of the out-of-fold prediction.
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()

# Score only after every seed has contributed: inside the seed loop the
# out-of-fold array is still a partial sum and its MSE is meaningless.
mse = mean_squared_error(train_y, oof_train)

print('-' * 120)
print('mse ', round(mse, 5))

test1_pre = preds + Mon_2_1_2019
test1['pressure'] = test1_pre

# ---- Segment 2: train on 2020-03, predict 2020-04-06 .. 2020-04-19 ----
# Month-over-month shift seen in 2019: mean pressure in April minus mean
# pressure in March (both windows end on the 30th).
train2019Mon4 = train2019[(train2019['Time_time'] >= '2019-4-1') & (train2019['Time_time'] <= '2019-4-30')]
train2019Mon3 = train2019[(train2019['Time_time'] >= '2019-3-1') & (train2019['Time_time'] <= '2019-3-30')]
Mon_4_3_2019 = train2019Mon4['pressure'].mean() - train2019Mon3['pressure'].mean()

train2 = test2020[(test2020['Time_time'] >= '2020-3-1') & (test2020['Time_time'] <= '2020-3-31')]
# .copy(): this frame receives a new 'pressure' column below, so it must be an
# independent object rather than a filtered slice of `test`.
test2 = test[(test['Time_time'] >= '2020-4-6') & (test['Time_time'] <= '2020-4-19')].copy()

# Every column except the identifier, the target and the raw/parsed date.
used_feat = [f for f in train2.columns if f not in ['id', 'pressure', 'Time', 'Time_time']]
print('feat nums ', len(used_feat), used_feat)

train_x = train2[used_feat]
train_y = train2['pressure']
test_x = test2[used_feat]
print(train_x.shape, test_x.shape)

# Out-of-fold and test predictions are averaged over folds * seeds models.
oof_train = np.zeros(len(train_x))
preds = np.zeros(len(test_x))
folds = 5
seeds = [2048, 1997]

for seed in seeds:
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        print('fold ', fold + 1)
        x_trn, y_trn = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect the
        # equivalent callbacks instead -- confirm against the installed version.
        model = lgb.train(params, train_set, num_boost_round=5000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=90,
                          verbose_eval=50)
        # Each seed contributes 1/len(seeds) of the out-of-fold prediction.
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()

# Score only after every seed has contributed: inside the seed loop the
# out-of-fold array is still a partial sum and its MSE is meaningless.
mse = mean_squared_error(train_y, oof_train)

print('-' * 120)
print('mse ', round(mse, 5))

# Apply this segment's own April-vs-March shift (previously the February-vs-
# January shift was applied here while Mon_4_3_2019 was computed but unused).
test2_pre = preds + Mon_4_3_2019
test2['pressure'] = test2_pre

# ---- Segment 3: train on 2020-05, predict 2020-06-01 .. 2020-06-14 ----
# Month-over-month shift seen in 2019: mean pressure in June minus mean
# pressure in May (both windows end on the 30th).
train2019Mon6 = train2019[(train2019['Time_time'] >= '2019-6-1') & (train2019['Time_time'] <= '2019-6-30')]
train2019Mon5 = train2019[(train2019['Time_time'] >= '2019-5-1') & (train2019['Time_time'] <= '2019-5-30')]
Mon_6_5_2019 = train2019Mon6['pressure'].mean() - train2019Mon5['pressure'].mean()

train3 = test2020[(test2020['Time_time'] >= '2020-5-1') & (test2020['Time_time'] <= '2020-5-31')]
# .copy(): this frame receives a new 'pressure' column below, so it must be an
# independent object rather than a filtered slice of `test`.
test3 = test[(test['Time_time'] >= '2020-6-1') & (test['Time_time'] <= '2020-6-14')].copy()

# Every column except the identifier, the target and the raw/parsed date.
used_feat = [f for f in train3.columns if f not in ['id', 'pressure', 'Time', 'Time_time']]
print('feat nums ', len(used_feat), used_feat)

train_x = train3[used_feat]
train_y = train3['pressure']
test_x = test3[used_feat]
print(train_x.shape, test_x.shape)

# Out-of-fold and test predictions are averaged over folds * seeds models.
oof_train = np.zeros(len(train_x))
preds = np.zeros(len(test_x))
folds = 5
seeds = [2048, 1997]
for seed in seeds:
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        print('fold ', fold + 1)
        x_trn, y_trn = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect the
        # equivalent callbacks instead -- confirm against the installed version.
        model = lgb.train(params, train_set, num_boost_round=5000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=90,
                          verbose_eval=50)
        # Each seed contributes 1/len(seeds) of the out-of-fold prediction.
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()

# Score only after every seed has contributed: inside the seed loop the
# out-of-fold array is still a partial sum and its MSE is meaningless.
mse = mean_squared_error(train_y, oof_train)

print('-' * 120)
print('mse ', round(mse, 5))

# Apply this segment's own June-vs-May shift (previously the February-vs-
# January shift was applied here while Mon_6_5_2019 was computed but unused).
pre = preds + Mon_6_5_2019
test3['pressure'] = pre

# ---- Segment 4: train on 2020-08, predict 2020-09-07 .. 2020-09-20 ----
# Month-over-month shift seen in 2019: mean pressure in September minus mean
# pressure in August (both windows end on the 30th).
train2019Mon9 = train2019[(train2019['Time_time'] >= '2019-9-1') & (train2019['Time_time'] <= '2019-9-30')]
train2019Mon8 = train2019[(train2019['Time_time'] >= '2019-8-1') & (train2019['Time_time'] <= '2019-8-30')]
Mon_9_8_2019 = train2019Mon9['pressure'].mean() - train2019Mon8['pressure'].mean()

train4 = test2020[(test2020['Time_time'] >= '2020-8-1') & (test2020['Time_time'] <= '2020-8-31')]
# .copy(): this frame receives a new 'pressure' column below, so it must be an
# independent object rather than a filtered slice of `test`.
test4 = test[(test['Time_time'] >= '2020-9-7') & (test['Time_time'] <= '2020-9-20')].copy()

# Every column except the identifier, the target and the raw/parsed date.
used_feat = [f for f in train4.columns if f not in ['id', 'pressure', 'Time', 'Time_time']]
print('feat nums ', len(used_feat), used_feat)

train_x = train4[used_feat]
train_y = train4['pressure']
test_x = test4[used_feat]
print(train_x.shape, test_x.shape)

# Out-of-fold and test predictions are averaged over folds * seeds models.
oof_train = np.zeros(len(train_x))
preds = np.zeros(len(test_x))
folds = 5
seeds = [2048, 1997]
for seed in seeds:
    kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        print('fold ', fold + 1)
        x_trn, y_trn = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
        x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
        train_set = lgb.Dataset(x_trn, y_trn)
        val_set = lgb.Dataset(x_val, y_val)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect the
        # equivalent callbacks instead -- confirm against the installed version.
        model = lgb.train(params, train_set, num_boost_round=5000,
                          valid_sets=(train_set, val_set), early_stopping_rounds=90,
                          verbose_eval=50)
        # Each seed contributes 1/len(seeds) of the out-of-fold prediction.
        oof_train[val_idx] += model.predict(x_val) / len(seeds)
        preds += model.predict(test_x) / folds / len(seeds)
        del x_trn, y_trn, x_val, y_val, model, train_set, val_set
        gc.collect()

# Score only after every seed has contributed: inside the seed loop the
# out-of-fold array is still a partial sum and its MSE is meaningless.
mse = mean_squared_error(train_y, oof_train)

print('-' * 120)
print('mse ', round(mse, 5))

# Apply this segment's own September-vs-August shift (previously the
# February-vs-January shift was applied here while Mon_9_8_2019 was unused).
pre = preds + Mon_9_8_2019

# The two intermediate files are still written for anyone who consumes them,
# but they are no longer read back: test4 is an independent copy, so the
# predictions can be assigned to it directly.
np.savetxt('pre4.csv', pre)
test4.to_csv('test4.csv')
test4['pressure'] = pre

# Stitch the four predicted segments together and write the submission file.
test = pd.concat([test1, test2, test3, test4], axis=0)
test[['id', 'pressure']].to_csv('lgb_5000.csv', index=False)

上一篇:大数据和云计算技术周报(第106期)


下一篇:LeetCode 106~112题