import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
def draw(path, y_true, y_predict):
    """Plot true and predicted values against the index of *y_true* and save the figure.

    Args:
        path: Destination passed to ``plt.savefig``.
        y_true: Observed target values. If it is a pandas Series its index is
            used for the x axis; otherwise a default integer index is used.
        y_predict: Predicted values in the same row order as *y_true*.
            Any shape that flattens to one value per row is accepted.

    The inputs are not modified. (The previous version sorted *y_true* in
    place, which silently reordered the caller's series, and read the index
    from a module-level ``y_test`` instead of the *y_true* argument.)
    """
    index = pd.Series(y_true).index
    # Keep both series in one frame so a single sort keeps them row-aligned.
    frame = pd.DataFrame(
        {
            "true": np.asarray(y_true).ravel(),
            "predict": np.asarray(y_predict).ravel(),
        },
        index=index,
    ).sort_index()

    fig = plt.figure(figsize=(10, 6))
    try:
        plt.plot(frame.index, frame["true"], marker='o', markersize=1)
        plt.plot(frame.index, frame["predict"], marker='x', markersize=1)
        plt.savefig(path)
    finally:
        # Release the figure; otherwise every call leaves one open.
        plt.close(fig)
def getMetrics(y_true, y_predict):
    """Return MAE, RMSE and MAPE of *y_predict* measured against *y_true*.

    Args:
        y_true: Observed target values.
        y_predict: Predicted values, in the same order as *y_true*.

    Returns:
        dict with the keys ``'MAE'``, ``'RMSE'`` and ``'MAPE'``.

    The previous version ignored *y_true* and read a module-level ``y_test``.
    """
    # RMSE is taken as the square root of the MSE rather than through the
    # ``squared=False`` keyword, which newer scikit-learn releases no longer accept.
    mse = metrics.mean_squared_error(y_true, y_predict)
    return {
        'MAE': metrics.mean_absolute_error(y_true, y_predict),
        'RMSE': float(np.sqrt(mse)),
        'MAPE': metrics.mean_absolute_percentage_error(y_true, y_predict),
    }
def linear_regression(X_train, X_test, y_train):
    """Fit an ordinary least-squares model and predict on *X_test*.

    Args:
        X_train: Training features.
        X_test: Features to predict for.
        y_train: Training targets.

    Returns:
        The predictions of the fitted ``LinearRegression`` for *X_test*.
    """
    # The ``normalize`` keyword was dropped here: recent scikit-learn releases
    # reject it with a TypeError. NOTE(review): for plain least squares on
    # full-rank data, feature scaling should not change the predictions -
    # confirm if exact reproduction of old results matters.
    linreg = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1)
    linreg.fit(X_train, y_train)
    return linreg.predict(X_test)
def SVR_regression(X_train, X_test, y_train):
    """Fit a default-parameter ``SVR`` on the training data and predict on *X_test*.

    Args:
        X_train: Training features.
        X_test: Features to predict for.
        y_train: Training targets.

    Returns:
        The predictions of the fitted model for *X_test*.
    """
    # ``fit`` returns the estimator itself, so the calls can be chained.
    return SVR().fit(X_train, y_train).predict(X_test)
def decision_tree_regression(X_train, X_test, y_train):
    """Fit a default-parameter ``DecisionTreeRegressor`` and predict on *X_test*.

    Args:
        X_train: Training features.
        X_test: Features to predict for.
        y_train: Training targets.

    Returns:
        The predictions of the fitted tree for *X_test*.
    """
    # ``fit`` returns the estimator itself, so the calls can be chained.
    return DecisionTreeRegressor().fit(X_train, y_train).predict(X_test)
def random_forest(X_train, X_test, y_train):
    """Fit a default-parameter ``RandomForestRegressor`` and predict on *X_test*.

    Args:
        X_train: Training features.
        X_test: Features to predict for.
        y_train: Training targets.

    Returns:
        The predictions of the fitted forest for *X_test*.
    """
    # ``fit`` returns the estimator itself, so the calls can be chained.
    return RandomForestRegressor().fit(X_train, y_train).predict(X_test)
def LSTM_regression(X_train, X_test, y_train, y_test=None):
    """Train a single-layer LSTM on the training data and predict on *X_test*.

    Each row is fed to the network as a sequence of length one, i.e. the
    feature matrices are reshaped to ``(rows, 1, features)``.

    Args:
        X_train: Two-dimensional training features (DataFrame or array-like).
        X_test: Two-dimensional features to predict for.
        y_train: Training targets, one per row of *X_train*.
        y_test: Optional targets for *X_test*. When given they are passed to
            Keras as validation data; they do not influence the weight updates.

    Returns:
        A one-dimensional array with one prediction per row of *X_test*.

    The previous version referenced the undefined names ``train_y`` and
    ``test_y`` and a module-level ``y_test``; it also reshaped the inputs
    twice and created a scaler it never used.
    """
    train_X = np.asarray(X_train)
    test_X = np.asarray(X_test)
    # Keras LSTM layers expect (samples, timesteps, features); use one timestep.
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

    validation = None
    if y_test is not None:
        validation = (test_X, np.asarray(y_test))

    model = Sequential()
    model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    model.fit(
        train_X,
        np.asarray(y_train),
        epochs=100,
        batch_size=72,
        validation_data=validation,
        verbose=2,
        shuffle=False,
    )

    # The Dense(1) head yields one column; flatten to one value per test row.
    return model.predict(test_X).reshape(-1)
# Script entry point: for every (year, season) slice of one station's data,
# train each regressor to predict PM2.5 from the previous PM2.5 reading and
# the weather columns, save a plot per model, and write one CSV of metrics
# per model.
if __name__ == '__main__':
    import os  # needed only here, to create the plot output directory

    FileList = ['data/PRSA_Data_Aotizhongxin_20130301-20170228.csv', 'data/PRSA_Data_Changping_20130301-20170228.csv']
    rawData = pd.read_csv(FileList[0])
    rawData['season'] = (rawData['month'] - 1) // 3 + 1
    # Assemble the timestamp from the year/month/day/hour columns of the
    # loaded frame (the old code referenced ``data`` before it existed).
    rawData["Date"] = pd.to_datetime(rawData[["year", "month", "day", "hour"]])
    rawData.index = rawData["Date"]
    rawData.sort_index(inplace=True)
    rawData = rawData.groupby(['year', 'season'])

    models = {
        "linear_regression": linear_regression,
        "SVR_regression": SVR_regression,
        "decision_tree_regression": decision_tree_regression,
        "random_forest": random_forest,
        "LSTM_regression": LSTM_regression,
    }
    resultSet = {model_name: {} for model_name in models}

    # plt.savefig does not create missing directories.
    os.makedirs('result', exist_ok=True)

    for item in rawData:
        print('*' * 20, item[0], '*' * 20)
        path = str(item[0][0]) + '-sea' + str(item[0][1])
        data = pd.DataFrame(item[1][["year", "month", "day", "hour", "PM2.5", "TEMP", "PRES", "DEWP", "RAIN", "wd", "WSPM"]])
        # Work on a copy so the column assignments below do not hit a slice
        # of ``data``. ``wd`` is carried along so that it is filtered by the
        # same dropna as the numeric columns and stays row-aligned.
        dataset = data[["PM2.5", "TEMP", "PRES", "DEWP", "RAIN", "WSPM", "wd"]].copy()
        dataset["PM2.5(t-1)"] = data["PM2.5"].shift(1).values
        dataset.dropna(axis=0, how='any', inplace=True)
        # Encode the wind-direction labels as integers.
        encoder = LabelEncoder()
        dataset["WD"] = encoder.fit_transform(dataset["wd"])
        dataset.drop(columns="wd", inplace=True)
        # Normalize the data: shifted min-max scaling of every column.
        dataset = dataset.apply(lambda x: (x - np.min(x) + 1) / (np.max(x) - np.min(x) + 1))
        X = dataset[["PM2.5(t-1)", "TEMP", "PRES", "DEWP", "RAIN", "WD", "WSPM"]]
        y = dataset["PM2.5"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        for model_name, fit_predict in models.items():
            y_predict = fit_predict(X_train, X_test, y_train)
            # Score before plotting and hand draw() a copy, so the order of
            # y_test stays aligned with the predictions for every model.
            resultSet[model_name][path] = getMetrics(y_test, y_predict)
            draw('result/' + model_name + '-' + path, y_test.copy(), y_predict)
            plt.close('all')

    for key, result in resultSet.items():
        result = pd.DataFrame(result).T
        result.to_csv(key + ".csv")
        print(result)