# Source code:
# %%
'''
步骤:
1、读入数据集,将车费、经纬度进行清洗
(使用plt画散点图(省略))
2、用sklearn进行预测
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
# Load the first 1,000,000 rows of the training set.
# NOTE(review): this path has no "Desktop" component, whereas the test-set
# path used further down in this file does -- confirm which location is right.
train = pd.read_csv(r"C:\Users\Administrator\纽约出租车车费预测\train.csv", nrows=1000000)
train.head()
# Author's observation: the minimum fare is negative and the maxima of the
# coordinates and of the passenger count are implausibly large.
train.describe()
train.shape  # size of the raw data set

# Drop every row that contains at least one NaN.  `axis` must be passed by
# keyword: the positional form `any(1)` is rejected by pandas >= 2.0.
train.drop(train[train.isna().any(axis=1)].index, axis=0, inplace=True)
train.shape  # size after removing the NaN rows
# # Clean the passenger counts
train["passenger_count"].describe()
# Frequency of each passenger count, rarest first, to spot abnormal values.
train["passenger_count"].value_counts().sort_values(ascending=True)

# Rows reporting more than six passengers or exactly zero are discarded.
bad_passenger_rows = (train["passenger_count"] > 6) | (train["passenger_count"] == 0)
train.drop(train[bad_passenger_rows].index, axis=0, inplace=True)

train["passenger_count"].value_counts().sort_values(ascending=True)
# # Clean the coordinates
eps = 1e-7

# Display the trips whose pickup and drop-off coordinates differ by less than
# eps on both axes, i.e. the recorded position barely changes.
lon_shift = train["pickup_longitude"] - train["dropoff_longitude"]
lat_shift = train["pickup_latitude"] - train["dropoff_latitude"]
train[(lon_shift < eps) & (lon_shift > -eps) & (lat_shift < eps) & (lat_shift > -eps)]

# Compared with the describe() output, rows lying far from the column mean
# have to go: drop everything more than 10 units away from the mean.
# The mean is re-evaluated for each column on the rows that are still left.
# NOTE(review): assumes columns 3..6 are the four pickup/drop-off coordinate
# columns -- confirm against the CSV layout.
for name in train.columns[3:7]:
    centre = train[name].mean()
    far_from_centre = (train[name] < centre - 10) | (train[name] > centre + 10)
    train.drop(train[far_from_centre].index, axis=0, inplace=True)

train.describe()
# %% [markdown]
# # Clean the fares
# Frequency of every fare value, ordered by the fare itself.
train["fare_amount"].value_counts().sort_index(ascending=True)

# Any fare below eps (zero or negative) is discarded; a fare above zero is
# treated as valid.
non_positive_fare = train["fare_amount"] < eps
train.drop(train[non_positive_fare].index, axis=0, inplace=True)

train["fare_amount"].describe()
# Author's observation: apart from the fare, the remaining columns show little
# variance, which suggests the outliers are mostly gone.
train.describe()
# # Load the test set and add time-related columns to both data sets
test = pd.read_csv(r"C:\Users\Administrator\Desktop\纽约出租车车费预测\test.csv")

# Convert the `key` and `pickup_datetime` columns to datetimes.
for frame in (train, test):
    for stamp_col in ("key", "pickup_datetime"):
        frame[stamp_col] = pd.to_datetime(frame[stamp_col])
train.dtypes

# Add calendar columns derived from the pickup time.
for frame in (train, test):
    pickup_time = frame["pickup_datetime"].dt
    frame["year"] = pickup_time.year
    frame["month"] = pickup_time.month
    frame["day"] = pickup_time.day
    frame["hour"] = pickup_time.hour
    frame["day of week"] = pickup_time.dayofweek
train.dtypes
test.dtypes
# # 计算路程以及每mile的车费(预测时没用到,因为是预测test的车费)
def distance(lat1, long1, lat2, long2, frames=None):
    """Add a haversine ``H_Distance`` column (kilometres) to each frame.

    Parameters
    ----------
    lat1, long1 : column names of the start latitude / longitude.
    lat2, long2 : column names of the end latitude / longitude.
        The column values are passed through ``np.radians``, i.e. they are
        treated as degrees.
    frames : optional iterable of DataFrame-like objects to update in place.
        When omitted, the module-level ``train`` and ``test`` frames are used,
        which is what the original four-argument call did.

    Returns
    -------
    The distance values computed for the last frame processed, or ``None``
    when ``frames`` is empty.
    """
    earth_radius_km = 6371  # Earth radius in kilometres
    if frames is None:
        frames = [train, test]

    d = None
    for frame in frames:
        phi1 = np.radians(frame[lat1])
        phi2 = np.radians(frame[lat2])
        delta_phi = np.radians(frame[lat2] - frame[lat1])
        delta_lambda = np.radians(frame[long2] - frame[long1])

        # a = sin^2(dphi / 2) + cos(phi1) * cos(phi2) * sin^2(dlambda / 2)
        a = (np.sin(delta_phi / 2.0) ** 2
             + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2)
        # Rounding can push `a` marginally outside [0, 1], which would turn
        # sqrt(1 - a) into NaN; clamp it to the mathematically valid range.
        a = np.clip(a, 0.0, 1.0)

        # c = 2 * atan2(sqrt(a), sqrt(1 - a))
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        # d = R * c, in kilometres
        d = earth_radius_km * c
        frame['H_Distance'] = d
    return d
# Add the H_Distance column to both the training and the test frame.
distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

# Remove trips that did not move.  (An earlier variant compared the pickup and
# drop-off coordinates directly; the distance-based filter replaces it.)
eps = 1e-7
stationary = (train['H_Distance'] < eps) & (train['H_Distance'] > -eps)
train.drop(train[stationary].index, inplace=True, axis=0)

# Fare per unit of H_Distance.  distance() uses an Earth radius of 6371 km, so
# despite the column name this is a per-kilometre figure.
train["fare_pre_mile"] = train["fare_amount"] / train["H_Distance"]
train
train["fare_pre_mile"].describe()

# Count the rows lying more than `offset` above the mean.  Author's
# observation: very few rows exceed the mean, so a handful of extreme values
# is probably inflating it.
mean_rate = train["fare_pre_mile"].mean()
for offset in range(0, 20):
    print(train[train["fare_pre_mile"] > mean_rate + offset]["fare_pre_mile"].count())

# Trim the absurd rates first, then tighten step by step.  The mean is
# re-evaluated before every step on the rows that are still left.
for margin in (1000, 100, 100, 50):
    cutoff = train["fare_pre_mile"].mean() + margin
    train.drop(train[train['fare_pre_mile'] > cutoff].index, inplace=True, axis=0)
    train["fare_pre_mile"].describe()

# Author's observation: the mean is now roughly stable and the rate is close
# to common sense.
for threshold in range(0, 20):  # number of rows with a rate above each threshold
    print(threshold, " : ", train[train["fare_pre_mile"] > threshold]["fare_pre_mile"].count())

# Cut off the upper tail ...
train.drop(train[(train['fare_pre_mile'] > 8)].index, inplace=True, axis=0)
# ... and everything below 1.
train.drop(train[(train['fare_pre_mile'] < 1)].index, inplace=True, axis=0)
train['fare_pre_mile'].describe()
# 预测
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # 标准化
# Training features: everything except the identifier, the raw timestamp, the
# target and the derived fare-per-distance helper column.  `columns=` is used
# because the positional axis form `drop([...], 1)` is rejected by pandas >= 2.0.
x_train = train.drop(columns=["key", "pickup_datetime", "fare_amount", "fare_pre_mile"])
y_train = train["fare_amount"]  # training target
# Select the test features by name so that they match the training features in
# both order and number (this also keeps the cell safe to re-run after
# `fare_amount` has been added to `test` below).
x_test = test[x_train.columns]

# Standardise the features.  The scaler is fitted on the training data only
# and then applied unchanged to the test data; re-fitting it on the test set
# would scale the two sets with different statistics.
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

# Standardise the target as well (the scaler needs a 2-D column vector).
std_y = StandardScaler()
y_train = std_y.fit_transform(np.array(y_train).reshape(-1, 1))
x_train.shape
y_train.shape
x_test.shape

# Fit a linear model by stochastic gradient descent and predict the test fares.
sgd = SGDRegressor()
y_train = y_train.ravel()
sgd.fit(x_train, y_train)
y_sgd_predict = sgd.predict(x_test)
# Undo the target scaling.  inverse_transform expects a 2-D array, so reshape
# the 1-D predictions into a column and flatten the result again.
y_sgd_predict = std_y.inverse_transform(y_sgd_predict.reshape(-1, 1)).ravel()
y_sgd_predict
test["fare_amount"] = y_sgd_predict
train