本案例的重点在于Matplotlib可视化的基础操作实战练习。
import os #导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
os.chdir("D:\Datalu\File") #指定工作目录
%matplotlib inline #必要可视化设置
plt.rcParams["font.sans-serif"] = ["KAITI"]
plt.rcParams["axes.unicode_minus"] = False
一、问题的提出
1.查看一年的平均气温
2.查看一月份的气温
3.每个月的平均气温(柱状图和箱线图)
1.1 导入两份气温数据
# Load the outdoor temperature readings: a tab-separated file whose two
# columns are named here as the timestamp and the outdoor temperature.
df1 = pd.read_csv(
    "temperature_outdoor_2014.tsv",
    sep="\t",
    names=["time", "outdoor"],
)
df1.head(n=2)
time
outdoor
0
1388530986
4.38
1
1388531586
4.25
# Load the indoor temperature readings: a tab-separated file whose two
# columns are named here as the timestamp and the indoor temperature.
df2 = pd.read_csv(
    "temperature_indoor_2014.tsv",
    sep="\t",
    names=["time2", "indoor"],
)
df2.head(n=2)
time2
indoor
0
1388530986
21.94
1
1388531586
22.00
# Combine the two data sets.
# The two files do not stay in step row-for-row (by the last rows the indoor
# timestamp is three samples ahead of the outdoor one), so gluing them
# together by row position pairs readings taken at different times.
# Instead, match every outdoor reading with the indoor reading nearest in
# time, accepting a match only if it is at most 600 s away (about one
# sampling interval).
df = pd.merge_asof(
    df1.sort_values("time"),      # merge_asof requires sorted keys
    df2.sort_values("time2"),
    left_on="time",
    right_on="time2",
    direction="nearest",
    tolerance=600,
)
# Keep only rows that have a reading from both sensors, as the inner join did.
df = df.dropna(subset=["indoor"]).reset_index(drop=True)
df
time
outdoor
time2
indoor
0
1388530986
4.38
1388530986
21.94
1
1388531586
4.25
1388531586
22.00
2
1388532187
4.19
1388532187
22.00
3
1388532787
4.06
1388532787
22.00
4
1388533388
4.06
1388533388
22.00
...
...
...
...
...
49540
1419975991
1.44
1419977793
11.75
49541
1419976592
1.50
1419978393
11.75
49542
1419977192
1.50
1419978994
11.75
49543
1419977793
1.56
1419979595
11.75
49544
1419978393
1.62
1419980195
11.81
49545 rows × 4 columns
# List the column labels of the combined frame.
df.columns
Index(['time', 'outdoor', 'time2', 'indoor'], dtype='object')
# Remove the indoor file's timestamp column in place; only "time" is kept.
df.drop(columns="time2", inplace=True)
df.head(n=2)
time
outdoor
indoor
0
1388530986
4.38
21.94
1
1388531586
4.25
22.00
# Work on a copy so the combined frame `df` is left untouched.
dt1 = df.copy()
数据集一共有三列数据,其中一列是时间戳,两列是气温数据
这里有两种方法可以将其转化为日期时间格式:第一种是在导入文件时转换,第二种是用to_datetime方法
二、查看数据基本信息
dt1.info(memory_usage="deep") # the time column was not recognised as a datetime; it is still int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49545 entries, 0 to 49544
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 time 49545 non-null int64
1 outdoor 49545 non-null float64
2 indoor 49545 non-null float64
dtypes: float64(2), int64(1)
memory usage: 1.1 MB
dt1.values #view the values of the data set as one array
array([[1.38853099e+09, 4.38000000e+00, 2.19400000e+01],
[1.38853159e+09, 4.25000000e+00, 2.20000000e+01],
[1.38853219e+09, 4.19000000e+00, 2.20000000e+01],
...,
[1.41997719e+09, 1.50000000e+00, 1.17500000e+01],
[1.41997779e+09, 1.56000000e+00, 1.17500000e+01],
[1.41997839e+09, 1.62000000e+00, 1.18100000e+01]])
dt1.values[:,0] #view the values of a single column (column 0) of the data set
array([1.38853099e+09, 1.38853159e+09, 1.38853219e+09, ...,
1.41997719e+09, 1.41997779e+09, 1.41997839e+09])
dt1.time.values #the values can also be read through the column name
array([1388530986, 1388531586, 1388532187, ..., 1419977192, 1419977793,
1419978393], dtype=int64)
2.1 将时间戳转换为日期时间格式
# Deliberately failing example: pd.Timestamp is handed the whole Series and
# raises TypeError ("Cannot convert input ... to Timestamp").
dt1["time"] = pd.Timestamp(dt1["time"],unit="s") #converting the timestamps this way is wrong
dt1
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-140-255e05936dae> in <module>
----> 1 dt1["time"] = pd.Timestamp(dt1["time"],unit="s") #这样转换时间戳是错误的
2 dt1
pandas\_libs\tslibs\timestamps.pyx in pandas._libs.tslibs.timestamps.Timestamp.__new__()
pandas\_libs\tslibs\conversion.pyx in pandas._libs.tslibs.conversion.convert_to_tsobject()
TypeError: Cannot convert input [0 1388530986
1 1388531586
2 1388532187
3 1388532787
4 1388533388
...
49540 1419975991
49541 1419976592
49542 1419977192
49543 1419977793
49544 1419978393
Name: time, Length: 49545, dtype: int64] of type <class 'pandas.core.series.Series'> to Timestamp
# Deliberately failing example: pandas has no attribute "TimeStamp", so the
# lambda raises AttributeError on the first element.
df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x)) #converting the timestamps this way is wrong as well
df
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-ea11b62a5933> in <module>
----> 1 df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))
2 df
D:\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
4136 else:
4137 values = self.astype(object)._values
-> 4138 mapped = lib.map_infer(values, f, convert=convert_dtype)
4139
4140 if len(mapped) and isinstance(mapped[0], Series):
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-11-ea11b62a5933> in <lambda>(x)
----> 1 df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))
2 df
D:\Anaconda3\lib\site-packages\pandas\__init__.py in __getattr__(name)
242 return _SparseArray
243
--> 244 raise AttributeError(f"module 'pandas' has no attribute '{name}'")
245
246
AttributeError: module 'pandas' has no attribute 'TimeStamp'
dt1["time"] = pd.to_datetime(dt1["time"],unit="s") #to_datetime performs the conversion (the integers are interpreted as seconds)
dt1.head(2)
outdoor
indoor
month
temperature_diff
time
2014-01-01 00:03:06+01:00
4.38
21.94
1
17.56
2014-01-01 00:13:06+01:00
4.25
22.00
1
17.75
dt1["time"] = pd.to_datetime(dt1["time"].values,unit="s").tz_localize('UTC') #mark the timestamps as UTC (Coordinated Universal Time); the clock values appear to be the same as before
dt1.head(2)
outdoor
indoor
month
temperature_diff
time
2014-01-01 00:03:06+01:00
4.38
21.94
1
17.56
2014-01-01 00:13:06+01:00
4.25
22.00
1
17.75
# The readings are in European time, so the UTC timestamps still have to be
# converted to the Europe/Stockholm zone; the rows shown in the outputs carry
# a +01:00 offset, i.e. local time is one hour ahead of UTC there.
dt1["time"] = pd.to_datetime(dt1["time"].values).tz_localize('UTC').tz_convert("Europe/Stockholm")
dt1
#Then make the time column the index, which makes later operations easier
dt1.set_index("time",inplace =True)
2.2 查看整理好的数据最新信息
# Show the last two rows of the prepared frame.
dt1.tail(2)
outdoor
indoor
month
temperature_diff
time
2014-12-30 23:16:33+01:00
1.56
11.75
12
10.19
2014-12-30 23:26:33+01:00
1.62
11.81
12
10.19
# Summary statistics (count, mean, std, min, quartiles, max) per column.
dt1.describe()
outdoor
indoor
count
49545.000000
49545.000000
mean
8.461685
23.396307
std
7.866008
4.684381
min
-15.500000
10.310000
25%
2.620000
19.810000
50%
7.750000
22.940000
75%
13.880000
27.620000
max
34.380000
33.120000
三、气温随着时间变化可视化
# Plot both series in one call.
plt.figure(figsize=(12,6),dpi = 100)
# plot() returns one line per column, in column order. Keep the handles so
# each series gets its own colour and a legend entry; with a single colour
# and no labels the legend would be empty and the lines indistinguishable.
outdoor_line, indoor_line = plt.plot(dt1[["outdoor","indoor"]])
outdoor_line.set_color("r")
indoor_line.set_color("b")
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温",rotation =0)
plt.legend([outdoor_line, indoor_line], ["outdoor", "indoor"])
plt.tight_layout()
plt.show()
# Something looks abnormal between August and September 2014, so zoom in on
# rows 25000-35000 to inspect that stretch in detail.
zoom_window = dt1.iloc[25000:35000, :]
plt.figure(figsize=(16, 6), dpi=100)
plt.plot(zoom_window, color="r")
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温", rotation=0)
plt.tight_layout()
plt.show()
# Draw the two columns as separate lines, each with its own colour and
# a legend label equal to the column name.
plt.figure(figsize=(12, 6), dpi=100)
for column, colour in (("outdoor", "r"), ("indoor", "b")):
    plt.plot(dt1[column], color=colour, label=column)
plt.title("斯德哥尔摩气温状况")
plt.xlabel("时间")
plt.ylabel("气温")
plt.legend()
plt.tight_layout()
plt.show()
上图看到,可能存在数据缺失
查看一年的平均气温
# Column-wise mean over every row of the frame.
dt1.mean()
outdoor 8.461685
indoor 23.396307
dtype: float64
查看一月份的气温
#Breaking the problem down: how can the January temperatures be selected?
# Deliberately failing example: filter() iterates its first argument as a
# list of labels, so passing a function raises TypeError.
df1.filter(lambda x:x.index.month == 1) #this does not filter anything out
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-59-1c58f5c242af> in <module>
1 #2.查看一月份的气温
2 #怎么索引出一月的气温
----> 3 df1.filter(lambda x:x.index.month == 1)
D:\Anaconda3\lib\site-packages\pandas\core\generic.py in filter(self, items, like, regex, axis)
4974 if items is not None:
4975 name = self._get_axis_name(axis)
-> 4976 return self.reindex(**{name: [r for r in items if r in labels]})
4977 elif like:
4978
TypeError: 'function' object is not iterable
dt1.loc[dt1.index.month==1] #Method 1: filter with a condition built from the index
outdoor
indoor
month
temperature_diff
time
2014-01-01 00:03:06+01:00
4.38
21.94
1
17.56
2014-01-01 00:13:06+01:00
4.25
22.00
1
17.75
2014-01-01 00:23:07+01:00
4.19
22.00
1
17.81
2014-01-01 00:33:07+01:00
4.06
22.00
1
17.94
2014-01-01 00:43:08+01:00
4.06
22.00
1
17.94
...
...
...
...
...
2014-01-31 23:16:56+01:00
-3.88
16.31
1
20.19
2014-01-31 23:26:57+01:00
-3.81
16.31
1
20.12
2014-01-31 23:36:57+01:00
-3.81
16.31
1
20.12
2014-01-31 23:46:58+01:00
-3.75
16.31
1
20.06
2014-01-31 23:56:58+01:00
-3.69
16.38
1
20.07
4452 rows × 4 columns
dt1.loc[(dt1.index>"2014-01-01") & (dt1.index<"2014-02-01")] #Method 2: filter with comparison conditions
outdoor
indoor
time
2014-01-01 00:03:06+01:00
4.38
21.94
2014-01-01 00:13:06+01:00
4.25
22.00
2014-01-01 00:23:07+01:00
4.19
22.00
2014-01-01 00:33:07+01:00
4.06
22.00
2014-01-01 00:43:08+01:00
4.06
22.00
...
...
...
2014-01-31 23:16:56+01:00
-3.88
16.31
2014-01-31 23:26:57+01:00
-3.81
16.31
2014-01-31 23:36:57+01:00
-3.81
16.31
2014-01-31 23:46:58+01:00
-3.75
16.31
2014-01-31 23:56:58+01:00
-3.69
16.38
4452 rows × 2 columns
dt1["2014-1-1":"2014-1-31"] #Method 3: slice the index
outdoor
indoor
time
2014-01-01 00:03:06+01:00
4.38
21.94
2014-01-01 00:13:06+01:00
4.25
22.00
2014-01-01 00:23:07+01:00
4.19
22.00
2014-01-01 00:33:07+01:00
4.06
22.00
2014-01-01 00:43:08+01:00
4.06
22.00
...
...
...
2014-01-31 23:16:56+01:00
-3.88
16.31
2014-01-31 23:26:57+01:00
-3.81
16.31
2014-01-31 23:36:57+01:00
-3.81
16.31
2014-01-31 23:46:58+01:00
-3.75
16.31
2014-01-31 23:56:58+01:00
-3.69
16.38
4452 rows × 2 columns
# Method 4: partial-string indexing on the datetime index (this is not a
# regular expression). Row selection has to go through .loc; in current
# pandas a plain dt1["2014-01"] is treated as a column lookup and fails.
dt1.loc["2014-01"]
outdoor
indoor
time
2014-01-01 00:03:06+01:00
4.38
21.94
2014-01-01 00:13:06+01:00
4.25
22.00
2014-01-01 00:23:07+01:00
4.19
22.00
2014-01-01 00:33:07+01:00
4.06
22.00
2014-01-01 00:43:08+01:00
4.06
22.00
...
...
...
2014-01-31 23:16:56+01:00
-3.88
16.31
2014-01-31 23:26:57+01:00
-3.81
16.31
2014-01-31 23:36:57+01:00
-3.81
16.31
2014-01-31 23:46:58+01:00
-3.75
16.31
2014-01-31 23:56:58+01:00
-3.69
16.38
4452 rows × 2 columns
# Derive a month feature from the datetime index.
dt1["month"] = dt1.index.month
# Derive the indoor-minus-outdoor temperature difference feature.
dt1["temperature_diff"] = dt1["indoor"].sub(dt1["outdoor"])
dt1.head(5)
outdoor
indoor
month
temperature_diff
time
2014-01-01 00:03:06+01:00
4.38
21.94
1
17.56
2014-01-01 00:13:06+01:00
4.25
22.00
1
17.75
2014-01-01 00:23:07+01:00
4.19
22.00
1
17.81
2014-01-01 00:33:07+01:00
4.06
22.00
1
17.94
2014-01-01 00:43:08+01:00
4.06
22.00
1
17.94
dt1.loc[dt1["month"] ==1].iloc[:,0:2] #Method 5: filter on the newly added feature column, then keep the first two columns
outdoor
indoor
time
2014-01-01 00:03:06+01:00
4.38
21.94
2014-01-01 00:13:06+01:00
4.25
22.00
2014-01-01 00:23:07+01:00
4.19
22.00
2014-01-01 00:33:07+01:00
4.06
22.00
2014-01-01 00:43:08+01:00
4.06
22.00
...
...
...
2014-01-31 23:16:56+01:00
-3.88
16.31
2014-01-31 23:26:57+01:00
-3.81
16.31
2014-01-31 23:36:57+01:00
-3.81
16.31
2014-01-31 23:46:58+01:00
-3.75
16.31
2014-01-31 23:56:58+01:00
-3.69
16.38
4452 rows × 2 columns
# Plot the January indoor and outdoor temperatures.
# Select the January rows once and reuse them for both lines.
january = dt1.loc[dt1.index.month == 1]
plt.figure(figsize=(16, 6), dpi=100)
plt.plot(january.iloc[:, 0:1], color="r", label="一月室外气温")
plt.plot(january.iloc[:, 1:2], color="b", label="一月室内气温")
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温", rotation=0)
plt.legend()
plt.tight_layout()
plt.show()
# Look at how much the January indoor/outdoor temperature difference varies.
# Columns from position 3 onward of the January rows are plotted.
january_diff = dt1.loc[dt1.index.month == 1].iloc[:, 3:]
plt.figure(figsize=(16, 6), dpi=100)
plt.plot(january_diff, color="r", label="一月室内外温差")
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温", rotation=0)
plt.legend()
plt.tight_layout()
plt.show()
1月13日这一天斯德哥尔摩内外温差达到最大,最大为35度。
每个月的平均气温(柱状图和箱线图)
# First compute the mean temperature of every month, then plot it.
# Method 1: use the month feature derived from the time index as group key.
monthly_groups = dt1.groupby("month")
dt2 = monthly_groups.mean()
dt2
outdoor
indoor
temperature_diff
month
1
-1.776646
19.862590
21.639236
2
2.231613
20.231675
18.000063
3
4.615437
19.597298
14.981861
4
8.105193
22.151119
14.045926
5
12.261396
26.334053
14.072656
6
15.586955
28.687025
13.100070
7
20.780314
30.607379
9.827065
8
16.494823
28.094698
11.599875
9
12.823905
26.949290
14.125385
10
9.352000
23.378314
14.026315
11
4.992142
20.608239
15.616097
12
-0.060139
16.464418
16.524557
# Method 2: resampling-style aggregation. Turn the index into monthly
# periods, then average all rows that share the same period.
by_period = dt1.to_period(freq="M")
dt3 = by_period.groupby(level=0).mean()
dt3
outdoor
indoor
month
temperature_diff
time
2014-01
-1.776646
19.862590
1
21.639236
2014-02
2.231613
20.231675
2
18.000063
2014-03
4.615437
19.597298
3
14.981861
2014-04
8.105193
22.151119
4
14.045926
2014-05
12.261396
26.334053
5
14.072656
2014-06
15.586955
28.687025
6
13.100070
2014-07
20.780314
30.607379
7
9.827065
2014-08
16.494823
28.094698
8
11.599875
2014-09
12.823905
26.949290
9
14.125385
2014-10
9.352000
23.378314
10
14.026315
2014-11
4.992142
20.608239
11
15.616097
2014-12
-0.060139
16.464418
12
16.524557
# Bar chart using pandas' built-in plotting.
monthly_temps = dt2[["outdoor", "indoor"]]
monthly_temps.plot.bar(color=["r", "b"], figsize=(12, 6))
<matplotlib.axes._subplots.AxesSubplot at 0x19a33701588>
# Grouped bar chart of the monthly means drawn directly with matplotlib.
plt.figure(figsize=(16,6),dpi = 100)
bar_width=0.35
months = dt2.index.values
# Place the two bars of each month side by side, centred on the month tick.
# Each bar is exactly bar_width wide so neighbouring bars do not overlap.
plt.bar(months-bar_width/2,dt2["outdoor"].values,color="r",width=bar_width,label="室外气温")
plt.bar(months+bar_width/2,dt2["indoor"].values,color="b",width=bar_width,label="室内气温")
plt.xticks(months)  # one tick per month
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温",rotation =0)
plt.legend()  # the labels above are only shown once a legend is requested
plt.tight_layout()
plt.show()
# Draw the two series in separate side-by-side panels.
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
fig.subplots_adjust(wspace=0.5)
# (column, bar colour, legend label) for the left and right panel.
panels = (("outdoor", "r", "室外气温"), ("indoor", "b", "室内气温"))
for panel, (column, colour, caption) in zip(ax, panels):
    panel.bar(dt2.index.values, dt2[column].values, color=colour, label=caption)
    panel.set_title("斯德哥尔摩气温图")
    panel.set_xlabel("时间")
    panel.set_ylabel("气温", rotation=0)
    panel.legend()
plt.tight_layout()
plt.show()
# Box plots of the outdoor and indoor values held in dt2.
box_data = [dt2["outdoor"].values, dt2["indoor"].values]
box_names = ["室外气温", "室内气温"]
plt.figure(figsize=(16, 6), dpi=100)
plt.boxplot(box_data, labels=box_names, whis=1.63)
# Faint dotted horizontal grid lines to help read the values.
plt.grid(axis="y", ls=":", lw=1, color="gray", alpha=0.4)
plt.show()