1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

本案例的重点在于Matplotlib可视化的基础操作实战练习。

import os                   #导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
os.chdir("D:\Datalu\File")  #指定工作目录
%matplotlib inline          #必要可视化设置
plt.rcParams["font.sans-serif"] = ["KAITI"]
plt.rcParams["axes.unicode_minus"] = False

一、问题的提出

1.查看一年的平均气温
2.查看一月份的气温
3.每个月的平均气温(柱状图和箱线图)

1.1 导入两份气温数据

# Load the outdoor temperature readings; read_table splits on tabs by default
# and the column names are supplied explicitly.
df1 = pd.read_table("temperature_outdoor_2014.tsv", names=["time", "outdoor"])
df1.head(2)
time outdoor
0 1388530986 4.38
1 1388531586 4.25
# Load the indoor temperature readings; read_table splits on tabs by default
# and the column names are supplied explicitly.
df2 = pd.read_table("temperature_indoor_2014.tsv", names=["time2", "indoor"])
df2.head(2)
time2 indoor
0 1388530986 21.94
1 1388531586 22.00
# Combine the two data sets side by side.
# NOTE(review): axis=1 lines rows up on the default integer index, i.e. by
# position rather than by timestamp. The tail of the displayed result shows
# `time` and `time2` disagreeing, so later rows pair an outdoor reading with an
# indoor reading taken at a different moment. Consider merging on the
# timestamp instead; confirm against the raw files.
df = pd.concat([df1,df2],join="inner",axis=1)
df
time outdoor time2 indoor
0 1388530986 4.38 1388530986 21.94
1 1388531586 4.25 1388531586 22.00
2 1388532187 4.19 1388532187 22.00
3 1388532787 4.06 1388532787 22.00
4 1388533388 4.06 1388533388 22.00
... ... ... ... ...
49540 1419975991 1.44 1419977793 11.75
49541 1419976592 1.50 1419978393 11.75
49542 1419977192 1.50 1419978994 11.75
49543 1419977793 1.56 1419979595 11.75
49544 1419978393 1.62 1419980195 11.81

49545 rows × 4 columns

# List the column labels of the combined frame.
df.columns
Index(['time', 'outdoor', 'time2', 'indoor'], dtype='object')
# Remove the second timestamp column in place, leaving `time` as the only
# time column.
df.drop(columns="time2", inplace=True)
df.head(2)
time outdoor indoor
0 1388530986 4.38 21.94
1 1388531586 4.25 22.00
# Continue the analysis on a copy of the combined frame.
dt1 = df.copy()
数据集一共有三列数据,其中一列是时间戳,两列是气温数据
这里有两种方法可以将时间戳转化为日期时间格式:第一种是在导入文件时直接解析,第二种是用to_datetime方法

二、查看数据基本信息

dt1.info(memory_usage="deep")    # The time column was not recognised as datetime automatically (it is still int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49545 entries, 0 to 49544
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   time     49545 non-null  int64  
 1   outdoor  49545 non-null  float64
 2   indoor   49545 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.1 MB
dt1.values   # Inspect the frame's values as a NumPy array
array([[1.38853099e+09, 4.38000000e+00, 2.19400000e+01],
       [1.38853159e+09, 4.25000000e+00, 2.20000000e+01],
       [1.38853219e+09, 4.19000000e+00, 2.20000000e+01],
       ...,
       [1.41997719e+09, 1.50000000e+00, 1.17500000e+01],
       [1.41997779e+09, 1.56000000e+00, 1.17500000e+01],
       [1.41997839e+09, 1.62000000e+00, 1.18100000e+01]])
dt1.values[:,0]   # Values of a single column, selected by position
array([1.38853099e+09, 1.38853159e+09, 1.38853219e+09, ...,
       1.41997719e+09, 1.41997779e+09, 1.41997839e+09])
dt1.time.values  # A column's values can also be reached through its name
array([1388530986, 1388531586, 1388532187, ..., 1419977192, 1419977793,
       1419978393], dtype=int64)

2.1 将时间戳转换为日期时间格式

dt1["time"] = pd.Timestamp(dt1["time"],unit="s") # Deliberately wrong: pd.Timestamp cannot convert a whole Series and raises TypeError
dt1
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-140-255e05936dae> in <module>
----> 1 dt1["time"] = pd.Timestamp(dt1["time"],unit="s") #这样转换时间戳是错误的
      2 dt1


pandas\_libs\tslibs\timestamps.pyx in pandas._libs.tslibs.timestamps.Timestamp.__new__()


pandas\_libs\tslibs\conversion.pyx in pandas._libs.tslibs.conversion.convert_to_tsobject()


TypeError: Cannot convert input [0        1388530986
1        1388531586
2        1388532187
3        1388532787
4        1388533388
            ...    
49540    1419975991
49541    1419976592
49542    1419977192
49543    1419977793
49544    1419978393
Name: time, Length: 49545, dtype: int64] of type <class 'pandas.core.series.Series'> to Timestamp
df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))  # Also wrong: pandas has no attribute `TimeStamp` (the class is spelled `Timestamp`), so this raises AttributeError
df
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-11-ea11b62a5933> in <module>
----> 1 df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))
      2 df


D:\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
   4136             else:
   4137                 values = self.astype(object)._values
-> 4138                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   4139 
   4140         if len(mapped) and isinstance(mapped[0], Series):


pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()


<ipython-input-11-ea11b62a5933> in <lambda>(x)
----> 1 df["time"] = df["time"].apply(lambda x:pd.TimeStamp(x))
      2 df


D:\Anaconda3\lib\site-packages\pandas\__init__.py in __getattr__(name)
    242         return _SparseArray
    243 
--> 244     raise AttributeError(f"module 'pandas' has no attribute '{name}'")
    245 
    246 


AttributeError: module 'pandas' has no attribute 'TimeStamp'
dt1["time"] =  pd.to_datetime(dt1["time"],unit="s")  # pd.to_datetime converts the whole column; unit="s" reads the integers as seconds since the epoch
dt1.head(2)
outdoor indoor month temperature_diff
time
2014-01-01 00:03:06+01:00 4.38 21.94 1 17.56
2014-01-01 00:13:06+01:00 4.25 22.00 1 17.75
dt1["time"] =  pd.to_datetime(dt1["time"].values,unit="s").tz_localize('UTC')  # Mark the timestamps as UTC (Coordinated Universal Time); the clock values appear unchanged from the previous step
dt1.head(2)
outdoor indoor month temperature_diff
time
2014-01-01 00:03:06+01:00 4.38 21.94 1 17.56
2014-01-01 00:13:06+01:00 4.25 22.00 1 17.75
# The readings are from Stockholm, so convert from UTC to the local
# Europe/Stockholm zone; the displayed times then carry a UTC offset
# (+01:00 in the rows shown in the outputs).
dt1["time"] =  pd.to_datetime(dt1["time"].values).tz_localize('UTC').tz_convert("Europe/Stockholm")  
dt1
# Then make the time column the index, which simplifies date-based selection.
dt1.set_index("time",inplace =True)

2.2 查看整理好的数据最新信息

# Show the last two rows of the prepared frame.
dt1.tail(2)
outdoor indoor month temperature_diff
time
2014-12-30 23:16:33+01:00 1.56 11.75 12 10.19
2014-12-30 23:26:33+01:00 1.62 11.81 12 10.19
# Summary statistics (count, mean, std, min, quartiles, max) per column.
dt1.describe()
outdoor indoor
count 49545.000000 49545.000000
mean 8.461685 23.396307
std 7.866008 4.684381
min -15.500000 10.310000
25% 2.620000 19.810000
50% 7.750000 22.940000
75% 13.880000 27.620000
max 34.380000 33.120000

三、气温随着时间变化可视化

# Draw both temperature series in one plot call.
plt.figure(figsize=(12,6),dpi = 100)
# plt.plot returns one line per column. The handles are kept so the legend can
# name them: the original call attached no labels, which left plt.legend()
# with nothing to show. The forced color="r" is dropped so the two series get
# distinct colours from the default cycle instead of both being red.
temperature_lines = plt.plot(dt1[["outdoor","indoor"]])
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温",rotation =0)
plt.legend(temperature_lines, ["outdoor","indoor"])
plt.tight_layout()
plt.show()

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

# An anomaly shows up between August and September 2014; zoom in on that
# stretch of rows for a closer look.
zoom_fig, zoom_ax = plt.subplots(figsize=(16, 6), dpi=100)
zoom_ax.plot(dt1.iloc[25000:35000], color="r")
zoom_ax.set_title("斯德哥尔摩气温图")
zoom_ax.set_xlabel("时间")
zoom_ax.set_ylabel("气温", rotation=0)
zoom_fig.tight_layout()
plt.show()

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

# Draw the two columns as separate, individually labelled lines.
year_fig, year_ax = plt.subplots(figsize=(12, 6), dpi=100)
for column, colour in (("outdoor", "r"), ("indoor", "b")):
    # Each line's legend label is the column name itself.
    year_ax.plot(dt1[column], color=colour, label=column)
year_ax.set_title("斯德哥尔摩气温状况")
year_ax.set_xlabel("时间")
year_ax.set_ylabel("气温")
year_ax.legend()
year_fig.tight_layout()
plt.show()

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

上图看到,可能存在数据缺失

查看一年的平均气温

# Mean of every column over the whole data set.
dt1.mean()
outdoor     8.461685
indoor     23.396307
dtype: float64

查看一月份的气温

# Breaking the problem down: how can January's rows be selected?
df1.filter(lambda x:x.index.month == 1) # Does not work: DataFrame.filter is handed a function here and fails with TypeError
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-59-1c58f5c242af> in <module>
      1 #2.查看一月份的气温
      2 #怎么索引出一月的气温
----> 3 df1.filter(lambda x:x.index.month == 1)


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in filter(self, items, like, regex, axis)
   4974         if items is not None:
   4975             name = self._get_axis_name(axis)
-> 4976             return self.reindex(**{name: [r for r in items if r in labels]})
   4977         elif like:
   4978 


TypeError: 'function' object is not iterable
dt1.loc[dt1.index.month==1] # Approach 1: boolean mask built from the month of the time index
outdoor indoor month temperature_diff
time
2014-01-01 00:03:06+01:00 4.38 21.94 1 17.56
2014-01-01 00:13:06+01:00 4.25 22.00 1 17.75
2014-01-01 00:23:07+01:00 4.19 22.00 1 17.81
2014-01-01 00:33:07+01:00 4.06 22.00 1 17.94
2014-01-01 00:43:08+01:00 4.06 22.00 1 17.94
... ... ... ... ...
2014-01-31 23:16:56+01:00 -3.88 16.31 1 20.19
2014-01-31 23:26:57+01:00 -3.81 16.31 1 20.12
2014-01-31 23:36:57+01:00 -3.81 16.31 1 20.12
2014-01-31 23:46:58+01:00 -3.75 16.31 1 20.06
2014-01-31 23:56:58+01:00 -3.69 16.38 1 20.07

4452 rows × 4 columns

dt1.loc[(dt1.index>"2014-01-01") & (dt1.index<"2014-02-01")]  # Approach 2: boolean filter comparing the index against two date bounds
outdoor indoor
time
2014-01-01 00:03:06+01:00 4.38 21.94
2014-01-01 00:13:06+01:00 4.25 22.00
2014-01-01 00:23:07+01:00 4.19 22.00
2014-01-01 00:33:07+01:00 4.06 22.00
2014-01-01 00:43:08+01:00 4.06 22.00
... ... ...
2014-01-31 23:16:56+01:00 -3.88 16.31
2014-01-31 23:26:57+01:00 -3.81 16.31
2014-01-31 23:36:57+01:00 -3.81 16.31
2014-01-31 23:46:58+01:00 -3.75 16.31
2014-01-31 23:56:58+01:00 -3.69 16.38

4452 rows × 2 columns

dt1["2014-1-1":"2014-1-31"]    # Approach 3: slice the time index with date strings
outdoor indoor
time
2014-01-01 00:03:06+01:00 4.38 21.94
2014-01-01 00:13:06+01:00 4.25 22.00
2014-01-01 00:23:07+01:00 4.19 22.00
2014-01-01 00:33:07+01:00 4.06 22.00
2014-01-01 00:43:08+01:00 4.06 22.00
... ... ...
2014-01-31 23:16:56+01:00 -3.88 16.31
2014-01-31 23:26:57+01:00 -3.81 16.31
2014-01-31 23:36:57+01:00 -3.81 16.31
2014-01-31 23:46:58+01:00 -3.75 16.31
2014-01-31 23:56:58+01:00 -3.69 16.38

4452 rows × 2 columns

# Approach 4: partial string indexing. A "YYYY-MM" string selects every row of
# that month from the time index (this is date matching, not a regular
# expression). .loc is required: plain dt1["2014-01"] is treated as a column
# lookup in pandas 2.x and no longer selects rows.
dt1.loc["2014-01"]
outdoor indoor
time
2014-01-01 00:03:06+01:00 4.38 21.94
2014-01-01 00:13:06+01:00 4.25 22.00
2014-01-01 00:23:07+01:00 4.19 22.00
2014-01-01 00:33:07+01:00 4.06 22.00
2014-01-01 00:43:08+01:00 4.06 22.00
... ... ...
2014-01-31 23:16:56+01:00 -3.88 16.31
2014-01-31 23:26:57+01:00 -3.81 16.31
2014-01-31 23:36:57+01:00 -3.81 16.31
2014-01-31 23:46:58+01:00 -3.75 16.31
2014-01-31 23:56:58+01:00 -3.69 16.38

4452 rows × 2 columns

# Derive two extra features: the calendar month taken from the time index, and
# the indoor-minus-outdoor temperature difference.
dt1["month"] = dt1.index.month
dt1["temperature_diff"] = dt1["indoor"].sub(dt1["outdoor"])
dt1.head(5)
outdoor indoor month temperature_diff
time
2014-01-01 00:03:06+01:00 4.38 21.94 1 17.56
2014-01-01 00:13:06+01:00 4.25 22.00 1 17.75
2014-01-01 00:23:07+01:00 4.19 22.00 1 17.81
2014-01-01 00:33:07+01:00 4.06 22.00 1 17.94
2014-01-01 00:43:08+01:00 4.06 22.00 1 17.94
# Approach 5: filter on the newly added month feature, then keep only the
# first two columns.
dt1.loc[dt1["month"].eq(1)].iloc[:, :2]
outdoor indoor
time
2014-01-01 00:03:06+01:00 4.38 21.94
2014-01-01 00:13:06+01:00 4.25 22.00
2014-01-01 00:23:07+01:00 4.19 22.00
2014-01-01 00:33:07+01:00 4.06 22.00
2014-01-01 00:43:08+01:00 4.06 22.00
... ... ...
2014-01-31 23:16:56+01:00 -3.88 16.31
2014-01-31 23:26:57+01:00 -3.81 16.31
2014-01-31 23:36:57+01:00 -3.81 16.31
2014-01-31 23:46:58+01:00 -3.75 16.31
2014-01-31 23:56:58+01:00 -3.69 16.38

4452 rows × 2 columns

# Plot January's outdoor and indoor temperatures.
january = dt1.loc[dt1.index.month == 1]  # select the January rows once, reuse for both lines
jan_fig, jan_ax = plt.subplots(figsize=(16, 6), dpi=100)
jan_ax.plot(january.iloc[:, 0:1], color="r", label="一月室外气温")
jan_ax.plot(january.iloc[:, 1:2], color="b", label="一月室内气温")
jan_ax.set_title("斯德哥尔摩气温图")
jan_ax.set_xlabel("时间")
jan_ax.set_ylabel("气温", rotation=0)
jan_ax.legend()
jan_fig.tight_layout()
plt.show()

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

# Look at how much the indoor/outdoor difference varies during January.
is_january = dt1.index.month == 1
diff_fig, diff_ax = plt.subplots(figsize=(16, 6), dpi=100)
# Everything from the fourth column onward is plotted.
diff_ax.plot(dt1.loc[is_january].iloc[:, 3:], color="r", label="一月室内外温差")
diff_ax.set_title("斯德哥尔摩气温图")
diff_ax.set_xlabel("时间")
diff_ax.set_ylabel("气温", rotation=0)
diff_ax.legend()
diff_fig.tight_layout()
plt.show()

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

1月13日这一天斯德哥尔摩内外温差达到最大,最大为35度。

每个月的平均气温(柱状图和箱线图)

# Compute each month's mean temperature first; the charts come afterwards.
# Approach 1: group on the month feature that was derived from the time index.
rows_by_month = dt1.groupby("month")
dt2 = rows_by_month.mean()
dt2
outdoor indoor temperature_diff
month
1 -1.776646 19.862590 21.639236
2 2.231613 20.231675 18.000063
3 4.615437 19.597298 14.981861
4 8.105193 22.151119 14.045926
5 12.261396 26.334053 14.072656
6 15.586955 28.687025 13.100070
7 20.780314 30.607379 9.827065
8 16.494823 28.094698 11.599875
9 12.823905 26.949290 14.125385
10 9.352000 23.378314 14.026315
11 4.992142 20.608239 15.616097
12 -0.060139 16.464418 16.524557
# Approach 2 (labelled "resampling" by the author): turn the time index into
# monthly periods, then average the rows that share a period.
monthly_periods = dt1.to_period(freq="M")
dt3 = monthly_periods.groupby(level=0).mean()
dt3
outdoor indoor month temperature_diff
time
2014-01 -1.776646 19.862590 1 21.639236
2014-02 2.231613 20.231675 2 18.000063
2014-03 4.615437 19.597298 3 14.981861
2014-04 8.105193 22.151119 4 14.045926
2014-05 12.261396 26.334053 5 14.072656
2014-06 15.586955 28.687025 6 13.100070
2014-07 20.780314 30.607379 7 9.827065
2014-08 16.494823 28.094698 8 11.599875
2014-09 12.823905 26.949290 9 14.125385
2014-10 9.352000 23.378314 10 14.026315
2014-11 4.992142 20.608239 11 15.616097
2014-12 -0.060139 16.464418 12 16.524557
# Bar chart drawn through pandas' own plotting interface.
dt2[["outdoor", "indoor"]].plot.bar(color=["r", "b"], figsize=(12, 6))
<matplotlib.axes._subplots.AxesSubplot at 0x19a33701588>

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

# Grouped bar chart of the monthly mean temperatures, drawn with matplotlib.
plt.figure(figsize=(16,6),dpi = 100)
bar_width=0.35
months = dt2.index.values
# Shift each series by half a bar width to either side of the month position
# and draw the bars with bar_width itself, so the pair sits next to each other
# without overlapping and the tick stays centred under the group.
plt.bar(months-bar_width/2,dt2["outdoor"].values,color="r",width=bar_width,label="室外气温")
plt.bar(months+bar_width/2,dt2["indoor"].values,color="b",width=bar_width,label="室内气温")
plt.xticks(months)  # one tick per month
plt.title("斯德哥尔摩气温图")
plt.xlabel("时间")
plt.ylabel("气温",rotation =0)
# The labels now describe what each series really is (outdoor / indoor monthly
# means); the legend call makes them visible.
plt.legend()
plt.tight_layout()
plt.show()

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

# Draw the two monthly-mean bar charts side by side, one per subplot.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
fig.subplots_adjust(wspace=0.5)

# (column, bar colour, legend label) for the left and right panels.
panels = (("outdoor", "r", "室外气温"), ("indoor", "b", "室内气温"))
for axis, (column, colour, legend_text) in zip(ax, panels):
    axis.bar(dt2.index.values, dt2[column].values, color=colour, label=legend_text)
    axis.set_title("斯德哥尔摩气温图")
    axis.set_xlabel("时间")
    axis.set_ylabel("气温", rotation=0)
    axis.legend()

plt.tight_layout()
plt.show()

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

# Box plots of the outdoor and indoor values held in dt2.
plt.figure(figsize=(16, 6), dpi=100)
box_data = [dt2[column].values for column in ("outdoor", "indoor")]
plt.boxplot(box_data, labels=["室外气温", "室内气温"], whis=1.63)
# Faint dotted horizontal grid lines to make the values easier to read off.
plt.grid(axis="y", linestyle=":", linewidth=1, color="gray", alpha=0.4)
plt.show()

1001系列之案例0002如何从斯德哥尔摩气温数据集中可视化挖掘

上一篇:[Leetcode学习-c++&java]Maximum Product of Word Lengths


下一篇:USACO20JAN Cave Painting 【并查集】