Pandas重塑和轴向旋转

重塑和轴向旋转

Se

import pandas as pd
import numpy as np
from pandas import Series
data=pd.DataFrame(np.arange(6).reshape(2,3),
index=['Ohio','Colorado'],
columns=['one','two','three']
)
data.index.names=['state']
data.columns.names=['number']
data number one two three
state
Ohio 0 1 2
Colorado 3 4 5
# 使用stack()方法可将列转为行,一一对应,得到一个Series
result = data.stack()
result state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32 # unstack()可以将其重排为一个DataFrame
result.unstack() number one two three
state
Ohio 0 1 2
Colorado 3 4 5
# 默认情况下,unstack操作的是最内层(这里是number);也可以传入层级的编号或名称,指定要旋转的层级
result.unstack('state') state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
# 传入层级名称'state'或层级编号0效果相同,都是将state这一层索引旋转为DataFrame的列
result.unstack(0) state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
# 当各分组中并非所有索引值都存在时,unstack会引入缺失数据(NaN)
s1 = Series([0,1,2,3],index=['a','b','c','d'])
s2 = Series([4,5,6], index=['c','d','e'])
data2 = pd.concat([s1,s2],keys=['one','two'])
data2 one a 0
b 1
c 2
d 3
two c 4
d 5
e 6
dtype: int64 data2.unstack() a b c d e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0
# stack默认会过滤掉缺失数据;如果不想过滤,可以传入dropna=False
data2.unstack().stack()
one a 0.0
b 1.0
c 2.0
d 3.0
two c 4.0
d 5.0
e 6.0
dtype: float64 # 这是不过滤的效果
data2.unstack().stack(dropna=False) one a 0.0
b 1.0
c 2.0
d 3.0
e NaN
two a NaN
b NaN
c 4.0
d 5.0
e 6.0
dtype: float64 # DataFrame中的stack和unstack result state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32 df = pd.DataFrame({'left':result, 'right':result+5},columns=pd.Index(['left','right'],name='side'))
df side left right
state number
Ohio one 0 5
two 1 6
three 2 7
Colorado one 3 8
two 4 9
three 5 10
# 对DataFrame进行unstack操作时,被旋转的层级(这里是state)会成为结果列索引中的最低级别
df.unstack('state') side left right
state Ohio Colorado Ohio Colorado
number
one 0 3 5 8
two 1 4 6 9
three 2 5 7 10
# stack同理:把side从列旋转到行之后,side会成为结果行索引中的最低级别
df.unstack('state').stack('side') state Colorado Ohio
number side
one left 3 0
right 8 5
two left 4 1
right 9 6
three left 5 2
right 10 7

时间序列数据的堆叠格式

data_c = [
['1959-03-31','realgdb',2710.349],
['1959-03-31','infl',0.000],
['1959-03-31','unemp',5.800],
['1959-06-30','realgdb',2778.801],
['1959-06-30','infl',2.340],
['1959-06-30','unemp',5.100],
['1959-09-30','realgdb',2775.488],
['1959-09-30','infl',2.740],
['1959-09-30','unemp',5.300],
]
ldata = pd.DataFrame(data_c,columns=['data','item','value'])
ldata data item value
0 1959-03-31 realgdb 2710.349
1 1959-03-31 infl 0.000
2 1959-03-31 unemp 5.800
3 1959-06-30 realgdb 2778.801
4 1959-06-30 infl 2.340
5 1959-06-30 unemp 5.100
6 1959-09-30 realgdb 2775.488
7 1959-09-30 infl 2.740
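8 1959-09-30 unemp 5.300
# 将data列作为行索引、item列作为列索引,最简单的方法是使用pivot快捷函数
# 注意:pandas 2.0及以上版本中pivot的参数必须以关键字形式传入,即 ldata.pivot(index='data', columns='item', values='value')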
ldata.pivot('data','item','value') item infl realgdb unemp
data
1959-03-31 0.00 2710.349 5.8
1959-06-30 2.34 2778.801 5.1
1959-09-30 2.74 2775.488 5.3
# pivot其实等价于下面两步:先用set_index建立层次化索引,再用unstack进行重塑
#第一步
ldata.set_index(['data','item']) value
data item
1959-03-31 realgdb 2710.349
infl 0.000
unemp 5.800
1959-06-30 realgdb 2778.801
infl 2.340
unemp 5.100
1959-09-30 realgdb 2775.488
infl 2.740
unemp 5.300 # 第二步
ldata.set_index(['data','item']).unstack() value
item infl realgdb unemp
data
1959-03-31 0.00 2710.349 5.8
1959-06-30 2.34 2778.801 5.1
1959-09-30 2.74 2775.488 5.3
上一篇:Increase SharePoint Execution Timeout


下一篇:Leetcode Valid Sudoku