参考pandas官方文档:
http://pandas.pydata.org/pandas-docs/stable/10min.html#min
1.pandas中的数据类型
Series 带有索引标记的一维数组,可以存储任何数据类型
#基本方法
>>s = pd.Series(data, index=index)
>>import pandas as pd
>>import numpy as np # 使用ndarray创建
>>indexs = ['a', 'b', 'c']
>>s = pd.Series(np.random.randn(3), index=indexs)
>>s
a -1.817485
b 0.012912
c 0.866929
dtype: float64
>>s.index
Index(['a', 'b', 'c'], dtype='object') #默认索引值
>>s = pd.Series(np.random.randn(3))
>>s
0 1.985833
1 0.467035
2 0.636828
dtype: float64 #使用dict创建
#默认使用dict的索引
>>d = {'a' : 0., 'b' : 1., 'c' : 2.}
>>pd.Series(d)
a 0.0
b 1.0
c 2.0
dtype: float64 #指明索引值
>>pd.Series(d, index=['b', 'c', 'd', 'a'])
b 1.0
c 2.0
d NaN
a 0.0
dtype: float64 #使用标量值创建
>>pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])
a 5.0
b 5.0
c 5.0
d 5.0
e 5.0
dtype: float64
Series 类似ndarray,可以使用Numpy的很多语法
>>s = pd.Series(np.random.randn(5),index=['a', 'b', 'c', 'd', 'e'])
>>s
a -1.329486
b 0.396057
c -1.156737
d -1.152107
e -0.787661
dtype: float64 # 索引
>>s[0]
-1.3294860342555725 #切片
>>s[:3]
a -1.329486
b 0.396057
c -1.156737
dtype: float64 # 布尔索引(按条件筛选)
>>s[s > s.median()]
b 0.396057
e -0.787661
dtype: float64 # 按序索引
>>s[[4,3,1]]
e -0.787661
d -1.152107
b 0.396057
dtype: float64 >>np.exp(s)
a 0.264613
b 1.485954
c 0.314511
d 0.315970
e 0.454908
dtype: float64
Series 类似dict类型,可以操作索引值
>>s['a']
-1.3294860342555725 >>s['e']=12
>>s
a -1.329486
b 0.396057
c -1.156737
d -1.152107
e 12.000000
dtype: float64 >>'e' in s
True >>s.get('e')
12.0 >>s+s
a -2.658972
b 0.792115
c -2.313474
d -2.304214
e 24.000000
dtype: float64 >>s*2
a -2.658972
b 0.792115
c -2.313474
d -2.304214
e 24.000000
dtype: float64 #索引值自动对齐
#s[1:]中没有a, s[:-1]中没有e, 未对齐的索引结果为NaN
>>s[1:] + s[:-1]
a NaN
b 0.792115
c -2.313474
d -2.304214
e NaN
dtype: float64
Series的name属性,创建新对象
#注意 name属性
>>s = pd.Series(np.random.randn(5),name='sth')
>>s
0 1.338578
1 2.074678
2 -0.462777
3 0.518763
4 -0.372692
Name: sth, dtype: float64 # 使用rename方法
>>s2 = s.rename('dif')
>>s2
0 1.338578
1 2.074678
2 -0.462777
3 0.518763
4 -0.372692
Name: dif, dtype: float64 >>id(s)
2669465319632 >>id(s2)
2669465320416 #s 与 s2是不同的对象,两者尽管值相同,但地址不同
DataFrame 带索引值的二维数组,类似SQL的表,列项通常是不同的数据类型
index 行索引,columns列索引
#使用Series字典或字典创建DataFrame
>>d= {'one':pd.Series([1.,2.,3.], index=['a','b','c']), 'two':pd.Series([1.,2.,3.,4.], index=['a','b','c','d'])}
>>df = pd.DataFrame(d)
>>df
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0 # 按序输出
>>pd.DataFrame(d, index=['d','b','a'])
one two
d NaN 4.0
b 2.0 2.0
a 1.0 1.0 >>df.index
Index(['a', 'b', 'c', 'd'], dtype='object')
>>df.columns
Index(['one', 'two'], dtype='object') #使用ndarrays/list字典
>>d = {'one':[1.,2.,3.,4.],'two':[4.,3.,2.,1.]}
>>pd.DataFrame(d)
one two
0 1.0 4.0
1 2.0 3.0
2 3.0 2.0
3 4.0 1.0 #指定index
>>pd.DataFrame(d,index=['a','b','c','d'])
one two
a 1.0 4.0
b 2.0 3.0
c 3.0 2.0
d 4.0 1.0
DataFrame操作
列选择、添加、删除
>>df['one']
a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64 #添加 three 与 flag 列,总在尾部添加
>>df['three'] = df['one'] * df['two']
>>df['flag']=df['one']>2
>>df
one two three flag
a 1.0 1.0 1.0 False
b 2.0 2.0 4.0 False
c 3.0 3.0 9.0 True
d NaN 4.0 NaN False # 删除
>>del df['two']
>>three = df.pop('three')
>>three
a 1.0
b 4.0
c 9.0
d NaN
Name: three, dtype: float64 >>df
one flag
a 1.0 False
b 2.0 False
c 3.0 True
d NaN False #可以将列数据截断
>>df['one_trunc'] = df['one'][:2]
>>df
one flag one_trunc
a 1.0 False 1.0
b 2.0 False 2.0
c 3.0 True NaN
d NaN False NaN >>df['foo'] = 'bar'
>>df
one flag one_trunc foo
a 1.0 False 1.0 bar
b 2.0 False 2.0 bar
c 3.0 True NaN bar
d NaN False NaN bar #使用insert函数可以在指定列后插入
#在第1列后插入
>>df.insert(1,'ba',df['one'])
>>df
one ba flag one_trunc foo
a 1.0 1.0 False 1.0 bar
b 2.0 2.0 False 2.0 bar
c 3.0 3.0 True NaN bar
d NaN NaN False NaN bar
索引、选择行
选择列 df[col] Series
按照标签选择行 df.loc[label] Series
按照整数位置选择行 df.iloc[loc] Series
切分行 df[5:10] DataFrame
按照布尔向量选择行 df[bool_vec] DataFrame
>>d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
>>df = pd.DataFrame(d)
>>df
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0 #按照标签选择行
>>df.loc['b']
one 2.0
two 2.0
Name: b, dtype: float64
>>type(df.loc['b'])
pandas.core.series.Series #按照整数位置选择行
>>df.iloc[2]
one 3.0
two 3.0
Name: c, dtype: float64 #切分行
>>df[1:3]
one two
b 2.0 2.0
c 3.0 3.0
>>type(df[1:3])
pandas.core.frame.DataFrame
选择列
>>df.one
a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64 >>df['one']
a 1.0
b 2.0
c 3.0
d NaN
Name: one, dtype: float64
数据对齐与计算
对齐:列与行标签自动对齐
>>da = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
>>db = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
>>da +db
A B C D
0 -0.920370 -0.529455 -2.386419 NaN
1 -1.277148 1.292130 1.196099 NaN
2 1.182199 0.454546 0.381586 NaN
3 1.100170 -1.830894 1.105932 NaN
4 0.507649 1.291516 -2.084368 NaN
5 -1.198811 -2.180978 0.342185 NaN
6 0.667211 2.141364 0.044136 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN #支持Numpy操作
>>np.exp(da)
>>np.asarray(da)
3维数据类型Panel,自0.20.0版本起已弃用,并在0.25.0版本中被移除
推荐使用独立的xarray库来处理多维(N维)带标签数据