pandas教程

  对pandas做最简单的介绍,针对初学者。

  一、引入相关模块

 import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

   二、对象创建

  2.1 创建一个Series。

 '''
Series(data,index,dtype,copy)
data:array-like,dict, or scalar value
index:array-like or index(1d)
dtype:numpy.dtype or None
copy:boolean, default False
'''
'''结果:
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
'''
s = pd.Series([1,3,5,np.nan,6,8])

  2.2 创建DataFrame。数据使用numpy的array,索引index使用datetime,列名使用标签

 '''
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
'''
dates = pd.date_range('20130101', periods=6)
'''
A B C D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
2013-01-05 -0.424972 0.567020 0.276232 -1.087401
2013-01-06 -0.673690 0.113648 -1.478427 0.524988
'''
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

  2.3 通过一个字典的数据创建一个DataFrame

 '''
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
'''
df2 = pd.DataFrame({ 'A' : 1.,'B' : pd.Timestamp('20130102'),'C' : pd.Series(1,index=list(range(4)),dtype='float32'),'D' : np.array([3] * 4,dtype='int32'),'E' :pd.Categorical(["test","train","test","train"]),'F' : 'foo' })

  三、取数据

  3.1 取前几行或者最后几行的数据

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df.head()
df.tail()

  3.2 显示index、column、或者数据

df.index
df.columns
df.values

  3.3 显示数据的一些统计数据

 '''
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.073711 -0.431125 -0.687758 -0.233103
std 0.843157 0.922818 0.779887 0.973118
min -0.861849 -2.104569 -1.509059 -1.135632
25% -0.611510 -0.600794 -1.368714 -1.076610
50% 0.022070 -0.228039 -0.767252 -0.386188
75% 0.658444 0.041933 -0.034326 0.461706
max 1.212112 0.567020 0.276232 1.071804
'''
df.describe()

  3.4 矩阵转置

 df.T

  3.5 根据某一维度进行排序

'''
对维度1的索引做降序排序
'''
df.sort_index(axis=1, ascending=False)

  3.6 对数据值做排序

'''
A B C D
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-06 -0.673690 0.113648 -1.478427 0.524988
2013-01-05 -0.424972 0.567020 0.276232 -1.087401
'''
df.sort_values(by='B')

  四、数据选择

  4.1、根据维度上的值取数据

  4.1.1 选择一个单独的列,得到的是一个Series对象

'''
2013-01-01 0.469112
2013-01-02 1.212112
2013-01-03 -0.861849
2013-01-04 0.721555
2013-01-05 -0.424972
2013-01-06 -0.673690
Freq: D, Name: A, dtype: float64
'''
print(df['A']) #等同于 df.A

  4.1.2类似[]一样做切割的操作

'''
A B C D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
'''
df[0:3] #等同于df['2013-01-01':'2013-01-03'](按标签切片时包含结束位置)

  4.1.3对于二维,根据纵轴上的值取数据

'''
A 0.469112
B -0.282863
C -1.509059
D -1.135632
Name: 2013-01-01 00:00:00, dtype: float64
'''
df.loc['2013-01-01']

  4.1.4[]的切割和轴上取值的综合使用

'''
A B
2013-01-01 0.469112 -0.282863
2013-01-02 1.212112 -0.173215
2013-01-03 -0.861849 -2.104569
2013-01-04 0.721555 -0.706771
2013-01-05 -0.424972 0.567020
2013-01-06 -0.673690 0.113648
'''
df.loc[:,['A','B']]

  4.1.5[]的切割和轴上取值的综合使用的进阶版

'''
A B
2013-01-02 1.212112 -0.173215
2013-01-03 -0.861849 -2.104569
2013-01-04 0.721555 -0.706771
'''
df.loc['20130102':'20130104',['A','B']] #20130102会自动格式化

  4.1.6相关的额外的用例

'''
A 1.212112
B -0.173215
Name: 2013-01-02 00:00:00, dtype: float64
'''
df.loc['20130102',['A','B']]
'''
0.46911229990718628
'''
df.loc[dates[0],'A'] # dates[0]等价于 '20130101'

  4.2、根据位置取数据

'''
A 0.721555
B -0.706771
C -1.039575
D 0.271860
Name: 2013-01-04 00:00:00, dtype: float64
'''
df.iloc[3]
'''
        A B
2013-01-04 0.721555 -0.706771
2013-01-05 -0.424972 0.567020
'''
df.iloc[3:5,0:2]
'''
        A C
2013-01-02 1.212112 0.119209
2013-01-03 -0.861849 -0.494929
2013-01-05 -0.424972 0.276232
'''
df.iloc[[1,2,4],[0,2]]
'''
          A B C D
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
'''
df.iloc[1:3,:]
'''
        B C
2013-01-01 -0.282863 -1.509059
2013-01-02 -0.173215 0.119209
2013-01-03 -2.104569 -0.494929
2013-01-04 -0.706771 -1.039575
2013-01-05 0.567020 0.276232
2013-01-06 0.113648 -1.478427
'''
df.iloc[:,1:3]
'''
-0.17321464905330858
'''
df.iloc[1,1]

  五、数据选择中布尔条件的筛选用法

'''
           A B C D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
'''
df[df.A > 0] #针对A 条件成立的数据显示,不成立的数据过滤
'''
          A B C D
2013-01-01 0.469112 NaN NaN NaN
2013-01-02 1.212112 NaN 0.119209 NaN
2013-01-03 NaN NaN NaN 1.071804
2013-01-04 0.721555 NaN NaN 0.271860
2013-01-05 NaN 0.567020 0.276232 NaN
2013-01-06 NaN 0.113648 NaN 0.524988
'''
df[df > 0] # 对整个数据做判断,成立的显示,不成立的显示nan
# 复制
df2 = df.copy()
# 增加一个列
df2['E'] = ['one', 'one','two','three','four','three']
'''
          A B C D E
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 one
2013-01-02 1.212112 -0.173215 0.119209 -1.044236 one
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 two
2013-01-04 0.721555 -0.706771 -1.039575 0.271860 three
2013-01-05 -0.424972 0.567020 0.276232 -1.087401 four
2013-01-06 -0.673690 0.113648 -1.478427 0.524988 three
'''
print(df2)
'''
          A B C D E
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 two
2013-01-05 -0.424972 0.567020 0.276232 -1.087401 four
'''
df2[df2['E'].isin(['two','four'])] # isin()方法用于过滤,显示条件成立的结果

  六、赋值(setting)

  6.1 增加一列

# 新列的值
s = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130101',periods=6))
'''
           A B C D E
2013-01-01 -0.330600 -1.326650 1.956782 0.328470 1
2013-01-02 0.173402 -0.373742 -0.121202 0.382443 2
2013-01-03 -0.579300 -0.381537 -2.955372 -0.557058 3
2013-01-04 1.358076 0.907546 0.629780 -1.579100 4
2013-01-05 2.269737 1.224567 0.591703 -1.022714 5
2013-01-06 0.966249 -0.205897 -0.003112 1.925219 6 '''
df['E'] = s # 对新列赋值

  6.2 增加行,可使用拼接(concat)或追加(append);注意 DataFrame.append 自 pandas 2.0 起已被移除,新版本请使用 pd.concat([df, su_df])

'''
          A B C D E
2013-01-07 1.105365 0.027329 2.210636 1.497980 0.761118
2013-01-08 0.387425 -1.506767 0.416878 -1.479918 -0.716363 '''
su_df = pd.DataFrame(np.random.randn(2,5), index=pd.date_range('20130107',periods=2),columns=list('ABCDE'))
'''
          A B C D E
2013-01-01 -2.476921 -0.961169 0.063422 2.010977 1.000000
2013-01-02 1.060736 0.265674 0.092731 -0.423340 2.000000
2013-01-03 0.036753 1.757448 0.987356 0.344027 3.000000
2013-01-04 -0.429803 0.783153 -0.124511 -0.678557 4.000000
2013-01-05 -0.266420 -3.515056 -0.138616 1.244520 5.000000
2013-01-06 0.217777 -0.327220 0.266039 0.672814 6.000000
2013-01-07 1.105365 0.027329 2.210636 1.497980 0.761118
2013-01-08 0.387425 -1.506767 0.416878 -1.479918 -0.716363 '''
df.append(su_df)

  6.3 根据标签赋值

df.at['20130101','A'] = 0

  6.4 根据位置赋值

df.iat[0,1] = 0

  6.5 为某一列赋值

df.loc[:,'E'] = np.array([5] * len(df))

  七、其他对数据处理的函数

'''
dates = ['2013-01-01','2013-01-02','2013-01-03','2013-01-04',...]
A         B         C  D    F    E
2013-01-01 0.000000 0.000000 -1.509059 5 NaN 1.0
2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0
2013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 NaN
2013-01-04 0.721555 -0.706771 -1.039575 5 3.0 NaN
'''
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) # 可以对原有的数据进行增删改,返回一个经过增删改后新的数据
'''
A         B         C         D    E
2013-01-01 -2.476921 -0.961169 0.063422 2.010977 1.0
2013-01-02 1.060736 0.265674 0.092731 -0.423340 1.0
2013-01-03 0.036753 1.757448 0.987356 0.344027 1.0
2013-01-04 -0.429803 0.783153 -0.124511 -0.678557 NaN
2013-01-05 -0.266420 -3.515056 -0.138616 1.244520 NaN
2013-01-06 0.217777 -0.327220 0.266039 0.672814 NaN '''
df
'''
A B C D E
2013-01-01 -2.476921 -0.961169 0.063422 2.010977 1.0
2013-01-02 1.060736 0.265674 0.092731 -0.423340 1.0
2013-01-03 0.036753 1.757448 0.987356 0.344027 1.0 '''
df.dropna(how='any')
'''
A B C D E
2013-01-01 -2.476921 -0.961169 0.063422 2.010977 1.0
2013-01-02 1.060736 0.265674 0.092731 -0.423340 1.0
2013-01-03 0.036753 1.757448 0.987356 0.344027 1.0
2013-01-04 -0.429803 0.783153 -0.124511 -0.678557 777.0
2013-01-05 -0.266420 -3.515056 -0.138616 1.244520 777.0
2013-01-06 0.217777 -0.327220 0.266039 0.672814 777.0 '''
df.fillna(value=777) # 对为NAN的值进行填充

  八、使用回调函数处理数据

df.apply(np.cumsum)
df.apply(lambda x : x.max() - x.min())

  九、对key和value的处理函数

'''
0 4
1 2
2 1
3 2
4 6
5 4
6 4
7 6
8 4
9 4
'''
s = pd.Series(np.random.randint(0, 7, size=10))
'''
4 5
6 2
2 2
1 1
dtype: int64
'''
s.value_counts() # 统计对应的值出现的次数
上一篇:Hadoop基础-MapReduce的常用文件格式介绍


下一篇:Hadoop基础-MapReduce的排序