1、Series
# --- Series basics ---------------------------------------------------------
# Bare expressions below are notebook-style inspections: they compute a value
# for display and have no side effect when the file runs as a script.

# Create a Series from a list.
obj = pd.Series([4, 7, -5, 3])
obj.values  # the values
obj.index   # the index

# Create a Series with an explicit index.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2['a']              # look up one value by label
obj2[['c', 'a', 'd']]  # look up several values with a list of labels
obj2[obj2 > 0]         # filter with a boolean array
np.exp(obj2)           # apply a NumPy function

# A Series can be used like a fixed-length, ordered dict.
'b' in obj2

# Create a Series from a dict.
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)

# Override the index: only the listed labels are kept, in the given order.
# 'California' is not a key of sdata, so its value is missing (NaN).
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)

# Detect missing values.
pd.isnull(obj4)
pd.notnull(obj4)
obj4.isnull()

# Name the Series and its index.
obj4.name = 'population'
obj4.index.name = 'state'

# Replace the index by assignment.
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
2、DataFrame
# --- DataFrame basics ------------------------------------------------------
# Bare expressions below are notebook-style inspections with no side effect.

data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

# Specify the column order.
pd.DataFrame(data, columns=['year', 'state', 'pop'])

# Specify both columns and index; a column that is not in the data ('debt')
# is created with missing values.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame2.columns
frame2['state']
frame2.loc['three']  # fetch one row by label

# Column assignment.
frame2['debt'] = np.arange(6.)
# Assigning a Series aligns on the index; rows without a match become NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2['eastern'] = frame2.state == 'Ohio'
del frame2['eastern']  # delete a column

# Build a DataFrame from a nested dict: outer keys become columns, inner
# keys become the row index.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame2.values

# Index objects.
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
labels = pd.Index(np.arange(3))
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2.index is labels
'Ohio' in frame3.columns
2003 in frame3.index
pd.Index(['foo', 'foo', 'bar', 'bar'])  # a pandas Index may hold duplicates
3、重要函数
# --- reindex ---------------------------------------------------------------
# Bare expressions below are notebook-style inspections with no side effect.

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
# Labels that are absent from the original index ('e') get NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

# reindex can change the row index, the columns, or both.
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
# .loc with a list containing missing labels ('b', 'Utah') raises KeyError in
# pandas >= 1.0, so reindex both axes explicitly instead.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

# --- drop ------------------------------------------------------------------
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj.drop(['d', 'c'])

data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data.drop(['Colorado', 'Ohio'])             # drop rows
data.drop('two', axis=1)                    # drop a column via axis=1
data.drop(['two', 'four'], axis='columns')  # drop columns via axis='columns'
obj.drop('c', inplace=True)                 # inplace: modifies obj, no new object
4、选择索引
# --- Indexing, selection and filtering --------------------------------------
# Bare expressions below are notebook-style inspections with no side effect.

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj['b']
# Integer keys on a label index relied on a positional fallback that is
# deprecated in recent pandas; use .iloc for positional access.
obj.iloc[1]
obj.iloc[2:4]
obj[['b', 'a', 'd']]
obj.iloc[[1, 3]]
obj[obj < 2]
obj['b':'c']      # label slices include the end label
obj['b':'c'] = 5  # assignment through a label slice

data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data['two']
data[['three', 'one']]
data[:2]                  # a slice selects rows
data[data['three'] > 5]   # a boolean Series selects rows

# Selection with loc (labels) and iloc (integer positions).
data.loc['Colorado', ['two', 'three']]
data.iloc[2, [3, 0, 1]]
data.iloc[[1, 2], [3, 0, 1]]
data.loc[:'Utah', 'two']
data.iloc[:, :3][data.three > 5]
5、算术运算和数据对齐
# --- Arithmetic and data alignment ------------------------------------------
# Bare expressions below are notebook-style inspections with no side effect.

df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df2.loc[1, 'b'] = np.nan

# fill_value stands in for a value that is missing on one side only.
df1.add(df2, fill_value=0)
1 / df1
df1.reindex(columns=df2.columns, fill_value=0)

# Operations between a DataFrame and a Series.
frame = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
series = frame.iloc[0]
frame - series  # the Series index is matched against the columns
series3 = frame['d']
frame.sub(series3, axis='index')  # match on the row index instead
6、函数应用和映射
# --- Function application and mapping ---------------------------------------
# Bare expressions below are notebook-style inspections with no side effect.

frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
np.abs(frame)

# Apply a function that reduces each column (default) or each row to a scalar.
f = lambda x: x.max() - x.min()
frame.apply(f)
frame.apply(f, axis='columns')


def f(x):
    """Return the minimum and maximum of *x* as a Series labelled 'min'/'max'."""
    return pd.Series([x.min(), x.max()], index=['min', 'max'])


# The applied function may also return a Series (one result row per label).
frame.apply(f)

# Element-wise formatting.
# NOTE: this name shadows the builtin format(); it is kept so that any later
# code referring to it keeps working.
format = lambda x: '%.2f' % x
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use whichever
# this pandas version provides.
elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
elementwise(format)
frame['e'].map(format)
7、排序和rank
# --- Sorting ----------------------------------------------------------------
# Bare expressions below are notebook-style inspections with no side effect.

frame = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
frame.sort_index()                         # sort by row labels
frame.sort_index(axis=1, ascending=False)  # sort by column labels, descending

obj = pd.Series([4, 7, -3, 2])
obj.sort_values()  # sort a Series by its values

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b')         # sort rows by one column
frame.sort_values(by=['a', 'b'])  # sort rows by several columns
8、统计计算
# --- Summary statistics -----------------------------------------------------
# Bare expressions below are notebook-style inspections with no side effect.

df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
df
df.sum()                                  # column sums
df.sum(axis='columns')                    # row sums
df.mean(axis='columns', skipna=False)     # do not skip NaN when averaging rows
df.idxmax()                               # index label of each column's maximum
df.idxmin()                               # index label of each column's minimum
df.cumsum()
df.describe()

# describe() on non-numeric data.
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

# pandas_datareader is a third-party package that nothing in this section
# uses; guard the import so a missing install does not abort the section.
try:
    import pandas_datareader.data as web
except ImportError:
    web = None
9、Unique Values, Value Counts, and Membership
# --- Unique values, value counts and membership ------------------------------
# Bare expressions below are notebook-style inspections with no side effect.

obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
obj.value_counts()
# The top-level pd.value_counts() is deprecated since pandas 2.1; the Series
# method gives the same counts.
obj.value_counts(sort=False)

# Membership test, usable as a boolean filter.
mask = obj.isin(['b', 'c'])
obj[mask]

# Map each value of to_match to its position in unique_vals.
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)