【python数据分析】pandas库之文本处理

Pandas针对字符串配备的一套方法,使其易于对数组的每个元素进行操作

1、常用方法 lower,upper,len,startswith,endswith

s = pd.Series(['A','b','C','bbhello','123',np.nan,'hj'])
df = pd.DataFrame({'key1':list('abcdef'),
                  'key2':['hee','fv','w','hija','123',np.nan]})

#统计b出现的次数
print(s.str.count('b'))
print(s.str.lower(),'→ lower小写\n')
print(s.str.upper(),'→ upper大写\n')
print(s.str.len(),'→ len字符长度\n')
print(s.str.startswith('b'),'→ 判断起始是否为a\n')
print(s.str.endswith('3'),'→ 判断结束是否为3\n')

2、字符串常用方法 - strip

s = pd.Series([' jack', 'jill ', ' jesse ', 'frank'])
df = pd.DataFrame(np.random.randn(3, 2), columns=[' Column A ', ' Column B '],
                  index=range(3))

print(s.str.strip())  # 去除字符串中的空格
print(s.str.lstrip())  # 去除字符串中的左空格
print(s.str.rstrip())  # 去除字符串中的右空格

df.columns = df.columns.str.strip()
print(df)
# 这里去掉了columns的前后空格,但没有去掉中间空格

3、 合并 merge、join

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                    'key2': ['K0', 'K1', 'K0', 'K1'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                    'key2': ['K0', 'K0', 'K0', 'K0'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})
print(pd.merge(df1, df2, on='key'))
print('------')
# left:第一个df
# right:第二个df
# on:参考键

print(pd.merge(df3, df4, on=['key1','key2']))
# 多个链接键

-----------------结果----------------
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3
------
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2

# 参数how → 合并方式

print(pd.merge(df3, df4,on=['key1','key2'], how = 'inner'))  
print('------')
# inner:默认,取交集

print(pd.merge(df3, df4, on=['key1','key2'], how = 'outer'))  
print('------')
# outer:取并集,数据缺失范围NaN

print(pd.merge(df3, df4, on=['key1','key2'], how = 'left'))  
print('------')
# left:按照df3为参考合并,数据缺失范围NaN

print(pd.merge(df3, df4, on=['key1','key2'], how = 'right'))  
# right:按照df4为参考合并,数据缺失范围NaN


# 参数 left_on, right_on, left_index, right_index → 当键不为一个列时,可以单独设置左键与右键

df1 = pd.DataFrame({'lkey':list('bbacaab'),
                   'data1':range(7)})
df2 = pd.DataFrame({'rkey':list('abd'),
                   'date2':range(3)})
print(pd.merge(df1, df2, left_on='lkey', right_on='rkey'))
print('------')
# df1以‘lkey’为键,df2以‘rkey’为键

df1 = pd.DataFrame({'key':list('abcdfeg'),
                   'data1':range(7)})
df2 = pd.DataFrame({'date2':range(100,105)},
                  index = list('abcde'))
print(pd.merge(df1, df2, left_on='key', right_index=True))
# df1以‘key’为键,df2以index为键
# left_index:为True时,第一个df以index为键,默认False
# right_index:为True时,第二个df以index为键,默认False

# 所以left_on, right_on, left_index, right_index可以相互组合:
# left_on + right_on, left_on + right_index, left_index + right_on, left_index + right_index

4、join

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
print(left)
print(right)
print(left.join(right))
print(left.join(right, how='outer'))  
print('-----')
# 等价于:pd.merge(left, right, left_index=True, right_index=True, how='outer')

df1 = pd.DataFrame({'key':list('bbacaab'),
                   'data1':[1,3,2,4,5,9,7]})
df2 = pd.DataFrame({'key':list('abd'),
                   'date2':[11,2,33]})
print(df1)
print(df2)
print(pd.merge(df1, df2, left_index=True, right_index=True, suffixes=('_1', '_2')))  
print(df1.join(df2['date2']))
print('-----')
# suffixes=('_x', '_y')默认

left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3'],
                     'key': ['K0', 'K1', 'K0', 'K1']})
right = pd.DataFrame({'C': ['C0', 'C1'],
                      'D': ['D0', 'D1']},
                     index=['K0', 'K1'])
print(left)
print(right)
print(left.join(right, on = 'key'))
# 等价于pd.merge(left, right, left_on='key', right_index=True, how='left', sort=False);
# left的‘key’和right的index

 

上一篇:Pandas高级教程之:Dataframe的合并


下一篇:pandas学习进阶1