用特定于分组的值填充缺失值
用平均值填充 NaN
# Six random draws; positions 0, 2 and 4 are then overwritten with NaN.
s = pd.Series(np.random.randn(6))
s.iloc[::2] = np.nan
s
0 NaN
1 -0.118174
2 NaN
3 -0.308161
4 NaN
5 -0.655565
dtype: float64
# Replace every NaN with the mean of the values that are present.
s.fillna(value=s.mean())
# Eight state names; the first four are labelled East and the last four West.
stats = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]
group_key = ['East' for _ in range(4)] + ['West' for _ in range(4)]
# Random values indexed by state, with three states blanked out as NaN.
data = pd.Series(np.random.randn(8), index=stats)
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan
data
Ohio 0.345378
New York 1.538009
Vermont NaN
Florida 0.113972
Oregon 1.618781
Nevada NaN
California 0.728396
Idaho NaN
dtype: float64
其中group_key是
group_key
['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']
# Mean of the non-missing values within each region label.
data.groupby(by=group_key).mean()
East 0.665786
West 1.173589
dtype: float64
def fill_mean(g):
    """Return group ``g`` with its NaN entries replaced by the group's own mean."""
    return g.fillna(g.mean())


# group_keys=False keeps the result indexed by state only. Without it,
# pandas 2.0+ prepends the East/West label as an extra index level for a
# like-indexed (transform-style) apply such as this one.
data.groupby(group_key, group_keys=False).apply(fill_mean)
Ohio 0.345378
New York 1.538009
Vermont 0.665786
Florida 0.113972
Oregon 1.618781
Nevada 1.173589
California 0.728396
Idaho 1.173589
dtype: float64
也可自定义字典作为填充值
# Preset replacement value for each region label.
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Return group ``g`` with NaN replaced by the preset value for its label.

    ``g.name`` is the group label that ``groupby(...).apply`` attaches to each
    group it passes in; it is used as the key into ``fill_values``.
    """
    return g.fillna(fill_values[g.name])


# group_keys=False keeps the result indexed by state only. Without it,
# pandas 2.0+ prepends the East/West label as an extra index level for a
# like-indexed (transform-style) apply such as this one.
data.groupby(group_key, group_keys=False).apply(fill_func)
Ohio 0.345378
New York 1.538009
Vermont 0.500000
Florida 0.113972
Oregon 1.618781
Nevada -1.000000
California 0.728396
Idaho -1.000000
dtype: float64
# A range cannot be concatenated to a list with "+" (TypeError), so it is
# expanded into the list literal instead.
# Values 1-10 followed by three more 10s, repeated four times (52 entries).
card_val = [*range(1, 11), 10, 10, 10] * 4
card_val
# Card labels: 'A', the numbers 2-10, then 'J', 'K', 'Q'.
base_names = ['A', *range(2, 11), 'J', 'K', 'Q']
base_names
['A', 2, 3, 4, 5, 6, 7, 8, 9, 10, 'J', 'K', 'Q']
未完待续