pandas入门:处理缺失数据

# pandas使用浮点值NaN(Not a Number)表示浮点和非浮点数组中的缺失数据。
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

# Build a Series of strings in which position 2 holds a missing value (np.nan).
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)
print(string_data)
# Sample output:
# 0     aardvark
# 1    artichoke
# 2          NaN
# 3      avocado
# dtype: object

# isnull() produces a boolean Series that is True wherever a value is missing.
missing_mask = string_data.isnull()
print(missing_mask)
# Sample output:
# 0    False
# 1    False
# 2     True
# 3    False
# dtype: bool

# Python's built-in None is also treated as a missing (NA) value.
string_data[0] = None
missing_mask = string_data.isnull()
print(missing_mask)
# Sample output:
# 0     True
# 1    False
# 2     True
# 3    False
# dtype: bool
  • dropna: 根据各标签的值中是否存在缺失数据对轴标签进行过滤,可通过阈值调节对缺失值的容忍度
  • fillna: 用指定值或插值方法(如ffill或bfill)填充缺失数据
  • isnull: 返回一个含有布尔值的对象,这些布尔值表示哪些值是缺失值NA
  • notnull: isnull的否定式

滤除缺失数据

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA

# Filtering missing values out of a Series.
data = pd.Series([1, NA, 3.5, NA, 7])

# dropna() keeps only the entries that are present, with their original labels.
without_na = data.dropna()
print(without_na)
# Sample output:
# 0    1.0
# 2    3.5
# 4    7.0
# dtype: float64

# notna() is the boolean mask marking which entries are present.
present = data.notna()
print(present)
# Sample output:
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# dtype: bool

# Boolean indexing with that mask selects the same entries as dropna().
print(data[present])
# Sample output:
# 0    1.0
# 2    3.5
# 4    7.0
# dtype: float64

# By default, DataFrame.dropna() discards every row that contains a missing value.
rows = [
    [1, 6.5, 3],
    [1, NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3],
]
data = pd.DataFrame(rows)
print(data)
# Sample output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

print(data.dropna())
# Sample output:
#      0    1    2
# 0  1.0  6.5  3.0

# Passing how='all' drops only the rows whose values are all NA.
print(data.dropna(how='all'))
# Sample output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 3  NaN  6.5  3.0

# To drop columns in the same way, pass axis=1.
# First add a column (label 4) that is entirely NA.
data[4] = NA
print(data)
# Sample output:
#      0    1    2   4
# 0  1.0  6.5  3.0 NaN
# 1  1.0  NaN  NaN NaN
# 2  NaN  NaN  NaN NaN
# 3  NaN  6.5  3.0 NaN

# Every column contains at least one NA, so the default how='any' removes them all.
print(data.dropna(axis=1))
# Sample output:
# Empty DataFrame
# Columns: []
# Index: [0, 1, 2, 3]

# With how='all' only the all-NA column 4 is removed.
print(data.dropna(axis=1, how='all'))
# Sample output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

# A 7x3 frame of random normal values with NA written into the top of
# columns 1 and 2: rows 0-4 of column 1 and rows 0-2 of column 2.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:5, 1] = NA
df.iloc[:3, 2] = NA
print(df)
# Sample output (values are random and differ on every run):
#           0         1         2
# 0  1.034046       NaN       NaN
# 1  0.205577       NaN       NaN
# 2  0.669042       NaN       NaN
# 3 -1.081377       NaN -0.850690
# 4 -0.129405       NaN  2.280089
# 5 -0.720506  0.719188 -0.698185
# 6  1.482302  1.589606  1.712550

# thresh=3 keeps only rows with at least 3 non-NA values, i.e. rows 5 and 6 here.
enough_values = df.dropna(thresh=3)
print(enough_values)
# Sample output:
#           0         1         2
# 5 -0.720506  0.719188 -0.698185
# 6  1.482302  1.589606  1.712550

填充缺失数据

# NOTE: the sample outputs in this section show different column-0 values for
# the same frame; they were apparently captured from separate runs of the
# random data. Only the positions of the filled values are meaningful.

# Replace every missing value with a single constant.
print(df.fillna(value=0))
# Sample output:
#           0         1         2
# 0 -0.044841  0.000000  0.000000
# 1 -0.432459  0.000000  0.000000
# 2  0.036653  0.000000  0.000000
# 3  1.647238  0.000000  0.623209
# 4  0.395201  0.000000  0.216717
# 5 -1.792629  1.167120  1.424606
# 6  1.986463  0.691374  0.361006

# A dict maps column label -> fill value, so each column gets its own replacement.
per_column_fill = {1: 0.5, 2: -1}
print(df.fillna(per_column_fill))
# Sample output:
#           0         1         2
# 0  0.704205  0.500000 -1.000000
# 1 -0.002524  0.500000 -1.000000
# 2  1.241561  0.500000 -1.000000
# 3 -0.340080  0.500000  0.038028
# 4 -0.616660  0.500000 -0.104324
# 5 -0.254113  1.020461  0.596161
# 6 -0.026914 -0.359409 -0.876534

# fillna returns a new object by default; inplace=True modifies the existing
# object instead.
df.fillna(value=0, inplace=True)
print(df)
# Sample output:
#           0         1         2
# 0 -0.187450  0.000000  0.000000
# 1  0.205142  0.000000  0.000000
# 2 -0.032737  0.000000  0.000000
# 3 -1.207977  0.000000 -0.079890
# 4  2.244593  0.000000  0.753733
# 5 -0.775953  0.553931 -0.137147
# 6  0.087671  0.426827  0.272821

# Forward-fill demo: a 6x3 frame of random normal values where the tail of
# column 1 (rows 2-5) and the tail of column 2 (rows 4-5) are set to NA.
df = DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
print(df)
# Sample output (values are random and differ on every run):
#           0         1         2
# 0 -0.040906  0.507198 -0.466641
# 1 -0.231033 -0.741952 -0.443290
# 2  2.194688       NaN  0.672457
# 3  1.002863       NaN -0.338136
# 4 -0.429903       NaN       NaN
# 5 -0.371691       NaN       NaN

# Propagate the last valid value in each column downwards into the gaps.
# ffill() is used because fillna(method='ffill') is deprecated since pandas 2.1.
print(df.ffill())
# Illustrative output for the frame shown above:
#           0         1         2
# 0 -0.040906  0.507198 -0.466641
# 1 -0.231033 -0.741952 -0.443290
# 2  2.194688 -0.741952  0.672457
# 3  1.002863 -0.741952 -0.338136
# 4 -0.429903 -0.741952 -0.338136
# 5 -0.371691 -0.741952 -0.338136

# limit=2 fills at most two consecutive NA values per gap, so the last two
# rows of column 1 remain NA.
print(df.ffill(limit=2))
# Illustrative output for the frame shown above:
#           0         1         2
# 0 -0.040906  0.507198 -0.466641
# 1 -0.231033 -0.741952 -0.443290
# 2  2.194688 -0.741952  0.672457
# 3  1.002863 -0.741952 -0.338136
# 4 -0.429903       NaN -0.338136
# 5 -0.371691       NaN -0.338136
# Fill the gaps in a Series with the mean of its non-missing values
# (1, 3.5 and 7 average to 3.833333, as the sample output shows).
data = pd.Series([1, NA, 3.5, NA, 7])
mean_value = data.mean()
print(data.fillna(mean_value))
# Sample output:
# 0    1.000000
# 1    3.833333
# 2    3.500000
# 3    3.833333
# 4    7.000000
# dtype: float64
上一篇:【二分查找】162. 寻找峰值


下一篇:百度智能小程序框架性能优化实践