数据清洗和准备
二、数据转换
移除重复数据
# Seven rows; the last row repeats the (k1, k2) pair of the row before it.
data = pd.DataFrame(
    {
        'k1': ['one', 'two'] * 3 + ['two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data
Out:
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
# Flag duplicates: returns a boolean Series that is True for each row
# repeating an earlier row (here only row 6, which repeats row 5).
data.duplicated()
Out:
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
# Drop duplicate rows. The first occurrence is kept, and the result is a
# new frame; `data` itself still has all seven rows afterwards.
data.drop_duplicates()
Out:
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
# Add a column v1 holding 0..6 so that every row becomes distinct as a whole.
data['v1'] = range(7)
data
Out:
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
5 two 4 5
6 two 4 6
data.drop_duplicates(['k1']) # de-duplicate on column k1 only; the first row for each k1 value is kept
Out:
k1 k2 v1
0 one 1 0
1 two 1 1
# De-duplicate on column k2 only; keep='last' keeps the last row for each k2 value.
data.drop_duplicates(['k2'],keep='last')
Out:
k1 k2 v1
1 two 1 1
2 one 2 2
4 one 3 4
6 two 4 6
# Rebuild the two-column frame from scratch, without the v1 column added earlier.
data = pd.DataFrame(
    {
        'k1': ['one', 'two'] * 3 + ['two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data
Out:
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
# Whole-row de-duplication keeping the last occurrence: row 6 survives instead of row 5.
data.drop_duplicates(keep='last')
Out:
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
6 two 4
利用函数或映射进行数据转换
# Food names in mixed case, each with a price (ints mixed with one float).
data = pd.DataFrame(
    {
        'food': ['Apple', 'banana', 'orange', 'apple', 'Mango', 'tomato'],
        'price': [4, 3, 3.5, 6, 12, 3],
    }
)
data
Out:
food price
0 Apple 4.0
1 banana 3.0
2 orange 3.5
3 apple 6.0
4 Mango 12.0
5 tomato 3.0
# Lookup table from lower-cased food name to its category.
# NOTE: the name `meat` is kept because later cells reference it, even
# though the table holds fruit/vegetable categories.
meat = {
    'apple': 'fruit',
    'banana': 'fruit',
    'orange': 'fruit',
    'mango': 'fruit',
    'tomato': 'vegetables',
}
# Lower-case the food names so they match the (all lower-case) keys of `meat`.
low = data['food'].str.lower()
low
Out:
0 apple
1 banana
2 orange
3 apple
4 mango
5 tomato
Name: food, dtype: object
# Look up each lower-cased name in `meat` and store the category as column 'class'.
data['class'] = low.map(meat)
data
Out:
food price class
0 Apple 4.0 fruit
1 banana 3.0 fruit
2 orange 3.5 fruit
3 apple 6.0 fruit
4 Mango 12.0 fruit
5 tomato 3.0 vegetables
# Same mapping in a single step: the lambda lower-cases each name and indexes
# `meat` directly. Unlike passing the dict to map(), indexing raises KeyError
# for a name that is not a key of `meat`.
data['class1'] = data['food'].map(lambda x:meat[x.lower()])
data
Out:
food price class class1
0 Apple 4.0 fruit fruit
1 banana 3.0 fruit fruit
2 orange 3.5 fruit fruit
3 apple 6.0 fruit fruit
4 Mango 12.0 fruit fruit
5 tomato 3.0 vegetables vegetables
# Repeats the one-step lambda mapping from the previous cell (class1 is overwritten).
data['class1'] = data['food'].map(lambda x: meat[x.lower()])
data
Out:
food price class class1
0 Apple 4.0 fruit fruit
1 banana 3.0 fruit fruit
2 orange 3.5 fruit fruit
3 apple 6.0 fruit fruit
4 Mango 12.0 fruit fruit
5 tomato 3.0 vegetables vegetables
替换值
# Integer Series; -999 and -1000 are the values the replace() cells below substitute.
data = pd.Series([1, -999, 2, -1000, 3])
data
Out:
0 1
1 -999
2 2
3 -1000
4 3
dtype: int64
# Replace one value: -999 becomes NaN (the result is shown as float64).
data.replace(-999,np.nan)
Out:
0 1.0
1 NaN
2 2.0
3 -1000.0
4 3.0
dtype: float64
data.replace([-999,-1000],np.nan) # replace several values with the same replacement
Out:
0 1.0
1 NaN
2 2.0
3 NaN
4 3.0
dtype: float64
# Two lists pair up element-wise: -999 -> NaN, -1000 -> 0.
data1 = data.replace([-999,-1000],[np.nan,0]) # replace returns a new object; `data` is left unchanged
data1
Out:
0 1.0
1 NaN
2 2.0
3 0.0
4 3.0
dtype: float64
# Dict form of the same replacement: each key is replaced by its value.
data.replace({-999:np.nan,-1000:0})
Out:
0 1.0
1 NaN
2 2.0
3 0.0
4 3.0
dtype: float64
重命名索引
# 3x4 frame of the integers 0..11, rows labelled by city, columns by ordinal word.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['BeiJing', 'Tokyo', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data
Out:
one two three four
BeiJing 0 1 2 3
Tokyo 4 5 6 7
New York 8 9 10 11
# Reindex: conform the frame to a new list of row labels.
# reindex does NOT rename labels. It selects rows by label, so labels that
# are absent from the current index ('a', 'b', 'c' here) yield all-NaN rows.
data.reindex(['a', 'b', 'c'])
Out:
one two three four
a NaN NaN NaN NaN
b NaN NaN NaN NaN
c NaN NaN NaN NaN
# `data` itself is unchanged: reindex returned a new frame.
data
Out:
one two three four
BeiJing 0 1 2 3
Tokyo 4 5 6 7
New York 8 9 10 11
# Upper-casing helper used to shorten the index labels.
def tran(x):
    """Return the first four characters of *x*, upper-cased."""
    return x[:4].upper()
# Apply `tran` to every index label; the transformed labels come back as a new Index.
data.index.map(tran)
Out:
Index(['BEIJ', 'TOKY', 'NEW '], dtype='object')
# Assign the transformed labels back so the frame's index is actually changed.
data.index = data.index.map(tran)
data
Out:
one two three four
BEIJ 0 1 2 3
TOKY 4 5 6 7
NEW 8 9 10 11
# rename with functions: str.title is applied to every index label and
# str.upper to every column name. A new frame is returned; `data` keeps
# its labels.
data.rename(index=str.title,columns = str.upper)
Out:
ONE TWO THREE FOUR
Beij 0 1 2 3
Toky 4 5 6 7
New 8 9 10 11
# rename with dicts: only the labels listed as keys are changed, the rest are kept.
data.rename(index={'TOKY':'东京'},columns={'three':'第三年'})
Out:
one two 第三年 four
BEIJ 0 1 2 3
东京 4 5 6 7
NEW 8 9 10 11
# inplace=True applies the rename to `data` itself instead of returning a renamed copy.
data.rename(index={'TOKY':'东京'},columns={'three':'第三年'},inplace = True)
data
Out:
one two 第三年 four
BEIJ 0 1 2 3
东京 4 5 6 7
NEW 8 9 10 11