Python Data Analysis - 06: Going Deeper with DataFrame

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

#Renaming a DataFrame's index

df1 = DataFrame(np.arange(9).reshape(3,3),index=["BJ","SH","GZ"],columns=["A","B","C"])
# print(df1)
# A B C
# BJ 0 1 2
# SH 3 4 5
# GZ 6 7 8

#Method 1:
# df1.index = Series(["bj","sh","gz"])
# print(df1)
# A B C
# bj 0 1 2
# sh 3 4 5
# gz 6 7 8

#Method 2:
# df1.index = df1.index.map(str.upper)
# print(df1)
# A B C
# BJ 0 1 2
# SH 3 4 5
# GZ 6 7 8

#Method 3:
# print(df1.rename(index=str.lower,columns=str.lower))
# a b c
# bj 0 1 2
# sh 3 4 5
# gz 6 7 8

# print(df1)
# A B C
# BJ 0 1 2
# SH 3 4 5
# GZ 6 7 8
# print(df1.rename(index={"BJ":"beijing"},columns={"A":"a"}))
# a B C
# beijing 0 1 2
# SH 3 4 5
# GZ 6 7 8


#Usage of the map function:
# list1 = [1,2,3,4]
# list2 = ['1','2','3','4']
# print([str(x) for x in list1])
# #['1', '2', '3', '4']
# print(list(map(str,list1)))
# #['1', '2', '3', '4']


#Example:
# def test_map(x):
#     return x + "_ABC"
#
# print(df1.index.map(test_map))
# #Index(['BJ_ABC', 'SH_ABC', 'GZ_ABC'], dtype='object')
# print(df1.rename(index=test_map))
# A B C
# BJ_ABC 0 1 2
# SH_ABC 3 4 5
# GZ_ABC 6 7 8


#------------------------
#merge operations on DataFrames
# df1 = DataFrame({"key":["X","Y","Z"],"data_set_1":[1,2,3]})
# print(df1)
# # key data_set_1
# # 0 X 1
# # 1 Y 2
# # 2 Z 3
# df2 = DataFrame({"key":["A","B","C"],"data_set_1":[4,5,6]})
# print(df2)
# # key data_set_2
# # 0 A 4
# # 1 B 5
# # 2 C 6
# print(pd.merge(df1,df2))#no common key values, so nothing can be merged
# # Empty DataFrame
# # Columns: [key, data_set_1, data_set_2]
# # Index: []


#Change the values so the keys overlap
# df1 = DataFrame({"key":["X","Y","Z"],"data_set_1":[1,2,3]})
# df2 = DataFrame({"key":["X","B","C"],"data_set_2":[1,2,3]})
# print(pd.merge(df1,df2))
# key data_set_1 data_set_2
# 0 X 1 1

"""
#Multiple rows share the same key
df1 = DataFrame({"key":["X","Y","Z","X"],"data_set_1":[1,2,3,4]})
df2 = DataFrame({"key":["X","B","C"],"data_set_2":[1,2,3]})
print(pd.merge(df1,df2))
# key data_set_1 data_set_2
# 0 X 1 1
# 1 X 4 1

df1 = DataFrame({"key":["X","Y","Z","X"],"data_set_1":[1,2,3,4]})
df2 = DataFrame({"key":["X","B","C"],"data_set_2":[1,2,3]})
print(pd.merge(df1,df2,on=None))
# key data_set_1 data_set_2
# 0 X 1 1
# 1 X 4 1

df1 = DataFrame({"key":["X","Y","Z","X"],"data_set_1":[1,2,3,4]})
df2 = DataFrame({"key":["X","B","C"],"data_set_2":[1,2,3]})
print(pd.merge(df1,df2,on="key"))
# key data_set_1 data_set_2
# 0 X 1 1
# 1 X 4 1

df1 = DataFrame({"key":["X","Y","Z","X"],"data_set_1":[1,2,3,4]})
df2 = DataFrame({"key":["X","B","C"],"data_set_2":[1,2,3]})
print(pd.merge(df1,df2,on="key",how='inner'))
# key data_set_1 data_set_2
# 0 X 1 1
# 1 X 4 1

df1 = DataFrame({"key":["X","Y","Z","X"],"data_set_1":[1,2,3,4]})
df2 = DataFrame({"key":["X","B","C"],"data_set_2":[1,2,3]})
print(pd.merge(df1,df2,on="key",how='left'))
# key data_set_1 data_set_2
# 0 X 1 1.0
# 1 Y 2 NaN
# 2 Z 3 NaN
# 3 X 4 1.0

df1 = DataFrame({"key":["X","Y","Z","X"],"data_set_1":[1,2,3,4]})
df2 = DataFrame({"key":["X","B","C"],"data_set_2":[1,2,3]})
print(pd.merge(df1,df2,on="key",how='right'))
# key data_set_1 data_set_2
# 0 X 1.0 1
# 1 X 4.0 1
# 2 B NaN 2
# 3 C NaN 3

df1 = DataFrame({"key":["X","Y","Z","X"],"data_set_1":[1,2,3,4]})
df2 = DataFrame({"key":["X","B","C"],"data_set_2":[1,2,3]})
print(pd.merge(df1,df2,on="key",how='outer'))
# key data_set_1 data_set_2
# 0 X 1.0 1.0
# 1 X 4.0 1.0
# 2 Y 2.0 NaN
# 3 Z 3.0 NaN
# 4 B NaN 2.0
# 5 C NaN 3.0
"""


#------------------
#Concatenate and Combine
# arr1 = np.arange(9).reshape(3,3)
# print(arr1)
# # [[0 1 2]
# # [3 4 5]
# # [6 7 8]]
# arr2 = np.arange(9).reshape(3,3)
# print(arr2)
# # [[0 1 2]
# # [3 4 5]
# # [6 7 8]]
# print(np.concatenate([arr1,arr2]))
# # [[0 1 2]
# # [3 4 5]
# # [6 7 8]
# # [0 1 2]
# # [3 4 5]
# # [6 7 8]]
# print(np.concatenate([arr1,arr2],axis=1))
# # [[0 1 2 0 1 2]
# # [3 4 5 3 4 5]
# # [6 7 8 6 7 8]]

"""
s1 = Series([1,2,3],index=["X","Y","Z"])
s2 = Series([4,5],index=["A","B"])
print(s1)
# X 1
# Y 2
# Z 3
# dtype: int64
print(s2)
# A 4
# B 5
# dtype: int64
print(pd.concat([s1,s2]))
# X 1
# Y 2
# Z 3
# A 4
# B 5
# dtype: int64
print(pd.concat([s1,s2],axis=1))
# 0 1
# A NaN 4.0
# B NaN 5.0
# X 1.0 NaN
# Y 2.0 NaN
# Z 3.0 NaN
"""
"""
df1 = DataFrame(np.random.rand(4,3),columns=["X","Y","Z"])
print(df1)
# X Y Z
# 0 0.450571 0.778564 0.671383
# 1 0.409462 0.186622 0.646048
# 2 0.591728 0.611530 0.255930
# 3 0.075048 0.456145 0.115155
df2 = DataFrame(np.random.rand(3,3),columns=["X","Y","A"])
print(df2)
# X Y A
# 0 0.172247 0.668018 0.455111
# 1 0.080489 0.444335 0.989141
# 2 0.137628 0.947074 0.071194
print(pd.concat([df1,df2]))#values come from np.random.rand, so each run prints different numbers
# A X Y Z
# 0 NaN 0.578878 0.554048 0.500013
# 1 NaN 0.674911 0.961550 0.406136
# 2 NaN 0.120490 0.588172 0.626855
# 3 NaN 0.447513 0.080341 0.221906
# 0 0.921862 0.444732 0.947915 NaN
# 1 0.550756 0.970955 0.244399 NaN
# 2 0.082333 0.231270 0.937708 NaN
"""

#--------
#Combine
"""
s1 = Series([2,np.nan,4,np.nan],index=["A","B","C","D"])
print(s1)
# A 2.0
# B NaN
# C 4.0
# D NaN
# dtype: float64
s2 = Series([1,2,3,4],index=["A","B","C","D"])
print(s2)
# A 1
# B 2
# C 3
# D 4
# dtype: int64
print(s1.combine_first(s2))#fill the NaN values in s1 with the corresponding values from s2
# A 2.0
# B 2.0
# C 4.0
# D 4.0
# dtype: float64
"""

"""
df1 = DataFrame({
    "X":[1,np.nan,3,np.nan],
    "Y":[5,np.nan,7,np.nan],
    "Z":[9,np.nan,11,np.nan]
})
print(df1)
# X Y Z
# 0 1.0 5.0 9.0
# 1 NaN NaN NaN
# 2 3.0 7.0 11.0
# 3 NaN NaN NaN

df2 = DataFrame({
    "Z":[np.nan,10,np.nan,12],
    "A":[1,2,3,4]
})
print(df2)
# Z A
# 0 NaN 1
# 1 10.0 2
# 2 NaN 3
# 3 12.0 4
print(df1.combine_first(df2))
# A X Y Z
# 0 1.0 1.0 5.0 9.0
# 1 2.0 NaN NaN 10.0
# 2 3.0 3.0 7.0 11.0
# 3 4.0 NaN NaN 12.0
"""

#--------------------
#Data preprocessing with apply
#Suppose we have a file "apply_demp.csv"
# df = pd.read_csv("apply_demp.csv")
# print(df.size)#7978
# s1 = Series(["a"]*7978)
# df["A"] = s1
# print(df)#a new column A has been added, with every value set to 'a'
# df["A"] = df["A"].apply(str.upper)#convert column A to uppercase
# print(df)

# print(df["data"][0])
# #'Symbol:APPL Seqno:0 Price:1623'
# l1 = df["data"][0].strip().split("")
# print(l1)
# #['Symbol:','APPL','Seqno:','0','Price:','1623']
#
# print(l1[1],l1[3],l1[5])#要获取这三列数据,赋值
# #('APPL','0','1623')
#
# def foo(line):
# items = line.strip().split("")
# return Series(items[1],items[3],items[5])
#
# df_tmp = df["data"].apply(foo)
# df_tmp = df_tmp.rename(columns={0:"Symbol",1:"Seqno",2:"Price"})
# df_new = df.combine_first(df_tmp)
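
#The snippets above are commented out and depend on "apply_demp.csv", which may not be at hand.
#Below is a minimal runnable sketch of the same apply-based parsing; the tiny in-memory
#DataFrame (its column names and sample rows) is a hypothetical stand-in for that file.
demo = DataFrame({
    "time": [1473411962, 1473411963],
    "data": ["Symbol: APPL Seqno: 0 Price: 1623",
             "Symbol: APPL Seqno: 1 Price: 1649"],
})

def parse_line(line):
    #split on spaces and keep the value that follows each label
    items = line.strip().split(" ")
    return Series([items[1], items[3], items[5]])

df_tmp = demo["data"].apply(parse_line)                               #a DataFrame with columns 0, 1, 2
df_tmp = df_tmp.rename(columns={0: "Symbol", 1: "Seqno", 2: "Price"})
df_new = demo.combine_first(df_tmp)                                   #merge the parsed columns back in
print(df_new)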


#----------------
#Data cleaning: removing duplicate rows
#"demo_duplicate.csv"
df = pd.read_csv("demo_duplicate.csv")
print(df)
# Price Seqno Symbol time
# 0 1623.0 0.0 APPL 1473411962
# 1 ............
print(df.size)#15956
print(len(df))#3989
print(len(df["Seqno"].unique()))#1000
print(df["Seqno"].duplicated())#会按照顺序表示布尔值,第一个为False,后面重复全是|True
print(df["Seqno"].drop_duplicates())#删掉所有重复数据
print(df.drop_duplicates(["Seqno"]))#类似
print(df.drop_duplicates(["Seqno"],keep="last"))#表示去重取值最后的数据