目录
1. 字符串离散化
1)获取字符串去重后的列表;
2)构造全为0的数组,其中colums为字符串的列表;
3)遍历,给全为0的数组赋值
#根据一组从2006年到2016年1000部最流行的电影数据,统计电影体裁(genre)的情况
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', 100)
df = pd.read_csv("./IMDB-Movie-Data.csv")
#获取字符串去重后的列表
temp_list = df["Genre"].str.split(",").tolist()
genre_list = list(set([i for j in temp_list for i in j]))
#构造全为0的数组
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),columns=genre_list)
# print(zeros_df)
#按照每个电影的体裁,给数组赋值"1"
for i in range(df.shape[0]):
zeros_df.loc[i,temp_list[i]] = 1
#统计每种体裁的电影数量
genre_count = zeros_df.sum(axis=0)
#排序
genre_count = genre_count.sort_values(ascending=False)
# print("*"*100)
# print(genre_count)
#画图
plt.figure(figsize=(20,8),dpi=80)
_x = genre_count.index
_y = genre_count.values
plt.bar(range(len(_x)),_y)
plt.xticks(range(len(_x)),_x)
plt.savefig("./movie_genre.png")
plt.show()
2. 数据合并
2.1 join
按照index进行分组,默认情况下将行索引相同的数据合并到一起:t1.join(t2)
#测试数据合并 join
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3,3)),index=["A","B","C"],columns=list("xyz"))
temp1 = df1.join(df2)
print(temp1)
temp2 = df2.join(df1)
print(temp2)
'''
输出结果:
a b c d x y z
A 1.0 1.0 1.0 1.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0 1.0 1.0 1.0
****************************************************************************************************
x y z a b c d
A 1.0 1.0 1.0 1.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0 1.0 1.0 1.0
C 1.0 1.0 1.0 NaN NaN NaN NaN
'''
2.2 merge
按照指定的列,将数据以一定的方式合并到一起:t1.merge(t2, on="")
#测试数据合并 merge
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3,3)),index=["A","B","C"],columns=list("xyz"))
df3 = pd.DataFrame(np.arange(9).reshape((3,3)),columns=list("fax"))
temp3 = df1.merge(df3,on="a") #默认为“inner”,内连接
print(temp3) #df1中有两行“a”列的取值都为1,所以合并后有两行
print("*"*100)
df1.loc["A","a"] = 100
temp4 = df1.merge(df3,on="a",how="outer")
print(temp4)
print("*"*100)
temp5 = df1.merge(df3,on="a",how="left")
print(temp5)
print("*"*100)
temp6 = df1.merge(df3,on="a",how="right")
print(temp6)
'''
输出数据:
a b c d f x
0 1.0 1.0 1.0 1.0 0 2
1 1.0 1.0 1.0 1.0 0 2
****************************************************************************************************
a b c d f x
0 100.0 1.0 1.0 1.0 NaN NaN
1 1.0 1.0 1.0 1.0 0.0 2.0
2 4.0 NaN NaN NaN 3.0 5.0
3 7.0 NaN NaN NaN 6.0 8.0
****************************************************************************************************
a b c d f x
0 100.0 1.0 1.0 1.0 NaN NaN
1 1.0 1.0 1.0 1.0 0.0 2.0
****************************************************************************************************
a b c d f x
0 1.0 1.0 1.0 1.0 0 2
1 4.0 NaN NaN NaN 3 5
2 7.0 NaN NaN NaN 6 8
'''
3. 数据分组聚合
df.groupby(by="columns_name")
grouped是一个可迭代的DataFrameGroupBy对象,其中的每一个元素是一个元组:(索引(分组的值),分组之后的DataFrame)
#根据一组关于全球星巴克店铺的统计数据,测试pandas 分组聚合
import pandas as pd
pd.set_option('display.max_columns', 100)
df = pd.read_csv("./starbucks_store_worldwide.csv")
grouped1 = df.groupby(by="Country") #按列分组,DataFrameGroupBy 对象,可迭代
print(grouped1)
#DataFrameGroupBy
#可以进行遍历
for i,j in grouped1:
print(i)
print("-"*50)
print(j)
print("*"*50)
#调用聚合方法
print(grouped1.count())
country_count = grouped1["Brand"].count()
print(country_count["US"])
print(country_count["CN"])
#统计中国每个省份的店铺数量
China_data = df[df["Country"] == "CN"]
grouped2 = China_data.groupby(by="State/Province").count()["Brand"]
print(grouped2)
#按照多个条件进行分组,返回Series
grouped3 = df["Brand"].groupby(by=[df["Country"],df["State/Province"]]).count()
print(grouped3) #Series类型
grouped4 = df.groupby(by=["Country","State/Province"])["Country"].count()
grouped5 = df.groupby(by=["Country","State/Province"]).count()["Brand"]
print(grouped4)
print(grouped5)
#按照多个条件进行分组,返回DataFrame
grouped6 = df[["Brand"]].groupby(by=[df["Country"],df["State/Province"]]).count()
grouped7 = df.groupby(by=[df["Country"],df["State/Province"]])[["Brand"]].count()
grouped8 = df.groupby(by=[df["Country"],df["State/Province"]]).count()[["Brand"]]
print(grouped6)
print(grouped7)
print(grouped8)
4. 数据索引、复合索引
简单的索引操作:
• 获取 index : df.index • 指定 index : df.index = [' x','y '] • 重新设置 index : df.reindex ( list(" abcedf ")) • 指定某一列作为 index : df.set_index (" Country",drop =False) • 返回 index 的唯一值: df.set_index ("Country"). index.unique ()#测试pandas 索引和复合索引
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
df = pd.read_csv("./starbucks_store_worldwide.csv")
grouped = df[["Brand"]].groupby(by=[df["Country"],df["State/Province"]]).count()
# print(grouped)
print(grouped.index)
print("*"*100)
df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df1.loc["A","a"] = 100
df1.index = ["x","y"]
print(df1)
print("*"*100)
df2 = df1.reindex(["x","z"]) #从df1中选取“x”和“z”两行赋值给df2
print(df2)
print("*"*100)
# print(df1)
df3 = df1.set_index("a",drop=False) #drop参数设定是否将指定列从dataframe中删除,默认为True
print(df3)
print("*"*100)
df4 = df1.set_index("b",drop=False).index.unique()
print(df4)
print("*"*100)
df5 = df1.set_index(["a","c"])
print(df5.index)
print("*"*100)
a = pd.DataFrame({'a': range(7),'b': range(7, 0, -1),'c': ['one','one','one','two','two','two', 'two'],'d': list("hjklmno")})
print(a)
print("*"*100)
b = a.set_index(["c","d"])
print(b)
'''
输出结果:
MultiIndex([('AD', '7'),
('AE', 'AJ'),
('AE', 'AZ'),
('AE', 'DU'),
('AE', 'FU'),
('AE', 'RK'),
('AE', 'SH'),
('AE', 'UQ'),
('AR', 'B'),
('AR', 'C'),
...
('US', 'UT'),
('US', 'VA'),
('US', 'VT'),
('US', 'WA'),
('US', 'WI'),
('US', 'WV'),
('US', 'WY'),
('VN', 'HN'),
('VN', 'SG'),
('ZA', 'GT')],
names=['Country', 'State/Province'], length=545)
****************************************************************************************************
a b c d
x 100.0 1.0 1.0 1.0
y 1.0 1.0 1.0 1.0
****************************************************************************************************
a b c d
x 100.0 1.0 1.0 1.0
z NaN NaN NaN NaN
****************************************************************************************************
a b c d
a
100.0 100.0 1.0 1.0 1.0
1.0 1.0 1.0 1.0 1.0
****************************************************************************************************
Float64Index([1.0], dtype='float64', name='b')
****************************************************************************************************
MultiIndex([(100.0, 1.0),
( 1.0, 1.0)],
names=['a', 'c'])
****************************************************************************************************
a b c d
0 0 7 one h
1 1 6 one j
2 2 5 one k
3 3 4 two l
4 4 3 two m
5 5 2 two n
6 6 1 two o
****************************************************************************************************
a b
c d
one h 0 7
j 1 6
k 2 5
two l 3 4
m 4 3
n 5 2
o 6 1
'''
#测试Series复合索引
import pandas as pd
a = pd.DataFrame({'a': range(7),'b': range(7, 0, -1),'c': ['one','one','one','two','two','two', 'two'],'d': list("hjklmno")})
b = a.set_index(["c","d"])
c = b["a"]
print(c["one"])
print("*"*100)
print(c["one"]["j"])
print("*"*100)
print(c["one","j"])
print("*"*100)
d = a.set_index(["d","c"])["a"]
d1 = a.set_index(["d","c"])[["a"]] #DataFrame
print(d)
print("*"*100)
print(d1)
print("*"*100)
e = d.swaplevel() #交换内外侧索引层次,可帮助取内层索引
print(e)
print("*"*100)
print(b.loc["one"].loc["h"])
'''
输出结果:
d
h 0
j 1
k 2
Name: a, dtype: int64
****************************************************************************************************
1
****************************************************************************************************
1
****************************************************************************************************
d c
h one 0
j one 1
k one 2
l two 3
m two 4
n two 5
o two 6
Name: a, dtype: int64
****************************************************************************************************
a
d c
h one 0
j one 1
k one 2
l two 3
m two 4
n two 5
o two 6
****************************************************************************************************
c d
one h 0
j 1
k 2
two l 3
m 4
n 5
o 6
Name: a, dtype: int64
****************************************************************************************************
a 0
b 7
Name: h, dtype: int64
'''