数据分析-Day05 pandas(2)

目录

1. 字符串离散化

2. 数据合并

2.1 join

2.2 merge

3. 数据分组聚合

4. 数据索引、复合索引


1. 字符串离散化

1)获取字符串去重后的列表;

2)构造全为0的数组,其中colums为字符串的列表;

3)遍历,给全为0的数组赋值

#根据一组从2006年到2016年1000部最流行的电影数据,统计电影体裁(genre)的情况

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', 100)

df = pd.read_csv("./IMDB-Movie-Data.csv")

#获取字符串去重后的列表
temp_list = df["Genre"].str.split(",").tolist()
genre_list = list(set([i for j in temp_list for i in j]))

#构造全为0的数组
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),columns=genre_list)
# print(zeros_df)

#按照每个电影的体裁,给数组赋值"1"
for i in range(df.shape[0]):
    zeros_df.loc[i,temp_list[i]] = 1

#统计每种体裁的电影数量
genre_count = zeros_df.sum(axis=0)

#排序
genre_count = genre_count.sort_values(ascending=False)
# print("*"*100)
# print(genre_count)

#画图
plt.figure(figsize=(20,8),dpi=80)

_x = genre_count.index
_y = genre_count.values

plt.bar(range(len(_x)),_y)

plt.xticks(range(len(_x)),_x)

plt.savefig("./movie_genre.png")

plt.show()

数据分析-Day05 pandas(2)

2. 数据合并

2.1 join

按照index进行分组,默认情况下将行索引相同的数据合并到一起:t1.join(t2)

#测试数据合并 join

import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3,3)),index=["A","B","C"],columns=list("xyz"))
temp1 = df1.join(df2)
print(temp1)
temp2 = df2.join(df1)
print(temp2)


'''
输出结果:
a    b    c    d    x    y    z
A  1.0  1.0  1.0  1.0  1.0  1.0  1.0
B  1.0  1.0  1.0  1.0  1.0  1.0  1.0
****************************************************************************************************
     x    y    z    a    b    c    d
A  1.0  1.0  1.0  1.0  1.0  1.0  1.0
B  1.0  1.0  1.0  1.0  1.0  1.0  1.0
C  1.0  1.0  1.0  NaN  NaN  NaN  NaN
'''

2.2 merge

按照指定的列,将数据以一定的方式合并到一起:t1.merge(t2, on="")

#测试数据合并 merge

import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3,3)),index=["A","B","C"],columns=list("xyz"))
df3 = pd.DataFrame(np.arange(9).reshape((3,3)),columns=list("fax"))
temp3 = df1.merge(df3,on="a")     #默认为“inner”,内连接
print(temp3)                      #df1中有两行“a”列的取值都为1,所以合并后有两行
print("*"*100)

df1.loc["A","a"] = 100
temp4 = df1.merge(df3,on="a",how="outer")        
print(temp4)
print("*"*100)

temp5 = df1.merge(df3,on="a",how="left")
print(temp5)
print("*"*100)

temp6 = df1.merge(df3,on="a",how="right")
print(temp6)


'''
输出数据:
a    b    c    d  f  x
0  1.0  1.0  1.0  1.0  0  2
1  1.0  1.0  1.0  1.0  0  2
****************************************************************************************************
       a    b    c    d    f    x
0  100.0  1.0  1.0  1.0  NaN  NaN
1    1.0  1.0  1.0  1.0  0.0  2.0
2    4.0  NaN  NaN  NaN  3.0  5.0
3    7.0  NaN  NaN  NaN  6.0  8.0
****************************************************************************************************
       a    b    c    d    f    x
0  100.0  1.0  1.0  1.0  NaN  NaN
1    1.0  1.0  1.0  1.0  0.0  2.0
****************************************************************************************************
     a    b    c    d  f  x
0  1.0  1.0  1.0  1.0  0  2
1  4.0  NaN  NaN  NaN  3  5
2  7.0  NaN  NaN  NaN  6  8
'''

3. 数据分组聚合

df.groupby(by="columns_name")

grouped是一个可迭代的DataFrameGroupBy对象,其中的每一个元素是一个元组:(索引(分组的值),分组之后的DataFrame)

#根据一组关于全球星巴克店铺的统计数据,测试pandas 分组聚合

import pandas as pd

pd.set_option('display.max_columns', 100)

df = pd.read_csv("./starbucks_store_worldwide.csv")

grouped1 = df.groupby(by="Country")    #按列分组,DataFrameGroupBy 对象,可迭代
print(grouped1)

#DataFrameGroupBy
#可以进行遍历
for i,j in grouped1:
    print(i)
    print("-"*50)
    print(j)
    print("*"*50)

#调用聚合方法
print(grouped1.count())
country_count = grouped1["Brand"].count()
print(country_count["US"])
print(country_count["CN"])

#统计中国每个省份的店铺数量
China_data = df[df["Country"] == "CN"]
grouped2 = China_data.groupby(by="State/Province").count()["Brand"]
print(grouped2)

#按照多个条件进行分组,返回Series
grouped3 = df["Brand"].groupby(by=[df["Country"],df["State/Province"]]).count()
print(grouped3)    #Series类型
grouped4 = df.groupby(by=["Country","State/Province"])["Country"].count()
grouped5 = df.groupby(by=["Country","State/Province"]).count()["Brand"]
print(grouped4)
print(grouped5)

#按照多个条件进行分组,返回DataFrame
grouped6 = df[["Brand"]].groupby(by=[df["Country"],df["State/Province"]]).count()
grouped7 = df.groupby(by=[df["Country"],df["State/Province"]])[["Brand"]].count()
grouped8 = df.groupby(by=[df["Country"],df["State/Province"]]).count()[["Brand"]]
print(grouped6)
print(grouped7)
print(grouped8)

4. 数据索引、复合索引

简单的索引操作:

• 获取 index : df.index • 指定 index : df.index = [' x','y '] • 重新设置 index : df.reindex ( list(" abcedf ")) • 指定某一列作为 index : df.set_index (" Country",drop =False) • 返回 index 的唯一值: df.set_index ("Country"). index.unique ()

#测试pandas 索引和复合索引

import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 100)

df = pd.read_csv("./starbucks_store_worldwide.csv")

grouped = df[["Brand"]].groupby(by=[df["Country"],df["State/Province"]]).count()
# print(grouped)
print(grouped.index)
print("*"*100)

df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df1.loc["A","a"] = 100
df1.index = ["x","y"]
print(df1)
print("*"*100)

df2 = df1.reindex(["x","z"])   #从df1中选取“x”和“z”两行赋值给df2
print(df2)
print("*"*100)
# print(df1)
df3 = df1.set_index("a",drop=False)    #drop参数设定是否将指定列从dataframe中删除,默认为True
print(df3)
print("*"*100)
df4 = df1.set_index("b",drop=False).index.unique()
print(df4)
print("*"*100)
df5 = df1.set_index(["a","c"])
print(df5.index)
print("*"*100)

a = pd.DataFrame({'a': range(7),'b': range(7, 0, -1),'c': ['one','one','one','two','two','two', 'two'],'d': list("hjklmno")})
print(a)
print("*"*100)
b = a.set_index(["c","d"])
print(b)


'''
输出结果:
MultiIndex([('AD',  '7'),
            ('AE', 'AJ'),
            ('AE', 'AZ'),
            ('AE', 'DU'),
            ('AE', 'FU'),
            ('AE', 'RK'),
            ('AE', 'SH'),
            ('AE', 'UQ'),
            ('AR',  'B'),
            ('AR',  'C'),
            ...
            ('US', 'UT'),
            ('US', 'VA'),
            ('US', 'VT'),
            ('US', 'WA'),
            ('US', 'WI'),
            ('US', 'WV'),
            ('US', 'WY'),
            ('VN', 'HN'),
            ('VN', 'SG'),
            ('ZA', 'GT')],
           names=['Country', 'State/Province'], length=545)
****************************************************************************************************
       a    b    c    d
x  100.0  1.0  1.0  1.0
y    1.0  1.0  1.0  1.0
****************************************************************************************************
       a    b    c    d
x  100.0  1.0  1.0  1.0
z    NaN  NaN  NaN  NaN
****************************************************************************************************
           a    b    c    d
a                          
100.0  100.0  1.0  1.0  1.0
1.0      1.0  1.0  1.0  1.0
****************************************************************************************************
Float64Index([1.0], dtype='float64', name='b')
****************************************************************************************************
MultiIndex([(100.0, 1.0),
            (  1.0, 1.0)],
           names=['a', 'c'])
****************************************************************************************************
   a  b    c  d
0  0  7  one  h
1  1  6  one  j
2  2  5  one  k
3  3  4  two  l
4  4  3  two  m
5  5  2  two  n
6  6  1  two  o
****************************************************************************************************
       a  b
c   d      
one h  0  7
    j  1  6
    k  2  5
two l  3  4
    m  4  3
    n  5  2
    o  6  1
'''
#测试Series复合索引

import pandas as pd

a = pd.DataFrame({'a': range(7),'b': range(7, 0, -1),'c': ['one','one','one','two','two','two', 'two'],'d': list("hjklmno")})
b = a.set_index(["c","d"])
c = b["a"]
print(c["one"])
print("*"*100)
print(c["one"]["j"])
print("*"*100)
print(c["one","j"])
print("*"*100)

d = a.set_index(["d","c"])["a"]
d1 = a.set_index(["d","c"])[["a"]]    #DataFrame
print(d)
print("*"*100)
print(d1)
print("*"*100)

e = d.swaplevel()       #交换内外侧索引层次,可帮助取内层索引
print(e)
print("*"*100)

print(b.loc["one"].loc["h"])


'''
输出结果:
d
h    0
j    1
k    2
Name: a, dtype: int64
****************************************************************************************************
1
****************************************************************************************************
1
****************************************************************************************************
d  c  
h  one    0
j  one    1
k  one    2
l  two    3
m  two    4
n  two    5
o  two    6
Name: a, dtype: int64
****************************************************************************************************
       a
d c     
h one  0
j one  1
k one  2
l two  3
m two  4
n two  5
o two  6
****************************************************************************************************
c    d
one  h    0
     j    1
     k    2
two  l    3
     m    4
     n    5
     o    6
Name: a, dtype: int64
****************************************************************************************************
a    0
b    7
Name: h, dtype: int64
'''

上一篇:跨域问题总结


下一篇:ios中时间转换时间戳NAN问题-小程序