pandas: a Python 3 library for data analysis and processing

pandas provides ready-made functions for reading, preprocessing, and analyzing data.
The pandas library is built on top of numpy; after installing numpy, install pandas from the command line: pip install pandas.

For background on numpy, see
http://blog.csdn.net/cymy001/article/details/78163468

The text formats pandas typically reads are .txt, .csv, and .json.
The data types pandas assigns to columns:
(1) object for string values, (2) int for integers, (3) float for floating-point numbers, (4) datetime for time values, (5) bool for Boolean values.
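As a quick, self-contained sketch (the tiny DataFrame below is made up and independent of the datasets used later), the dtypes attribute shows which of these types pandas assigned to each column:

    import pandas as pd

    demo = pd.DataFrame({
        'name': ['a', 'b'],                                    #object (string) column
        'count': [1, 2],                                       #int64 column
        'ratio': [0.5, 1.5],                                   #float64 column
        'when': pd.to_datetime(['2017-01-01', '2017-01-02']),  #datetime64[ns] column
        'flag': [True, False],                                 #bool column
    })
    print(demo.dtypes)

The walk-through below reads a nutrition dataset, food_info.csv, and works through these operations step by step.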

    #Python pandas introduction
    #Loading the dataset (the commented-out csv version is shown for comparison)
    #import csv
    #food_info=csv.reader('D:\PYTHON35\idle\database\pandas\food_info.csv')
    #print(type(food_info))
    ##<class '_csv.reader'>
    
    import pandas as pd
    import os
    food_info_site = r"D:\PYTHON35\idle\database\pandas\food_info.csv"
    pwd = os.getcwd()  #get the current working directory
    os.chdir(os.path.dirname(food_info_site))
    #os.chdir switches the working directory; os.path.dirname returns the directory part of a path
    food_info = pd.read_csv(os.path.basename(food_info_site)) #read_csv accepts a file name relative to the working directory (a full path works just as well)
    #os.path.basename returns the final component of a path, here the file name
    print(type(food_info))
    #<class 'pandas.core.frame.DataFrame'> -- read_csv returns a DataFrame
    print(food_info.dtypes)  #dtypes shows the data type of each column; all values in a column share one type
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    first_rows=food_info.head(3) #head returns the first few rows; the default is 5
    print(first_rows)
    print(food_info.columns) #the columns attribute lists the column (feature) names
    print(food_info.shape) #the shape attribute gives the dimensions: number of rows and columns
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    print(food_info.loc[0]) #loc selects rows by index label; row 0 is the first data row (the header line is not counted)
    #the pandas object dtype corresponds to Python's str type
    print(food_info.loc[6]) #prints the row labelled 6; the label must exist in the index
    print(food_info.loc[3:6])  #prints several rows -- labels 3, 4, 5 and 6 (loc slices include both endpoints)
    two_five_ten=[2,5,10]
    print(food_info.loc[two_five_ten])  #selects and prints rows 2, 5 and 10
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    ndb_col=food_info['NDB_No']  #indexing with a column name
    print(ndb_col)  #selects a single column of the dataset
    print('_________________________________________')
    columns=['Zinc_(mg)','Copper_(mg)']   #selecting several columns at once
    zinc_copper=food_info[columns]
    print(zinc_copper)
    #print(food_info[['Zinc_(mg)','Copper_(mg)']])
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Find the columns whose unit is (g)
    print(food_info.columns)
    print(food_info.head(2))
    col_names=food_info.columns.tolist() #convert the column-name index to a plain list
    print(col_names)
    gram_columns=[] #collect the columns measured in grams; start with an empty list
    for c in col_names:
        if c.endswith('(g)'):
            gram_columns.append(c)
    gram_df=food_info[gram_columns] #select the columns listed in gram_columns
    print(gram_df.head(3))
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Convert columns measured in (mg) to (g) by arithmetic on whole columns
    div_1000=food_info['Iron_(mg)']/1000  #element-wise transformation of the Iron_(mg) column: divide every value by 1000
    add_100=food_info['Iron_(mg)']+100
    sub_100=food_info['Iron_(mg)']-100
    mult_2=food_info['Iron_(mg)']*2
    
    water_energy=food_info['Water_(g)']*food_info['Energ_Kcal']  #operations on two columns act element-wise on matching positions
    iron_grams=food_info['Iron_(mg)']/1000
    food_info['Iron_(g)']=iron_grams  #assigning to a new column name, Iron_(g), adds that column to the DataFrame
    
    weighted_protein=food_info['Protein_(g)']*2
    weighted_fat=-0.75*food_info['Lipid_Tot_(g)']
    initial_rating=weighted_protein+weighted_fat  #algebraic combination of two columns
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Normalize each column by its own maximum
    max_calories=food_info['Energ_Kcal'].max()
    normalized_calories=food_info['Energ_Kcal']/max_calories  #divide by the column maximum
    normalized_protein=food_info['Protein_(g)']/food_info['Protein_(g)'].max()
    food_info['Normalized_Protein']=normalized_protein  #store the normalized values as a new column
    normalized_fat=food_info['Lipid_Tot_(g)']/food_info['Lipid_Tot_(g)'].max()
    food_info['Normalized_Fat']=normalized_fat
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Sorting the data
    print(food_info['Sodium_(mg)'])
    print('_________________________________________')
    food_info.sort_values('Sodium_(mg)',inplace=True)
    #sort_values sorts ascending by default; inplace=True sorts the DataFrame in place instead of returning a new one
    print(food_info['Sodium_(mg)'])
    print('_________________________________________')
    food_info.sort_values('Sodium_(mg)',inplace=True,ascending=False) #the ascending parameter controls the sort direction
    print(food_info['Sodium_(mg)'])
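    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Supplementary sketch (not part of the original walk-through): sort_values can also
    #sort by several columns at once, and na_position controls where missing values end up
    #(the default is 'last').
    multi_sorted=food_info.sort_values(['Sodium_(mg)','Energ_Kcal'],ascending=[True,False],na_position='last')
    print(multi_sorted[['Sodium_(mg)','Energ_Kcal']].head())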

    import pandas as pd
    import numpy as np
    import os
    titanic_survival_site = r"D:\PYTHON35\idle\database\pandas\titanic_train.csv"
    pwd = os.getcwd()  
    os.chdir(os.path.dirname(titanic_survival_site))
    titanic_survival = pd.read_csv(os.path.basename(titanic_survival_site)) 
    print(type(titanic_survival))
    print(titanic_survival.head())
    #<class 'pandas.core.frame.DataFrame'>
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Preprocessing missing values in the dataset
    age=titanic_survival['Age']
    print(age.loc[0:10])  #missing values show up as NaN
    age_is_null=pd.isnull(age)  #pd.isnull tests for missing values and returns a Boolean Series
    print(age_is_null) #True means the value is missing, False means it is present
    age_null_true=age[age_is_null] #select the entries of age where age_is_null is True, i.e. the missing values
    print(len(age_null_true))
    print('_________________________________________')
    mean_age=sum(titanic_survival['Age'])/len(titanic_survival['Age'])
    print(mean_age) #computed naively this prints nan, because the column contains missing values
    good_ages=titanic_survival['Age'][age_is_null==False] #keep only the non-missing ages
    print(good_ages)
    correct_mean_age=sum(good_ages)/len(good_ages)
    print(correct_mean_age)
    print('_________________________________________')
    #pandas provides a built-in method that skips missing values automatically
    correct_mean_age=titanic_survival['Age'].mean() #mean ignores missing values by default
    print(correct_mean_age)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Average ticket fare for each passenger class
    passenger_classes=[1,2,3]
    fares_by_class={}
    for this_class in passenger_classes:
        pclass_rows=titanic_survival[titanic_survival['Pclass']==this_class] #rows for passengers in this class
        pclass_fares=pclass_rows['Fare'] #their ticket fares
        fare_for_class=pclass_fares.mean()
        fares_by_class[this_class]=fare_for_class
    print(fares_by_class)
    print('_________________________________________')
    #Relate two quantities; aggfunc specifies how, and its default is the mean
    passenger_survival=titanic_survival.pivot_table(index='Pclass',values='Survived',aggfunc=np.mean)
    #pivot_table builds a pivot table relating the index column to the values column
    print(passenger_survival)
    #to relate one quantity to several others, pass a list to values
    port_stats=titanic_survival.pivot_table(index='Embarked',values=['Fare','Survived'],aggfunc=np.sum)
    print(port_stats)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Dropping data that contains missing values
    drop_na_columns=titanic_survival.dropna(axis=1) #dropna with axis=1 drops every column that contains at least one missing value
    new_titanic_survival=titanic_survival.dropna(axis=0,subset=['Age','Sex']) #axis=0 with subset drops the rows whose Age or Sex is missing
    print(new_titanic_survival)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    row_index_83_age=titanic_survival.loc[83,'Age']  #loc can also look up a single cell: row label 83, column 'Age'
    print(row_index_83_age)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #After sorting by a column, reset_index renumbers the index
    new_titanic_survival=titanic_survival.sort_values('Age',ascending=False)
    print(new_titanic_survival[0:10])
    titanic_reindex=new_titanic_survival.reset_index(drop=True)  #re-index after sorting by Age in descending order; drop=True discards the old index
    print(titanic_reindex.iloc[0:10])
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Applying custom functions to a DataFrame
    def hundredth_row(column):
        hundredth_item=column.iloc[99]
        return hundredth_item
    hundredth_row=titanic_survival.apply(hundredth_row)  #apply calls the custom function on each column (the default axis) and collects the results
    print(hundredth_row)
    print('_________________________________________')
    def not_null_count(column):
        column_null=pd.isnull(column)
        null=column[column_null]
        return len(null)
    column_null_count=titanic_survival.apply(not_null_count)
    print(column_null_count)
    print('_________________________________________')
    def which_class(row):
        pclass=row['Pclass']
        if pd.isnull(pclass):
            return 'Unknown'
        elif pclass==1:
            return 'First Class'
        elif pclass==2:
            return 'Second Class'
        elif pclass==3:
            return 'Third Class'
    classes=titanic_survival.apply(which_class,axis=1)  #with axis=1 the function receives each row instead of each column
    print(classes)
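    print('_________________________________________')
    #Supplementary sketch (not part of the original walk-through): a row-wise apply can
    #also bucket a numeric column into labels, which then feed into pivot_table.
    def age_label(row):
        age=row['Age']
        if pd.isnull(age):
            return 'unknown'
        elif age<18:
            return 'minor'
        else:
            return 'adult'
    titanic_survival['Age_labels']=titanic_survival.apply(age_label,axis=1)
    print(titanic_survival.pivot_table(index='Age_labels',values='Survived',aggfunc=np.mean))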

**The three main pandas data structures**:  
**Series**: a one-dimensional collection of values (data elements); supports the float, int, bool,
datetime, timedelta, category, and object types  
**DataFrame**: a collection of Series  
**Panel**: a collection of DataFrames
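A minimal construction sketch (with made-up film names and scores, using only the standard pandas API); note that Panel was later deprecated and removed from pandas, with dicts of DataFrames or a MultiIndex DataFrame taking its place:

    import pandas as pd

    #a Series: values paired with an index (hypothetical scores)
    s = pd.Series([90, 85, 70], index=['film_a', 'film_b', 'film_c'])
    print(s)
    #a DataFrame: several columns sharing one index; each column is itself a Series
    df = pd.DataFrame({'critic_score': [90, 85, 70], 'user_score': [88, 80, 75]},
                      index=['film_a', 'film_b', 'film_c'])
    print(df)
    print(type(df['critic_score']))  #<class 'pandas.core.series.Series'>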

    import pandas as pd
    import os
    fandango_site = r"D:\PYTHON35\idle\database\pandas\fandango_score_comparison.csv"
    pwd = os.getcwd()  
    os.chdir(os.path.dirname(fandango_site))
    fandango = pd.read_csv(os.path.basename(fandango_site)) 
    print(type(fandango))
    series_film=fandango['FILM']  #the FILM column (movie titles)
    print(series_film[0:5]) #printed as index/film-name pairs
    series_rt=fandango['RottenTomatoes']  #the RottenTomatoes score column
    print(series_rt[0:5])
    print('_________________________________________')
    #A Series can use a different index: any column can be turned into the index
    from pandas import Series
    film_names=series_film.values #the values attribute returns the underlying array of film names
    print(type(film_names))
    print(film_names)
    rt_scores=series_rt.values
    print(rt_scores)
    series_custom=Series(rt_scores,index=film_names) #Series constructor: the first argument supplies the values, index= the index labels
    print(series_custom)
    print('_____________')
    print(series_custom[['Ant-Man (2015)','The Water Diviner (2015)']]) #look up entries by film title
    print('_____________')
    #even with a custom string index, integer positions can still be used to slice
    fiveten=series_custom[5:10]
    print(fiveten)
    print('_____________')
    original_index=series_custom.index.tolist() #the film-title index as a list
    sorted_index=sorted(original_index) #sorted alphabetically
    sorted_by_index=series_custom.reindex(sorted_index)  #reindex rearranges the Series according to the new index order
    print(sorted_by_index)
    print('_____________')
    sc2=series_custom.sort_index() #sort the Series by its index
    print(sc2[0:10])
    sc3=series_custom.sort_values() #sort the Series by its values
    print(sc3[0:10])
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    print(type(series_custom.values)) #check the data structure underlying the Series
    #<class 'numpy.ndarray'>
    import numpy as np
    print(np.add(series_custom,series_custom)) #numpy functions operate directly on Series objects
    print(np.sin(series_custom))
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Filtering with Boolean conditions
    criteria_one=series_custom>50
    criteria_two=series_custom<75
    both_criteria=series_custom[criteria_one & criteria_two]
    print(both_criteria)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Arithmetic on two Series
    rt_critics=Series(fandango['RottenTomatoes'].values,index=fandango['FILM'])
    re_users=Series(fandango['RottenTomatoes_User'].values,index=fandango['FILM'])
    rt_mean=(rt_critics+re_users)/2
    print(rt_mean)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

    #Setting an index column on a DataFrame
    import pandas as pd
    import numpy as np   #needed for np.std below
    import os
    fandango_site = r"D:\PYTHON35\idle\database\pandas\fandango_score_comparison.csv"
    pwd = os.getcwd()  
    os.chdir(os.path.dirname(fandango_site))
    fandango = pd.read_csv(os.path.basename(fandango_site))
    print(type(fandango))
    fandango_films=fandango.set_index('FILM',drop=False) #set_index makes the FILM column the index; drop=False keeps FILM as an ordinary column as well
    print(fandango_films.index)
    
    movies=['Ant-Man (2015)','The Water Diviner (2015)']
    print(fandango_films.loc[movies]) #loc looks up rows by their film-title index labels
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Combining numpy with the DataFrame structure
    types=fandango_films.dtypes
    print(types)
    float_columns=types[types.values=='float64'].index #names of the columns whose dtype is float64
    float_df=fandango_films[float_columns] #select those columns
    deviations=float_df.apply(lambda x:np.std(x)) #use numpy's std to compute the standard deviation of each float64 column
    print(deviations)
    
    rt_mt_user=float_df[['RT_user_norm','Metacritic_user_nom']]
    rowdeviation=rt_mt_user.apply(lambda x:np.std(x),axis=1) #axis=1 computes the standard deviation across each row instead of each column
    print(rowdeviation)
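    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #Supplementary sketch (not part of the original walk-through): np.std computes the
    #population standard deviation (ddof=0), while the DataFrame's own std method defaults
    #to the sample version (ddof=1), so the two differ slightly.
    print(float_df.std(ddof=0).head())  #matches the apply/np.std result above
    print(float_df.std().head())        #sample standard deviation (ddof=1)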
