Use the functions provided by pandas to read, preprocess, and analyze data.
The pandas library is built on top of numpy; after installing numpy from the command line, install pandas with: pip install pandas.
For background on the numpy library, see
http://blog.csdn.net/cymy001/article/details/78163468
The text file formats pandas typically reads are .txt, .csv and .json.
Data types defined in pandas:
(1) object: string values (2) int: integers (3) float: floating-point numbers (4) datetime: date/time values (5) bool: Boolean values
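To make that mapping concrete, here is a minimal sketch with a small made-up DataFrame (all column names and values are hypothetical) showing how these types appear in the dtypes attribute:
```code
import pandas as pd

# a tiny made-up DataFrame, only to show how the dtypes above appear in practice
df = pd.DataFrame({
    "name": ["apple", "banana"],                              # object (strings)
    "count": [3, 5],                                          # int64
    "price": [1.2, 0.5],                                      # float64
    "picked": pd.to_datetime(["2017-01-01", "2017-01-02"]),   # datetime64[ns]
    "fresh": [True, False],                                   # bool
})
print(df.dtypes)
```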
```code
# Python pandas introduction
# load the dataset
#import csv
#food_info=csv.reader('D:\PYTHON35\idle\database\pandas\food_info.csv')
#print(type(food_info))
##<class '_csv.reader'>
import pandas as pd
import os
food_info_site = r"D:\PYTHON35\idle\database\pandas\food_info.csv"
pwd = os.getcwd() #get the current working directory
os.chdir(os.path.dirname(food_info_site))
#os.chdir switches the working directory to the given directory; os.path.dirname returns the directory part of a path
food_info = pd.read_csv(os.path.basename(food_info_site)) #read_csv also accepts a full path; here we change into the data directory first and pass only the file name
#os.path.basename returns the last component of a path (here, the file name food_info.csv)
print(type(food_info))
#<class 'pandas.core.frame.DataFrame'>: pandas reads the data into a DataFrame
print(food_info.dtypes) #the dtypes attribute shows the data type of each column; all values within a column share one type
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
first_rows=food_info.head(3) #head returns the first few rows of the dataset; the default is 5
print(first_rows)
print(food_info.columns) #the columns attribute lists the column (feature) names of the dataset
print(food_info.shape) #the shape attribute gives the dimensions: number of rows and columns
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print(food_info.loc[0]) #loc selects rows by label; data rows start at 0, the header is not counted
#the pandas object dtype corresponds to Python's str type
print(food_info.loc[6]) #print the row labelled 6; the label must not exceed the largest row index in the dataset
print(food_info.loc[3:6]) #loc slices are inclusive: this prints rows 3, 4, 5 and 6
two_five_ten=[2,5,10]
print(food_info.loc[two_five_ten]) #select and print rows 2, 5 and 10
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
ndb_col=food_info['NDB_No'] #the argument is a column name
print(ndb_col) #select a single column of the dataset
print('_________________________________________')
columns=['Zinc_(mg)','Copper_(mg)'] #select several columns at once
zinc_copper=food_info[columns]
print(zinc_copper)
#print(food_info[['Zinc_(mg)','Copper_(mg)']])
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#find the columns whose names end with the unit (g)
print(food_info.columns)
print(food_info.head(2))
col_names=food_info.columns.tolist() #convert the column index to a plain Python list
print(col_names)
gram_columns=[] #collect the columns whose unit is g; start with an empty list
for c in col_names:
    if c.endswith('(g)'):
        gram_columns.append(c)
gram_df=food_info[gram_columns] #select the columns listed in gram_columns
print(gram_df.head(3))
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#convert the (mg) columns to (g) by arithmetic on the whole column
div_1000=food_info['Iron_(mg)']/1000 #element-wise transform of the Iron_(mg) column: divide every value by 1000
add_100=food_info['Iron_(mg)']+100
sub_100=food_info['Iron_(mg)']-100
mult_2=food_info['Iron_(mg)']*2
water_energy=food_info['Water_(g)']*food_info['Energ_Kcal'] #arithmetic between two columns is element-wise (position by position)
iron_grams=food_info['Iron_(mg)']/1000
food_info['Iron_(g)']=iron_grams #assign the new column to the dataset; Iron_(g) did not exist before, so it is appended
weighted_protein=food_info['Protein_(g)']*2
weighted_fat=-0.75*food_info['Lipid_Tot_(g)']
initial_rating=weighted_protein+weighted_fat #algebraic combination of two columns
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#normalize each column by its own maximum
max_calories=food_info['Energ_Kcal'].max()
normalized_calories=food_info['Energ_Kcal']/max_calories #divide by the maximum of the column
normalized_protein=food_info['Protein_(g)']/food_info['Protein_(g)'].max()
food_info['Normalized_Protein']=normalized_protein #add the normalized values to the dataset as a new column
normalized_fat=food_info['Lipid_Tot_(g)']/food_info['Lipid_Tot_(g)'].max()
food_info['Normalized_Fat']=normalized_fat
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#sorting the data read by pandas
print(food_info['Sodium_(mg)'])
print('_________________________________________')
food_info.sort_values('Sodium_(mg)',inplace=True)
#sort_values sorts ascending by default; inplace=True modifies food_info in place instead of returning a new DataFrame
print(food_info['Sodium_(mg)'])
print('_________________________________________')
food_info.sort_values('Sodium_(mg)',inplace=True,ascending=False) #the ascending parameter controls the sort order (False = descending)
print(food_info['Sodium_(mg)'])
```
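As noted in the comments above, the os.chdir/os.path.basename steps are only one way of locating the file; pd.read_csv also accepts a full path directly. A minimal sketch, reusing the same local path (adjust it to wherever the file lives on your machine):
```code
import pandas as pd

# read_csv takes a full path directly, so no chdir is needed
food_info = pd.read_csv(r"D:\PYTHON35\idle\database\pandas\food_info.csv")
print(food_info.shape)
```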
```code
import pandas as pd
import numpy as np
import os
titanic_survival_site = r"D:\PYTHON35\idle\database\pandas\titanic_train.csv"
pwd = os.getcwd()
os.chdir(os.path.dirname(titanic_survival_site))
titanic_survival = pd.read_csv(os.path.basename(titanic_survival_site))
print(type(titanic_survival))
print(titanic_survival.head())
#<class 'pandas.core.frame.DataFrame'>
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#handle missing values in the dataset
age=titanic_survival['Age']
print(age.loc[0:10]) #missing values show up as NaN
age_is_null=pd.isnull(age) #pd.isnull returns a Boolean Series marking the missing values
print(age_is_null) #True means the value is missing, False means it is present
age_null_true=age[age_is_null] #Boolean indexing: keep the entries of age where age_is_null is True, i.e. the missing ones
print(len(age_null_true))
print('_________________________________________')
mean_age=sum(titanic_survival['Age'])/len(titanic_survival['Age'])
print(mean_age) #computed naively, the result is nan because of the missing values
good_ages=titanic_survival['Age'][age_is_null==False] #keep only the non-missing values of the Age column
print(good_ages)
correct_mean_age=sum(good_ages)/len(good_ages)
print(correct_mean_age)
print('_________________________________________')
#pandas' built-in functions skip missing values automatically
correct_mean_age=titanic_survival['Age'].mean() #mean() ignores missing values
print(correct_mean_age)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#average ticket fare for each passenger class
passenger_classes=[1,2,3]
fares_by_class={}
for this_class in passenger_classes:
    pclass_rows=titanic_survival[titanic_survival['Pclass']==this_class] #rows of the passengers travelling in this class
    pclass_fares=pclass_rows['Fare'] #their ticket fares
    fare_for_class=pclass_fares.mean()
    fares_by_class[this_class]=fare_for_class
print(fares_by_class)
print('_________________________________________')
#relate two quantities; how they are related is set by aggfunc, whose default is the mean
passenger_survival=titanic_survival.pivot_table(index='Pclass',values='Survived',aggfunc=np.mean)
#pivot_table builds a pivot table relating the index column to the values column
print(passenger_survival)
#to relate one quantity to several others at once, pass a list to values
port_stats=titanic_survival.pivot_table(index='Embarked',values=['Fare','Survived'],aggfunc=np.sum)
print(port_stats)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#drop rows or columns that contain missing values
drop_na_columns=titanic_survival.dropna(axis=1) #with axis=1, dropna drops every column that contains a missing value
new_titanic_survival=titanic_survival.dropna(axis=0,subset=['Age','Sex']) #with axis=0 and subset, rows with a missing Age or Sex are dropped
print(new_titanic_survival)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
row_index_83_age=titanic_survival.loc[83,'Age'] #loc can also look up a single cell: the Age value of the row labelled 83
print(row_index_83_age)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#after sorting by a column, reset_index renumbers the row index
new_titanic_survival=titanic_survival.sort_values('Age',ascending=False)
print(new_titanic_survival[0:10])
titanic_reindex=new_titanic_survival.reset_index(drop=True) #rebuild the index after sorting by Age in descending order; drop=True discards the old index
print(titanic_reindex.iloc[0:10])
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#applying a user-defined function to a DataFrame
def hundredth_row(column):
    hundredth_item=column.iloc[99]
    return hundredth_item
hundredth_row=titanic_survival.apply(hundredth_row) #apply takes the custom function, runs it on each column of the DataFrame and collects the results
print(hundredth_row)
print('_________________________________________')
def not_null_count(column): #counts the missing values in a column
    column_null=pd.isnull(column)
    null=column[column_null]
    return len(null)
column_null_count=titanic_survival.apply(not_null_count)
print(column_null_count)
print('_________________________________________')
def which_class(row): #map the numeric Pclass of a row to a readable label
    pclass=row['Pclass']
    if pd.isnull(pclass):
        return 'Unknown'
    elif pclass==1:
        return 'First Class'
    elif pclass==2:
        return 'Second Class'
    elif pclass==3:
        return 'Third Class'
classes=titanic_survival.apply(which_class,axis=1) #axis=1 applies the function to each row instead of each column
print(classes)
```
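The axis argument used with apply above is easy to mix up. The following self-contained sketch uses made-up data (the columns a and b are hypothetical) to show the difference between applying a function per column and per row:
```code
import pandas as pd

# made-up data, only to illustrate the axis convention of apply
df = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})

# axis=0 (the default): the function receives one column (a Series) at a time
print(df.apply(lambda col: col.max()))                      # a: 3, b: 30

# axis=1: the function receives one row (also a Series) at a time
print(df.apply(lambda row: row["a"] + row["b"], axis=1))    # 11, 22, 33
```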
**The 3 main data structures in pandas**:
**Series**: a collection of values (the data elements); supports the float, int, bool,
datetime, timedelta, category and object types
**DataFrame**: a collection of Series
**Panel**: a collection of DataFrames
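A minimal sketch with made-up values showing how the first two structures relate: a few Series that share an index combine into a DataFrame, and each column of a DataFrame is itself a Series.
```code
import pandas as pd

# two Series sharing the same index...
s1 = pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"])
s2 = pd.Series([True, False, True], index=["a", "b", "c"])

# ...combine into a DataFrame; every column of the DataFrame is a Series
df = pd.DataFrame({"x": s1, "y": s2})
print(type(df["x"]))   # <class 'pandas.core.series.Series'>
print(df.dtypes)       # x: float64, y: bool
```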
```code
import pandas as pd
import os
fandango_site = r"D:\PYTHON35\idle\database\pandas\fandango_score_comparison.csv"
pwd = os.getcwd()
os.chdir(os.path.dirname(fandango_site))
fandango = pd.read_csv(os.path.basename(fandango_site))
print(type(fandango))
series_film=fandango['FILM'] #the FILM column: the film titles
print(series_film[0:5]) #listed as "index: film title"
series_rt=fandango['RottenTomatoes'] #the RottenTomatoes column: the critic scores
print(series_rt[0:5])
print('_________________________________________')
#a Series index can be replaced: any chosen column can be set as the index
from pandas import Series
film_names=series_film.values #the values attribute returns the film titles as a numpy array
print(type(film_names))
print(film_names)
rt_scores=series_rt.values
print(rt_scores)
series_custom=Series(rt_scores,index=film_names) #the Series constructor: index=film_names makes the titles the index, the scores are the values
print(series_custom)
print('_____________')
print(series_custom[['Ant-Man (2015)','The Water Diviner (2015)']]) #look up samples by film title
print('_____________')
#even with a custom index, samples can still be selected by their positional index
fiveten=series_custom[5:10]
print(fiveten)
print('_____________')
original_index=series_custom.index.tolist() #the film-title index as a plain list
sorted_index=sorted(original_index) #sort alphabetically (ascending)
sorted_by_index=series_custom.reindex(sorted_index) #reindex reorders the Series to the new index
print(sorted_by_index)
print('_____________')
sc2=series_custom.sort_index() #sort the Series by its index
print(sc2[0:10])
sc3=series_custom.sort_values() #sort the Series by its values
print(sc3[0:10])
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print(type(series_custom.values)) #inspect the underlying data structure of the Series
#<class 'numpy.ndarray'>
import numpy as np
print(np.add(series_custom,series_custom)) #Series objects work directly with numpy functions
print(np.sin(series_custom))
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#filtering
criteria_one=series_custom>50
criteria_two=series_custom<75
both_criteria=series_custom[criteria_one & criteria_two]
print(both_criteria)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#arithmetic between two Series (values are aligned by index)
rt_critics=Series(fandango['RottenTomatoes'].values,index=fandango['FILM'])
re_users=Series(fandango['RottenTomatoes_User'].values,index=fandango['FILM'])
rt_mean=(rt_critics+re_users)/2
print(rt_mean)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
```
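The addition rt_critics + re_users above is element-wise because the two Series carry the same index (the film titles). A tiny sketch with made-up values shows what index alignment does when the indexes only partly overlap:
```code
import pandas as pd

s1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
s2 = pd.Series([10, 20, 30], index=["b", "c", "d"])

# values are matched by index label, not by position;
# labels present in only one of the Series give NaN
print(s1 + s2)   # a: NaN, b: 12.0, c: 23.0, d: NaN
```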
```code
#setting an index column on a DataFrame
import pandas as pd
import numpy as np
import os
fandango_site = r"D:\PYTHON35\idle\database\pandas\fandango_score_comparison.csv"
pwd = os.getcwd()
os.chdir(os.path.dirname(fandango_site))
fandango = pd.read_csv(os.path.basename(fandango_site))
print(type(fandango))
fandango_films=fandango.set_index('FILM',drop=False) #set_index makes the FILM column the index; drop=False keeps FILM as a regular column as well, so it can still be queried as a value
print(fandango_films.index)
movies=['Ant-Man (2015)','The Water Diviner (2015)']
print(fandango_films.loc[movies]) #loc takes the film-title index values and returns the corresponding rows
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#using numpy together with a DataFrame
types=fandango_films.dtypes
print(types)
float_columns=types[types.values=='float64'].index #pick out the columns whose dtype is float64
float_df=fandango_films[float_columns] #take the values of those columns
deviations=float_df.apply(lambda x:np.std(x)) #numpy's std via apply: the standard deviation of each float64 column
print(deviations)
rt_mt_user=float_df[['RT_user_norm','Metacritic_user_nom']]
rowdeviation=rt_mt_user.apply(lambda x:np.std(x),axis=1) #axis=1: the standard deviation is computed across each row
print(rowdeviation)
```
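To make the set_index / apply pattern above easier to verify on a small scale, here is a self-contained sketch with made-up film data (the names film, score1 and score2 are hypothetical):
```code
import pandas as pd
import numpy as np

# made-up data, only to illustrate set_index and column-wise vs row-wise apply
df = pd.DataFrame({"film": ["A", "B"], "score1": [70.0, 80.0], "score2": [60.0, 90.0]})
films = df.set_index("film", drop=False)   # "film" becomes the index and stays as a column

print(films.loc["A"])                                                   # look up a row by its label
print(films[["score1", "score2"]].apply(lambda x: np.std(x)))           # std of each column
print(films[["score1", "score2"]].apply(lambda x: np.std(x), axis=1))   # std of each row
```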