python基础-Pandas数据处理

python基础-Pandas数据处理

pandas 是基于NumPy 的一种工具,该工具是为了解决数据分析任务而创建的。Pandas 纳入了大量库和一些标准的数据模型,提供了高效地操作大型数据集所需的工具。pandas提供了大量能使我们快速便捷地处理数据的函数和方法。你很快就会发现,它是使Python成为强大而高效的数据分析环境的重要因素之一。

代码

import numpy as np
#数据读取和显示
import pandas as pd

# Load the food nutrition table and inspect its basic structure.
food_info = pd.read_csv("food_info.csv")
print(type(food_info))
print(food_info.dtypes)
# help() writes the documentation itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
help(pd.read_csv)
print(food_info.head())
print(food_info.tail())
# Column labels (an Index object; .tolist() below turns it into a plain list).
print(food_info.columns)
# Dimensions of the table as (rows, columns).
print(food_info.shape)

# Select a single row by its index label.
print(food_info.loc[0])
# Select a range of rows by label slice (.loc includes both endpoints).
print(food_info.loc[3:6])
# Select a single column by name.
print(food_info["NDB_No"])
# Select several columns at once by passing a list of names.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
# Find every column whose name ends with "(g)".
col_names = food_info.columns.tolist()
print(col_names)
gram_columns = [c for c in col_names if c.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))

# Arithmetic on a column is applied element-wise.
print(food_info["Iron_(mg)"])
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)

# Multiplying two columns multiplies the values at matching positions.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
print(water_energy)
# Add a new column; the shape is printed before and after to show the change.
iron_grams = food_info["Iron_(mg)"] / 1000
print(food_info.shape)
food_info["Iron_(g)"] = iron_grams
print(food_info.shape)
# Aggregations such as max / min / mean are available as Series methods.
max_calories = food_info["Energ_Kcal"].max()
# Sorting: inplace=True reorders food_info itself instead of returning a new
# frame; ascending order is the default, ascending=False sorts descending.
food_info.sort_values("Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])
# Titanic data preprocessing example.
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.head())
# Count the rows whose age is missing.
age = titanic_survival["Age"]
age_is_null = pd.isnull(age)
age_null_true = age[age_is_null]
age_null_count = len(age_null_true)
print(age_null_count)
# Naive mean without handling missing values: this does not raise, but any
# NaN in the column propagates through sum(), so the result is NaN.
mean_age = sum(titanic_survival['Age'])/len(titanic_survival['Age'])
print(mean_age)

# Keep only the rows where the age is present, then average those.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages)/len(good_ages)
print(correct_mean_age)
# The built-in Series.mean() skips missing values by default.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
# pivot_table relates attributes: here, the mean of "Survived" per passenger
# class. The aggregation is named by string ("mean") because passing the
# NumPy callable triggers a FutureWarning in recent pandas releases.
passenger_survival = titanic_survival.pivot_table(index="Pclass",values="Survived",aggfunc="mean")
print(passenger_survival)
# When aggfunc is omitted, pivot_table computes the mean.
passenger_age = titanic_survival.pivot_table(index="Pclass",values="Age")
print(passenger_age)
# Sum of fares and of the "Survived" column per embarkation port.
port_stats = titanic_survival.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc="sum")
print(port_stats)
# Drop missing data: axis=1 drops columns containing any missing value;
# axis=0 with subset drops rows where "Age" or "Sex" is missing.
drop_na_columns = titanic_survival.dropna(axis = 1)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age","Sex"])
# Look up a single cell by (row label, column name).
row_index_83_age = titanic_survival.loc[83,"Age"]
# NOTE(review): the variable name says 1000 but the row label read is 766.
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
# Sort by age, oldest first. The sorted frame keeps the original row labels,
# so reset_index(drop=True) is used to renumber the rows from 0.
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print(new_titanic_survival[0:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed)
# Custom function: fetch the value stored in the 100th row.
def hundredth_rwo(column):
    """Return the entry of ``column`` found under index label 99.

    The lookup is label-based (``.loc``); with a default 0-based integer
    index, label 99 is the 100th item.
    """
    return column.loc[99]
# Apply the helper to every column. The result is bound to its own name so the
# ``hundredth_rwo`` function is not overwritten by its own output and stays
# callable afterwards.
hundredth_row = titanic_survival.apply(hundredth_rwo)
print('******',hundredth_row)
# Custom function: count the missing values in a column.
def not_null_count(column):
    """Return how many entries of ``column`` are null.

    Despite the function's name, the value returned is the number of
    *missing* entries: the column is filtered with its null mask and the
    length of what remains is reported.
    """
    missing_entries = column[pd.isnull(column)]
    return len(missing_entries)
# Run not_null_count over the frame via apply (no axis given, so pandas'
# default applies). Note that not_null_count returns the number of *null*
# entries it finds, so this holds missing-value counts.
column_null_count = titanic_survival.apply(not_null_count)
# Custom function: discretize the continuous age value into categories.
def generate_age_label(row):
    """Map the ``"Age"`` entry of ``row`` to a coarse age category.

    Returns ``"unknown"`` when the age is missing, ``"minor"`` when it is
    below 18, and ``"adult"`` otherwise.
    """
    passenger_age = row["Age"]
    # Missing ages are handled first; comparing NaN with 18 would be False
    # and wrongly fall through to "adult".
    if pd.isnull(passenger_age):
        return "unknown"
    return "minor" if passenger_age < 18 else "adult"
# axis=1 hands each row to generate_age_label, producing one label
# ("unknown" / "minor" / "adult") per row of titanic_survival.
age_labels = titanic_survival.apply(generate_age_label,axis=1)
print(age_labels)

# Movie ratings example.
fandango = pd.read_csv('fandango_score_comparison.csv')
# Selecting one column of a DataFrame yields a Series.
series_film = fandango['FILM']
print(type(series_film))
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])

from pandas import Series
# .values exposes the underlying array of each Series.
film_names = series_film.values
print(type(film_names))
rt_scores = series_rt.values
# Build a Series of the RottenTomatoes values indexed by the FILM values.
series_custom = Series(rt_scores,index=film_names)
# Look up two films by label. The result is printed explicitly: as a bare
# expression it would be computed and discarded when run as a script.
print(series_custom[['Minions (2015)','Leviathan (2014)']])
# Integer slicing still works positionally on the string-indexed Series.
fiveten = series_custom[5:10]
print(fiveten)

总结

Series:一维数组,与Numpy中的一维array类似。二者与Python基本的数据结构List也很相近。Series如今能保存不同种数据类型,字符串、boolean值、数字等都能保存在Series中。

Time-Series:以时间为索引的Series。

DataFrame:二维的表格型数据结构。很多功能与R中的data.frame类似。可以将DataFrame理解为Series的容器。

Panel:三维的数组,可以理解为DataFrame的容器。(注:Panel 自 pandas 0.20 起被弃用,并已在 0.25 版本中移除;如需处理三维数据,建议改用带 MultiIndex 的 DataFrame 或 xarray。)

Panel4D:是像Panel一样的4维数据容器。(注:该类型已从新版 pandas 中移除。)

PanelND:拥有factory集合,可以创建像Panel4D一样N维命名容器的模块。(注:该模块已从新版 pandas 中移除。)

上一篇:R学习 / 生存分析


下一篇:Orphans Of The Cold War