2. Machine Learning 的好伙伴:pandas
文件链接和提取码
链接:https://pan.baidu.com/s/1Nwa9N5ah9Otkyrxv9-hFSQ
提取码:go0a
import pandas """ 读取得到dataFrame结构 """ # 读取数据 citi_info = pandas.read_csv(‘citi.csv‘) print(type(citi_info)) # 输出文件类型 print(citi_info.dtypes) # 输出文件中的元素名称和类型,注意:object为字符型 print(help(pandas.read_csv)) # read_csv的使用文本 <class ‘pandas.core.frame.DataFrame‘> Date object Open float64 High float64 Low float64 Close float64 Volume int64 Adj Close float64 dtype: object Help on function read_csv in module pandas.io.parsers: # 显示某几行几列 print(citi_info.head(10)) # 将数据显示出来,但只显示前指定条数据 print(citi_info.tail(10)) # 将数据显示出来,但只显示尾部指定条数据 Date Open High Low Close Volume Adj Close 0 2000-01-03 55.623610 55.623610 51.998701 52.998676 1681900 276.498726 1 2000-01-04 51.998701 52.186196 49.748757 49.748757 2403200 259.543615 2 2000-01-05 50.873729 51.998701 49.498763 51.748707 1742500 269.977529 3 2000-01-06 51.311218 54.686134 51.248720 54.248645 1863200 283.019922 4 2000-01-07 53.998651 54.936128 52.811181 53.998651 1394500 281.715683 5 2000-01-10 54.936128 54.936128 53.561162 53.811156 850300 280.737503 6 2000-01-11 53.373667 54.373642 52.873679 53.123673 996600 277.150845 7 2000-01-12 53.561162 54.998626 53.436165 54.998626 1145700 286.932640 8 2000-01-13 54.998626 56.061099 54.686134 55.623610 1237900 290.193238 9 2000-01-14 56.498589 58.561037 56.436090 57.998551 2225400 302.583511 Date Open High Low Close Volume 4160 2016-07-18 44.279999 44.900002 44.240002 44.570000 18683900 4161 2016-07-19 44.189999 44.689999 44.060001 44.349998 15297300 4162 2016-07-20 44.529999 44.700001 44.200001 44.470001 16547100 4163 2016-07-21 44.500000 44.700001 44.110001 44.130001 14924900 4164 2016-07-22 44.099998 44.360001 43.830002 44.299999 12764200 4165 2016-07-25 44.310001 44.360001 43.910000 44.040001 14391400 4166 2016-07-26 43.930000 44.240002 43.900002 44.150002 16152500 4167 2016-07-27 44.200001 44.709999 44.130001 44.290001 17814200 4168 2016-07-28 44.000000 44.169998 43.680000 44.080002 13239600 4169 2016-07-29 43.869999 44.160000 43.759998 43.810001 13773700 Adj Close 4160 44.408987 4161 
44.189781 4162 44.309350 4163 43.970578 4164 44.139962 4165 43.880903 4166 43.990506 4167 44.130000 4168 44.080002 4169 43.810001 # 显示元素名称以及元素类型 print(citi_info.columns) Index([‘Date‘, ‘Open‘, ‘High‘, ‘Low‘, ‘Close‘, ‘Volume‘, ‘Adj Close‘], dtype=‘object‘)
# 显示数据的大小 print(citi_info.shape) (4170, 7)
# 对数据进行切片操作,显示某几列某几行 print(citi_info.loc[0]) print(citi_info.loc[1:3]) print(citi_info["Date"]) columns = [‘Date‘,‘Open‘] print(citi_info[columns]) Date 2000-01-03 Open 55.6236 High 55.6236 Low 51.9987 Close 52.9987 Volume 1681900 Adj Close 276.499 Name: 0, dtype: object Date Open High Low Close Volume Adj Close 1 2000-01-04 51.998701 52.186196 49.748757 49.748757 2403200 259.543615 2 2000-01-05 50.873729 51.998701 49.498763 51.748707 1742500 269.977529 3 2000-01-06 51.311218 54.686134 51.248720 54.248645 1863200 283.019922 0 2000-01-03 1 2000-01-04 2 2000-01-05 3 2000-01-06 4 2000-01-07 ... 4165 2016-07-25 4166 2016-07-26 4167 2016-07-27 4168 2016-07-28 4169 2016-07-29 Name: Date, Length: 4170, dtype: object Date Open 0 2000-01-03 55.623610 1 2000-01-04 51.998701 2 2000-01-05 50.873729 3 2000-01-06 51.311218 4 2000-01-07 53.998651 ... ... ... 4165 2016-07-25 44.310001 4166 2016-07-26 43.930000 4167 2016-07-27 44.200001 4168 2016-07-28 44.000000 4169 2016-07-29 43.869999 [4170 rows x 2 columns]
# 找出指定的元素的集合 print(‘找出指定的元素的集合‘) columns = citi_info.columns.tolist() print(columns) gram_list = [] for c in columns: if c.endswith(‘e‘): gram_list.append(c) print(gram_list) print(citi_info[gram_list]) 找出指定的元素的集合 [‘Date‘, ‘Open‘, ‘High‘, ‘Low‘, ‘Close‘, ‘Volume‘, ‘Adj Close‘] [‘Date‘, ‘Close‘, ‘Volume‘, ‘Adj Close‘] Date Close Volume Adj Close 0 2000-01-03 52.998676 1681900 276.498726 1 2000-01-04 49.748757 2403200 259.543615 2 2000-01-05 51.748707 1742500 269.977529 3 2000-01-06 54.248645 1863200 283.019922 4 2000-01-07 53.998651 1394500 281.715683 ... ... ... ... ... 4165 2016-07-25 44.040001 14391400 43.880903 4166 2016-07-26 44.150002 16152500 43.990506 4167 2016-07-27 44.290001 17814200 44.130000 4168 2016-07-28 44.080002 13239600 44.080002 4169 2016-07-29 43.810001 13773700 43.810001 [4170 rows x 4 columns]
# 对每一列进行数学运算 divided = citi_info[‘Close‘]*100 print(divided) print(citi_info.shape) subtraction = citi_info[‘High‘] - citi_info[‘Low‘] citi_info[‘output‘] = subtraction print(citi_info.shape) print(citi_info[‘output‘]) 0 5299.8676 1 4974.8757 2 5174.8707 3 5424.8645 4 5399.8651 ... 4165 4404.0001 4166 4415.0002 4167 4429.0001 4168 4408.0002 4169 4381.0001 Name: Close, Length: 4170, dtype: float64 (4170, 7) (4170, 8) 0 3.624909 1 2.437439 2 2.499938 3 3.437414 4 2.124947 ... 4165 0.450001 4166 0.340000 4167 0.579998 4168 0.489998 4169 0.400002 Name: output, Length: 4170, dtype: float64
# 对一列的极值,归一化操作 max_date = citi_info[‘High‘].max() print(max_date) normalize_date = citi_info[‘High‘]/max_date citi_info[‘High_normalize_date‘] = normalize_date print(citi_info[‘High_normalize_date‘]) 78.310544 0 0.710295 1 0.666401 2 0.664006 3 0.698324 4 0.701516 ... 4165 0.566463 4166 0.564930 4167 0.570932 4168 0.564036 4169 0.563909 Name: High_normalize_date, Length: 4170, dtype: float64
# 对数据进行排序
citi_info.sort_values('Close',inplace=True) # 升序排序;inplace=True 表示直接修改原数据,不生成新的对象
print(citi_info['Close'])
citi_info.sort_values('Close',inplace=True,ascending=False) # 降序
print(citi_info['Close'])
#
2305     1.020000
2306     1.030000
2307     1.050000
2304     1.130000
2302     1.200000
          ...
162     75.373117
164     75.935603
163     76.748083
160     76.873080
161     77.310569
Name: Close, Length: 4170, dtype: float64
161     77.310569
160     76.873080
163     76.748083
164     75.935603
162     75.373117
          ...
2302     1.200000
2304     1.130000
2307     1.050000
2306     1.030000
2305     1.020000
Name: Close, Length: 4170, dtype: float64
pandas调用函数扩展
import numpy as np
import pandas as pd
titanic_survival = pd.read_csv('titanic_train.csv')
print(titanic_survival.head())
age = titanic_survival["Age"]
print(age)
age_is_null = pd.isnull(age) # 判断每个年龄是否缺失,得到布尔序列(缺失处为True)
print(age_is_null)
age_null_true = age[age_is_null] # 用布尔序列筛选出年龄为空的数据,随后打印
print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)
   PassengerId  Survived  Pclass
0            1         0       3
1            2         1       1
2            3         1       3
3            4         1       1
4            5         0       3
                                                Name     Sex   Age  SibSp
0                            Braund, Mr. Owen Harris    male  22.0      1
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
2                             Heikkinen, Miss. Laina  female  26.0      0
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1
4                           Allen, Mr. William Henry    male  35.0      0
   Parch            Ticket     Fare Cabin Embarked
0      0         A/5 21171   7.2500   NaN        S
1      0          PC 17599  71.2833   C85        C
2      0  STON/O2. 3101282   7.9250   NaN        S
3      0            113803  53.1000  C123        S
4      0            373450   8.0500   NaN        S
0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ...
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
0      False
1      False
2      False
3      False
4      False
       ...
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool
5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64
177
# 得到平均年龄的错误做法:Age列中含有缺失值(NaN),NaN参与求和后结果仍然是NaN
mean_age = sum(titanic_survival['Age']/len(titanic_survival["Age"]))
print(mean_age)
nan
# 去掉空值得到平均年龄 # 对于缺失值可以用均值,中值填充 good_age = titanic_survival["Age"][age_is_null==False] correct_mean_age = sum(good_age)/len(good_age) correct_mean_age2 = titanic_survival["Age"].mean() print(correct_mean_age,correct_mean_age2) 29.69911764705882 29.69911764705882
# 对每个等级的船票价格统计求平均值
passenger_class = [1,2,3]
fares_by_class = {}
for this_class in passenger_class:
    Pclass_passengers = titanic_survival[titanic_survival['Pclass'] == this_class]
    Pclass_fares = Pclass_passengers["Fare"]
    mean_fare = Pclass_fares.mean()
    fares_by_class[this_class] = mean_fare
print(fares_by_class)
{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
# 快速统计各变量之间的关系
# 计算每个船舱等级的平均获救率(Survived取值为0/1,其均值即获救比例)
passenger_survival = titanic_survival.pivot_table(index = 'Pclass',values = 'Survived', aggfunc = np.mean) # 每个Pclass的平均获救率
print(passenger_survival)
# 计算每个船舱等级的乘客平均年龄
passenger_age = titanic_survival.pivot_table(index='Pclass',values="Age",aggfunc=np.mean) # 默认aggfunc=np.mean
print(passenger_age)
# 计算每个登船港口的票价总和与获救人数
port_stats = titanic_survival.pivot_table(index='Embarked', values=['Fare','Survived'],aggfunc=np.sum)
print(port_stats)
        Survived
Pclass
1       0.629630
2       0.472826
3       0.242363
              Age
Pclass
1       38.233441
2       29.877630
3       25.140620
                Fare  Survived
Embarked
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
# 对缺失值的处理
drop_na_columns = titanic_survival.dropna(axis=1) # axis=1:去除含有缺失值的列
new_titanic_survival = titanic_survival.dropna(axis=0,subset=['Age',"Sex"]) # axis=0:只检查指定的列,去除这些列中有缺失值的行
print(new_titanic_survival)
     PassengerId  Survived  Pclass
0              1         0       3
1              2         1       1
2              3         1       3
3              4         1       1
4              5         0       3
..           ...       ...     ...
885          886         0       3
886          887         0       2
887          888         1       1
889          890         1       1
890          891         0       3
                                                  Name     Sex   Age  SibSp
0                              Braund, Mr. Owen Harris    male  22.0      1
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
2                               Heikkinen, Miss. Laina  female  26.0      0
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1
4                             Allen, Mr. William Henry    male  35.0      0
..                                                 ...     ...   ...    ...
885               Rice, Mrs. William (Margaret Norton)  female  39.0      0
886                              Montvila, Rev. Juozas    male  27.0      0
887                       Graham, Miss. Margaret Edith  female  19.0      0
889                              Behr, Mr. Karl Howell    male  26.0      0
890                                Dooley, Mr. Patrick    male  32.0      0
     Parch            Ticket     Fare Cabin Embarked
0        0         A/5 21171   7.2500   NaN        S
1        0          PC 17599  71.2833   C85        C
2        0  STON/O2. 3101282   7.9250   NaN        S
3        0            113803  53.1000  C123        S
4        0            373450   8.0500   NaN        S
..     ...               ...      ...   ...      ...
885      5            382652  29.1250   NaN        Q
886      0            211536  13.0000   NaN        S
887      0            112053  30.0000   B42        S
889      0            111369  30.0000  C148        C
890      0            370376   7.7500   NaN        Q
[714 rows x 12 columns]
# 定位到一个具体值
passenger_83_age = titanic_survival.loc[83,'Age']
print('83号乘客的年龄:',passenger_83_age)
83号乘客的年龄: 28.0
# 对元素重新排列,并重新规划索引值 new_titanic_survival = titanic_survival.sort_values(‘Age‘,ascending=False) titanic_reinex = new_titanic_survival.reset_index(drop=True) # 将原来的索引值去除 print(titanic_reinex) PassengerId Survived Pclass Name 0 631 1 1 Barkworth, Mr. Algernon Henry Wilson 1 852 0 3 Svensson, Mr. Johan 2 494 0 1 Artagaveytia, Mr. Ramon 3 97 0 1 Goldschmidt, Mr. George B 4 117 0 3 Connors, Mr. Patrick .. ... ... ... ... 886 860 0 3 Razi, Mr. Raihed 887 864 0 3 Sage, Miss. Dorothy Edith "Dolly" 888 869 0 3 van Melkebeke, Mr. Philemon 889 879 0 3 Laleff, Mr. Kristo 890 889 0 3 Johnston, Miss. Catherine Helen "Carrie" Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 male 80.0 0 0 27042 30.0000 A23 S 1 male 74.0 0 0 347060 7.7750 NaN S 2 male 71.0 0 0 PC 17609 49.5042 NaN C 3 male 71.0 0 0 PC 17754 34.6542 A5 C 4 male 70.5 0 0 370369 7.7500 NaN Q .. ... ... ... ... ... ... ... ... 886 male NaN 0 0 2629 7.2292 NaN C 887 female NaN 8 2 CA. 2343 69.5500 NaN S 888 male NaN 0 0 345777 9.5000 NaN S 889 male NaN 0 0 349217 7.8958 NaN S 890 female NaN 1 2 W./C. 6607 23.4500 NaN S [891 rows x 12 columns]
# pandas自定义函数 def hundredth_row(column): hundredth_item = column.loc[99] return hundredth_item hundredth_row = titanic_survival.apply(hundredth_row) print(hundredth_row) PassengerId 100 Survived 0 Pclass 2 Name Kantor, Mr. Sinai Sex male Age 34 SibSp 1 Parch 0 Ticket 244367 Fare 26 Cabin NaN Embarked S dtype: object
pandas之series
from pandas import Series
import pandas as pd
"""
每一列为Series结构
"""
fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango["FILM"]
print(type(series_film))
<class 'pandas.core.series.Series'>
# 对指定元素进行切片 print(series_film[0:5]) series_rt = fandango["RottenTomatoes"] film_names = series_film.values print(type(film_names)) # pandas封装了numpy所以每个元素的数值集合是ndarray结构 rt_scores = series_rt.values 0 Avengers: Age of Ultron (2015) 1 Cinderella (2015) 2 Ant-Man (2015) 3 Do You Believe? (2015) 4 Hot Tub Time Machine 2 (2015) Name: FILM, dtype: object <class ‘numpy.ndarray‘>
# 以film_names为索引保存分数,输出指定电影的分数
series_custom = Series(rt_scores,index=film_names)
print(series_custom[['Cinderella (2015)','Ant-Man (2015)']])
Cinderella (2015)    85
Ant-Man (2015)       80
dtype: int64
# 输出第5到10的数据 fiveten = series_custom[5:10] print(fiveten) The Water Diviner (2015) 63 Irrational Man (2015) 42 Top Five (2014) 86 Shaun the Sheep Movie (2015) 99 Love & Mercy (2015) 89 dtype: int64
# 设置索引值 set_film_index = fandango.set_index("FILM", drop=False)