3-1:
import pandas
# Load the nutrition table used throughout this section.
food_info = pandas.read_csv("food_info.csv")

# Column dtypes. pandas reports string columns as `object`; the other dtypes
# you will commonly meet are int, float, datetime and bool.
# print(type(food_info))
print(food_info.dtypes)
print("------------------------")

# Other quick looks at the frame:
#   print(food_info.head())     # first rows
#   print(food_info.head(3))    # first three rows
#   print(food_info.columns)    # column labels
print(food_info.shape)  # (number of rows, number of columns)
print("------------------------")

# Row selection with .loc goes by index label, and the labels here start at 0:
#   food_info.loc[0]       # Series for the row labelled 0
#   food_info.loc[6]       # Series for the seventh row
#   food_info.loc[3:6]     # DataFrame with the rows labelled 3, 4, 5 and 6
#   food_info.loc[8620]    # KeyError: that label is not in the index
# Passing a list of labels returns exactly those rows as a DataFrame; the list
# may also be stored in a variable first (two_five_ten = [2, 5, 10]).
print(food_info.loc[[2, 5, 10]])
print("------------------------")

# A single column name in brackets returns that column as a Series. The name
# can equally come from a string variable (col_name = "NDB_No").
ndb_col = food_info["NDB_No"]
print(ndb_col)
print("------------------------")

# A list of column names returns a DataFrame holding just those columns. The
# list can also be written inline: food_info[["Zinc_(mg)", "Copper_(mg)"]].
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
print(zinc_copper)
print("------------------------")
# Build a DataFrame holding only the columns measured in grams, i.e. those
# whose name ends with "(g)".
# print(food_info.columns)
# print(food_info.head(2))
col_names = food_info.columns.tolist()  # column labels as a plain list
print(col_names)
gram_columns = [name for name in col_names if name.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
结果:
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
------------------------
(8618, 36)
------------------------
NDB_No Shrt_Desc ... FA_Poly_(g) Cholestrl_(mg)
2 1003 BUTTER OIL ANHYDROUS ... 3.694 256.0
5 1006 CHEESE BRIE ... 0.826 100.0
10 1011 CHEESE COLBY ... 0.953 95.0
[3 rows x 36 columns]
------------------------
0 1001
1 1002
2 1003
3 1004
4 1005
5 1006
6 1007
7 1008
8 1009
9 1010
10 1011
11 1012
12 1013
13 1014
14 1015
15 1016
16 1017
17 1018
18 1019
19 1020
20 1021
21 1022
22 1023
23 1024
24 1025
25 1026
26 1027
27 1028
28 1029
29 1030
...
8588 43544
8589 43546
8590 43550
8591 43566
8592 43570
8593 43572
8594 43585
8595 43589
8596 43595
8597 43597
8598 43598
8599 44005
8600 44018
8601 44048
8602 44055
8603 44061
8604 44074
8605 44110
8606 44158
8607 44203
8608 44258
8609 44259
8610 44260
8611 48052
8612 80200
8613 83110
8614 90240
8615 90480
8616 90560
8617 93600
Name: NDB_No, Length: 8618, dtype: int64
------------------------
Zinc_(mg) Copper_(mg)
0 0.09 0.000
1 0.05 0.016
2 0.01 0.001
3 2.66 0.040
4 2.60 0.024
5 2.38 0.019
6 2.38 0.021
7 2.94 0.024
8 3.43 0.056
9 2.79 0.042
10 3.07 0.042
11 0.40 0.029
12 0.33 0.040
13 0.47 0.030
14 0.51 0.033
15 0.38 0.028
16 0.51 0.019
17 3.75 0.036
18 2.88 0.032
19 3.50 0.025
20 1.14 0.080
21 3.90 0.036
22 3.90 0.032
23 2.10 0.021
24 3.00 0.032
25 2.92 0.011
26 2.46 0.022
27 2.76 0.025
28 3.61 0.034
29 2.81 0.031
... ... ...
8588 3.30 0.377
8589 0.05 0.040
8590 0.05 0.030
8591 1.15 0.116
8592 5.03 0.200
8593 3.83 0.545
8594 0.08 0.035
8595 3.90 0.027
8596 4.10 0.100
8597 3.13 0.027
8598 0.13 0.000
8599 0.02 0.000
8600 0.09 0.037
8601 0.21 0.026
8602 2.77 0.571
8603 0.41 0.838
8604 0.05 0.028
8605 0.03 0.023
8606 0.10 0.112
8607 0.02 0.020
8608 1.49 0.854
8609 0.19 0.040
8610 0.10 0.038
8611 0.85 0.182
8612 1.00 0.250
8613 1.10 0.100
8614 1.55 0.033
8615 0.19 0.020
8616 1.00 0.400
8617 1.00 0.250
[8618 rows x 2 columns]
------------------------
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
Water_(g) Protein_(g) Lipid_Tot_(g) ... FA_Sat_(g) FA_Mono_(g) FA_Poly_(g)
0 15.87 0.85 81.11 ... 51.368 21.021 3.043
1 15.87 0.85 81.11 ... 50.489 23.426 3.012
2 0.24 0.28 99.48 ... 61.924 28.732 3.694
[3 rows x 10 columns]
3-2:
import pandas
# Reload the nutrition table so this section starts from the original columns.
food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()
print(col_names)
print(food_info.head(3))
print("------------------------")

# Arithmetic between a column and a scalar is applied to every value and
# returns a new Series; the column stored in the DataFrame is not modified.
print(food_info["Iron_(mg)"])
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)
# The other operators behave the same way:
#   add_100 = food_info["Iron_(mg)"] + 100    # adds 100 to each value
#   sub_100 = food_info["Iron_(mg)"] - 100    # subtracts 100 from each value
#   mult_2 = food_info["Iron_(mg)"] * 2       # doubles each value
print("------------------------")
# An arithmetic operator between two columns pairs the values up row by row:
# the first value of each column, then the second of each, and so on.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
iron_grams = food_info["Iron_(mg)"] / 1000
# Assigning a Series to a new label appends it as a column, so the second
# shape printed has one more column than the first.
print(food_info.shape)
food_info["Iron_(g)"] = iron_grams
print(food_info.shape)
print("------------------------")
# Rating formula: Score = 2 x Protein_(g) - 0.75 x Lipid_Tot_(g).
weighted_protein = 2 * food_info["Protein_(g)"]
weighted_fat = food_info["Lipid_Tot_(g)"] * -0.75
initial_rating = weighted_protein + weighted_fat
print("------------------------")

# Columns can sit on very different scales (the original notes cite "Vit_A_IU"
# reaching 100000 while "Fiber_TD_(g)" stays below 80), so a large-valued
# column can dominate a combined calculation. Dividing a column by its own
# maximum rescales it so that its largest value becomes 1.
max_calories = food_info["Energ_Kcal"].max()
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"].div(food_info["Protein_(g)"].max())
normalized_fat = food_info["Lipid_Tot_(g)"].div(food_info["Lipid_Tot_(g)"].max())
# Store the rescaled columns alongside the originals.
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
print("------------------------")
# sort_values orders the rows by the given column, ascending unless told
# otherwise. Without inplace it returns a new, sorted DataFrame; with
# inplace=True, as here, food_info itself is reordered.
# print(food_info["Sodium_(mg)"])
food_info.sort_values(by="Sodium_(mg)", ascending=True, inplace=True)
print(food_info["Sodium_(mg)"])
# Same column, largest value first. In the recorded output the missing
# values (NaN) are listed last for both sort directions.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
print(food_info["Sodium_(mg)"])
结果:
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
NDB_No Shrt_Desc ... FA_Poly_(g) Cholestrl_(mg)
0 1001 BUTTER WITH SALT ... 3.043 215.0
1 1002 BUTTER WHIPPED WITH SALT ... 3.012 219.0
2 1003 BUTTER OIL ANHYDROUS ... 3.694 256.0
[3 rows x 36 columns]
------------------------
0 0.02
1 0.16
2 0.00
3 0.31
4 0.43
5 0.50
6 0.33
7 0.64
8 0.16
9 0.21
10 0.76
11 0.07
12 0.16
13 0.15
14 0.13
15 0.14
16 0.38
17 0.44
18 0.65
19 0.23
20 0.52
21 0.24
22 0.17
23 0.13
24 0.72
25 0.44
26 0.20
27 0.22
28 0.23
29 0.41
...
8588 9.00
8589 0.30
8590 0.10
8591 1.63
8592 34.82
8593 2.28
8594 0.17
8595 0.17
8596 4.86
8597 0.25
8598 0.23
8599 0.13
8600 0.11
8601 0.68
8602 7.83
8603 3.11
8604 0.30
8605 0.18
8606 0.80
8607 0.04
8608 3.87
8609 0.05
8610 0.38
8611 5.20
8612 1.50
8613 1.40
8614 0.58
8615 3.60
8616 3.50
8617 1.40
Name: Iron_(mg), Length: 8618, dtype: float64
0 0.00002
1 0.00016
2 0.00000
3 0.00031
4 0.00043
5 0.00050
6 0.00033
7 0.00064
8 0.00016
9 0.00021
10 0.00076
11 0.00007
12 0.00016
13 0.00015
14 0.00013
15 0.00014
16 0.00038
17 0.00044
18 0.00065
19 0.00023
20 0.00052
21 0.00024
22 0.00017
23 0.00013
24 0.00072
25 0.00044
26 0.00020
27 0.00022
28 0.00023
29 0.00041
...
8588 0.00900
8589 0.00030
8590 0.00010
8591 0.00163
8592 0.03482
8593 0.00228
8594 0.00017
8595 0.00017
8596 0.00486
8597 0.00025
8598 0.00023
8599 0.00013
8600 0.00011
8601 0.00068
8602 0.00783
8603 0.00311
8604 0.00030
8605 0.00018
8606 0.00080
8607 0.00004
8608 0.00387
8609 0.00005
8610 0.00038
8611 0.00520
8612 0.00150
8613 0.00140
8614 0.00058
8615 0.00360
8616 0.00350
8617 0.00140
Name: Iron_(mg), Length: 8618, dtype: float64
------------------------
(8618, 36)
(8618, 37)
------------------------
------------------------
------------------------
760 0.0
758 0.0
405 0.0
761 0.0
2269 0.0
763 0.0
764 0.0
770 0.0
774 0.0
396 0.0
395 0.0
6827 0.0
394 0.0
393 0.0
391 0.0
390 0.0
787 0.0
788 0.0
2270 0.0
2231 0.0
407 0.0
748 0.0
409 0.0
747 0.0
702 0.0
703 0.0
704 0.0
705 0.0
706 0.0
707 0.0
...
8153 NaN
8155 NaN
8156 NaN
8157 NaN
8158 NaN
8159 NaN
8160 NaN
8161 NaN
8163 NaN
8164 NaN
8165 NaN
8167 NaN
8169 NaN
8170 NaN
8172 NaN
8173 NaN
8174 NaN
8175 NaN
8176 NaN
8177 NaN
8178 NaN
8179 NaN
8180 NaN
8181 NaN
8183 NaN
8184 NaN
8185 NaN
8195 NaN
8251 NaN
8267 NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
276 38758.0
5814 27360.0
6192 26050.0
1242 26000.0
1245 24000.0
1243 24000.0
1244 23875.0
292 17000.0
1254 11588.0
5811 10600.0
8575 9690.0
291 8068.0
1249 8031.0
5812 7893.0
1292 7851.0
293 7203.0
4472 7027.0
4836 6820.0
1261 6580.0
3747 6008.0
1266 5730.0
4835 5586.0
4834 5493.0
1263 5356.0
1553 5203.0
1552 5053.0
1251 4957.0
1257 4843.0
294 4616.0
8613 4450.0
...
8153 NaN
8155 NaN
8156 NaN
8157 NaN
8158 NaN
8159 NaN
8160 NaN
8161 NaN
8163 NaN
8164 NaN
8165 NaN
8167 NaN
8169 NaN
8170 NaN
8172 NaN
8173 NaN
8174 NaN
8175 NaN
8176 NaN
8177 NaN
8178 NaN
8179 NaN
8180 NaN
8181 NaN
8183 NaN
8184 NaN
8185 NaN
8195 NaN
8251 NaN
8267 NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
3-3:
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.head())
print("------------------------")

# pandas marks a missing value as NaN ("not a number"). isnull() returns a
# boolean Series that is True wherever the value is missing.
age = titanic_survival["Age"]
# print(age.loc[0:10])
age_is_null = age.isnull()
# print(age_is_null)
age_null_true = age[age_is_null]  # keep only the missing entries
print(age_null_true)
age_null_count = len(age_null_true)  # how many ages are missing
print(age_null_count)
print("------------------------")

# Naive mean, kept on purpose: arithmetic involving NaN yields NaN, so this
# prints nan.
mean_age = sum(age) / len(age)
print(mean_age)
print("------------------------")

# Filter the missing values out first, then average what is left.
good_ages = age[~age_is_null]
# print(good_ages)
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
print("------------------------")

# Missing data is common enough that Series.mean() skips it automatically.
correct_mean_age = age.mean()
print(correct_mean_age)
print("------------------------")
# Mean fare for each passenger class, computed by filtering the rows by hand.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # The boolean mask keeps only the rows belonging to the current class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
# Printed once, after the loop, as a {class: mean fare} dict.
print(fares_by_class)
print("-------------------------")
# pivot_table groups the rows and aggregates each group in a single call:
#   index   - column whose values define the groups
#   values  - column(s) the calculation is applied to
#   aggfunc - the calculation to perform; the mean is used when it is omitted
# The aggregation is named by string ("mean", "sum") rather than by passing
# np.mean / np.sum, which recent pandas releases warn about.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
print("--------------------------")
# Mean age per passenger class (default aggregation).
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
print("--------------------------")
# Total fare and total number of survivors per port of embarkation.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
#specifying axis=1 or axis='columns' will drop any columns that have null values
drop_na_columns = titanic_survival.dropna(axis=1)#把含有缺失值的行扔掉
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])#扔掉age和sex两列中的缺失值
#print new_titanic_survival
print("--------------------------")
row_index_83_age = titanic_survival.loc[83,"Age"]#第83个样本的age
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
print("--------------------------")
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print(new_titanic_survival[0:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)#index重新排序
print(titanic_reindexed.iloc[0:10])
print("--------------------------")


def hundredth_row(column):
    """Return the hundredth item of a Series.

    The item is taken by position (position 99), not by index label.

    Raises:
        IndexError: if the Series holds fewer than 100 items.
    """
    hundredth_item = column.iloc[99]
    return hundredth_item
# apply() calls the function once per column (axis=0 is the default) and
# gathers the results into a Series indexed by column name.
# The result is stored under its own name so that the hundredth_row function
# is not overwritten and this statement can be executed again.
hundredth_values = titanic_survival.apply(hundredth_row)
print(hundredth_values)
print("--------------------------")
def not_null_count(column):
    """Return how many values in the Series are missing.

    NOTE: despite its name, this counts the null entries, not the non-null
    ones; the name is kept so existing callers keep working.
    """
    column_null = pd.isnull(column)  # True where the value is missing
    null = column[column_null]  # only the missing entries
    return len(null)
# axis=0 (the default) hands each column to the function in turn.
column_null_count = titanic_survival.apply(not_null_count, axis=0)
print(column_null_count)
print("--------------------------")
# With axis=1, DataFrame.apply() passes each row to the function instead of
# each column.
def which_class(row):
    """Translate a row's "Pclass" value into a readable label.

    Returns "Unknown" when the value is missing, "First Class",
    "Second Class" or "Third Class" for 1, 2 and 3, and None for any other
    value.
    """
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "Unknown"
    if pclass == 1:
        return "First Class"
    if pclass == 2:
        return "Second Class"
    if pclass == 3:
        return "Third Class"
    # Any other class number has no label.
    return None
# Label every passenger: axis="columns" (same as axis=1) feeds rows to the function.
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
print("------------------------")
def is_minor(row):
    """Return True when the row's "Age" is below 18, otherwise False.

    A missing age (NaN) compares as not less than 18, so it yields False.
    """
    # bool() turns the comparison result into a plain Python True/False.
    return bool(row["Age"] < 18)
# One boolean per passenger; axis="columns" (same as axis=1) feeds rows to the function.
minors = titanic_survival.apply(is_minor, axis="columns")
# print(minors)
def generate_age_label(row):
    """Classify a row by its "Age" value.

    Returns "unknown" when the age is missing, "minor" when it is below 18,
    and "adult" otherwise.
    """
    age = row["Age"]
    # The missing-value check comes first so that NaN is not labelled "adult".
    if pd.isnull(age):
        return "unknown"
    if age < 18:
        return "minor"
    return "adult"
# Label every passenger as "minor", "adult" or "unknown" (row-wise apply).
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)
print("----------------------")
# Store the labels as a column so they can be used as a grouping key, then
# compare the mean of "Survived" across the age groups.
titanic_survival["age_labels"] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)
结果:
PassengerId Survived Pclass ... Fare Cabin Embarked
0 1 0 3 ... 7.2500 NaN S
1 2 1 1 ... 71.2833 C85 C
2 3 1 3 ... 7.9250 NaN S
3 4 1 1 ... 53.1000 C123 S
4 5 0 3 ... 8.0500 NaN S
[5 rows x 12 columns]
------------------------
5 NaN
17 NaN
19 NaN
26 NaN
28 NaN
29 NaN
31 NaN
32 NaN
36 NaN
42 NaN
45 NaN
46 NaN
47 NaN
48 NaN
55 NaN
64 NaN
65 NaN
76 NaN
77 NaN
82 NaN
87 NaN
95 NaN
101 NaN
107 NaN
109 NaN
121 NaN
126 NaN
128 NaN
140 NaN
154 NaN
..
718 NaN
727 NaN
732 NaN
738 NaN
739 NaN
740 NaN
760 NaN
766 NaN
768 NaN
773 NaN
776 NaN
778 NaN
783 NaN
790 NaN
792 NaN
793 NaN
815 NaN
825 NaN
826 NaN
828 NaN
832 NaN
837 NaN
839 NaN
846 NaN
849 NaN
859 NaN
863 NaN
868 NaN
878 NaN
888 NaN
Name: Age, Length: 177, dtype: float64
177
------------------------
nan
------------------------
29.69911764705882
------------------------
29.69911764705882
------------------------
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
-------------------------
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
--------------------------
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
--------------------------
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
--------------------------
--------------------------
28.0
1
--------------------------
PassengerId Survived Pclass ... Fare Cabin Embarked
630 631 1 1 ... 30.0000 A23 S
851 852 0 3 ... 7.7750 NaN S
493 494 0 1 ... 49.5042 NaN C
96 97 0 1 ... 34.6542 A5 C
116 117 0 3 ... 7.7500 NaN Q
672 673 0 2 ... 10.5000 NaN S
745 746 0 1 ... 71.0000 B22 S
33 34 0 2 ... 10.5000 NaN S
54 55 0 1 ... 61.9792 B30 C
280 281 0 3 ... 7.7500 NaN Q
[10 rows x 12 columns]
PassengerId Survived Pclass ... Fare Cabin Embarked
0 631 1 1 ... 30.0000 A23 S
1 852 0 3 ... 7.7750 NaN S
2 494 0 1 ... 49.5042 NaN C
3 97 0 1 ... 34.6542 A5 C
4 117 0 3 ... 7.7500 NaN Q
5 673 0 2 ... 10.5000 NaN S
6 746 0 1 ... 71.0000 B22 S
7 34 0 2 ... 10.5000 NaN S
8 55 0 1 ... 61.9792 B30 C
9 281 0 3 ... 7.7500 NaN Q
[10 rows x 12 columns]
--------------------------
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
--------------------------
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
--------------------------
0 Third Class
1 First Class
2 Third Class
3 First Class
4 Third Class
5 Third Class
6 First Class
7 Third Class
8 Third Class
9 Second Class
10 Third Class
11 First Class
12 Third Class
13 Third Class
14 Third Class
15 Second Class
16 Third Class
17 Second Class
18 Third Class
19 Third Class
20 Second Class
21 Second Class
22 Third Class
23 First Class
24 Third Class
25 Third Class
26 Third Class
27 First Class
28 Third Class
29 Third Class
...
861 Second Class
862 First Class
863 Third Class
864 Second Class
865 Second Class
866 Second Class
867 First Class
868 Third Class
869 Third Class
870 Third Class
871 First Class
872 First Class
873 Third Class
874 Second Class
875 Third Class
876 Third Class
877 Third Class
878 Third Class
879 First Class
880 Second Class
881 Third Class
882 Third Class
883 Second Class
884 Third Class
885 Third Class
886 Second Class
887 First Class
888 Third Class
889 First Class
890 Third Class
Length: 891, dtype: object
------------------------
0 adult
1 adult
2 adult
3 adult
4 adult
5 unknown
6 adult
7 minor
8 adult
9 minor
10 minor
11 adult
12 adult
13 adult
14 minor
15 adult
16 minor
17 unknown
18 adult
19 unknown
20 adult
21 adult
22 minor
23 adult
24 minor
25 adult
26 unknown
27 adult
28 unknown
29 unknown
...
861 adult
862 adult
863 unknown
864 adult
865 adult
866 adult
867 adult
868 unknown
869 minor
870 adult
871 adult
872 adult
873 adult
874 adult
875 minor
876 adult
877 adult
878 unknown
879 adult
880 adult
881 adult
882 adult
883 adult
884 adult
885 adult
886 adult
887 adult
888 unknown
889 adult
890 adult
Length: 891, dtype: object
----------------------
Survived
age_labels
adult 0.381032
minor 0.539823
unknown 0.293785
3-4:
import pandas as pd
fandango = pd.read_csv("fandango_score_comparison.csv")
# A single DataFrame column is a Series; show the first five entries of two.
series_film = fandango["FILM"]
print(series_film.iloc[0:5])
series_rt = fandango["RottenTomatoes"]
print(series_rt.iloc[0:5])
print("-----------1-------------")

# Build a Series by hand: the scores become the values, the film titles the index.
from pandas import Series
film_names = series_film.values  # .values exposes the data as a NumPy ndarray
print(type(film_names))
# print(film_names)
rt_scores = series_rt.values
# print(rt_scores)
series_custom = Series(rt_scores, index=film_names)
# With titles as the index, scores can be looked up by film name.
print(series_custom.loc[["Minions (2015)", "Leviathan (2014)"]])
print("----------2--------------")

# Integer positions are still available on a label-indexed Series.
series_custom = Series(rt_scores, index=film_names)
# series_custom[["Minions (2015)", "Leviathan (2014)"]]
fiveten = series_custom.iloc[5:10]
print(fiveten)
print("------------3------------")
# Manual route to an index-sorted Series: sort the labels as a plain list,
# then reindex the Series in that order.
original_index = series_custom.index.tolist()
# print(original_index)
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)
print("----------4--------------")
# Built-in equivalents: sort by label, or sort by value.
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
# print(sc2.iloc[0:10])
print(sc3.iloc[0:10])
print("----------5--------------")
# NumPy functions accept a Series directly, because its values are held in
# an ndarray, NumPy's core data type.
import numpy as np
# Element-wise sum of the Series with itself.
print(np.add(series_custom, series_custom))
# The next two results are computed and then discarded; wrap them in print()
# to see them when running this as a script.
# Sine of every value (a Series).
np.sin(series_custom)
# Largest value (a single number, not a Series).
np.max(series_custom)
print("----------6---------------")
# Comparing a Series with a scalar produces a boolean Series, one entry per
# film (this bare expression is evaluated and discarded).
series_custom > 50
# A boolean Series used as an indexer keeps only the entries marked True.
criteria_one = series_custom > 50
criteria_two = series_custom < 75
series_greater_than_50 = series_custom[criteria_one]
# & combines the two masks element-wise: scores strictly between 50 and 75.
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
print("-----------7---------------")
# Data alignment: arithmetic between two Series matches values by index
# label; here both Series share the film titles as their index.
rt_critics = Series(fandango["RottenTomatoes"].values, index=fandango["FILM"])
rt_users = Series(fandango["RottenTomatoes_User"].values, index=fandango["FILM"])
rt_mean = (rt_critics + rt_users) / 2
print(rt_mean)
print("--------------------------")
结果:
0 Avengers: Age of Ultron (2015)
1 Cinderella (2015)
2 Ant-Man (2015)
3 Do You Believe? (2015)
4 Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
0 74
1 85
2 80
3 18
4 14
Name: RottenTomatoes, dtype: int64
-----------1-------------
<class 'numpy.ndarray'>
Minions (2015) 54
Leviathan (2014) 99
dtype: int64
----------2--------------
The Water Diviner (2015) 63
Irrational Man (2015) 42
Top Five (2014) 86
Shaun the Sheep Movie (2015) 99
Love & Mercy (2015) 89
dtype: int64
------------3------------
'71 (2015) 97
5 Flights Up (2015) 52
A Little Chaos (2015) 40
A Most Violent Year (2014) 90
About Elly (2015) 97
Aloha (2015) 19
American Sniper (2015) 72
American Ultra (2015) 46
Amy (2015) 97
Annie (2014) 27
Ant-Man (2015) 80
Avengers: Age of Ultron (2015) 74
Big Eyes (2014) 72
Birdman (2014) 92
Black Sea (2015) 82
Black or White (2015) 39
Blackhat (2015) 34
Cake (2015) 49
Chappie (2015) 30
Child 44 (2015) 26
Cinderella (2015) 85
Clouds of Sils Maria (2015) 89
Danny Collins (2015) 77
Dark Places (2015) 26
Do You Believe? (2015) 18
Dope (2015) 87
Entourage (2015) 32
Escobar: Paradise Lost (2015) 52
Ex Machina (2015) 92
Fantastic Four (2015) 9
..
The Loft (2015) 11
The Longest Ride (2015) 31
The Man From U.N.C.L.E. (2015) 68
The Overnight (2015) 82
The Salt of the Earth (2015) 96
The Second Best Exotic Marigold Hotel (2015) 62
The SpongeBob Movie: Sponge Out of Water (2015) 78
The Stanford * Experiment (2015) 84
The Vatican Tapes (2015) 13
The Water Diviner (2015) 63
The Wedding Ringer (2015) 27
The Wolfpack (2015) 84
The Woman In Black 2 Angel of Death (2015) 22
The Wrecking Crew (2015) 93
Timbuktu (2015) 99
Tomorrowland (2015) 50
Top Five (2014) 86
Trainwreck (2015) 85
True Story (2015) 45
Two Days, One Night (2014) 97
Unbroken (2014) 51
Unfinished Business (2015) 11
Unfriended (2015) 60
Vacation (2015) 27
Welcome to Me (2015) 71
What We Do in the Shadows (2015) 96
When Marnie Was There (2015) 89
While We're Young (2015) 83
Wild Tales (2014) 96
Woman in Gold (2015) 52
Length: 146, dtype: int64
----------4--------------
Paul Blart: Mall Cop 2 (2015) 5
Hitman: Agent 47 (2015) 7
Hot Pursuit (2015) 8
Fantastic Four (2015) 9
Taken 3 (2015) 9
The Boy Next Door (2015) 10
The Loft (2015) 11
Unfinished Business (2015) 11
Mortdecai (2015) 12
Seventh Son (2015) 12
dtype: int64
----------5--------------
Avengers: Age of Ultron (2015) 148
Cinderella (2015) 170
Ant-Man (2015) 160
Do You Believe? (2015) 36
Hot Tub Time Machine 2 (2015) 28
The Water Diviner (2015) 126
Irrational Man (2015) 84
Top Five (2014) 172
Shaun the Sheep Movie (2015) 198
Love & Mercy (2015) 178
Far From The Madding Crowd (2015) 168
Black Sea (2015) 164
Leviathan (2014) 198
Unbroken (2014) 102
The Imitation Game (2014) 180
Taken 3 (2015) 18
Ted 2 (2015) 92
Southpaw (2015) 118
Night at the Museum: Secret of the Tomb (2014) 100
Pixels (2015) 34
McFarland, USA (2015) 158
Insidious: Chapter 3 (2015) 118
The Man From U.N.C.L.E. (2015) 136
Run All Night (2015) 120
Trainwreck (2015) 170
Selma (2014) 198
Ex Machina (2015) 184
Still Alice (2015) 176
Wild Tales (2014) 192
The End of the Tour (2015) 184
...
Clouds of Sils Maria (2015) 178
Testament of Youth (2015) 162
Infinitely Polar Bear (2015) 160
Phoenix (2015) 198
The Wolfpack (2015) 168
The Stanford * Experiment (2015) 168
Tangerine (2015) 190
Magic Mike XXL (2015) 124
Home (2015) 90
The Wedding Ringer (2015) 54
Woman in Gold (2015) 104
The Last Five Years (2015) 120
Mission: Impossible – Rogue Nation (2015) 184
Amy (2015) 194
Jurassic World (2015) 142
Minions (2015) 108
Max (2015) 70
Paul Blart: Mall Cop 2 (2015) 10
The Longest Ride (2015) 62
The Lazarus Effect (2015) 28
The Woman In Black 2 Angel of Death (2015) 44
Danny Collins (2015) 154
Spare Parts (2015) 104
Serena (2015) 36
Inside Out (2015) 196
Mr. Holmes (2015) 174
'71 (2015) 194
Two Days, One Night (2014) 194
Gett: The Trial of Viviane Amsalem (2015) 200
Kumiko, The Treasure Hunter (2015) 174
Length: 146, dtype: int64
----------6---------------
Avengers: Age of Ultron (2015) 74
The Water Diviner (2015) 63
Unbroken (2014) 51
Southpaw (2015) 59
Insidious: Chapter 3 (2015) 59
The Man From U.N.C.L.E. (2015) 68
Run All Night (2015) 60
5 Flights Up (2015) 52
Welcome to Me (2015) 71
Saint Laurent (2015) 51
Maps to the Stars (2015) 60
Pitch Perfect 2 (2015) 67
The Age of Adaline (2015) 54
The DUFF (2015) 71
Ricki and the Flash (2015) 64
Unfriended (2015) 60
American Sniper (2015) 72
The Hobbit: The Battle of the Five Armies (2014) 61
Paper Towns (2015) 55
Big Eyes (2014) 72
Maggie (2015) 54
Focus (2015) 57
The Second Best Exotic Marigold Hotel (2015) 62
The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015) 67
Escobar: Paradise Lost (2015) 52
Into the Woods (2014) 71
Inherent Vice (2014) 73
Magic Mike XXL (2015) 62
Woman in Gold (2015) 52
The Last Five Years (2015) 60
Jurassic World (2015) 71
Minions (2015) 54
Spare Parts (2015) 52
dtype: int64
-----------7---------------
FILM
Avengers: Age of Ultron (2015) 80.0
Cinderella (2015) 82.5
Ant-Man (2015) 85.0
Do You Believe? (2015) 51.0
Hot Tub Time Machine 2 (2015) 21.0
The Water Diviner (2015) 62.5
Irrational Man (2015) 47.5
Top Five (2014) 75.0
Shaun the Sheep Movie (2015) 90.5
Love & Mercy (2015) 88.0
Far From The Madding Crowd (2015) 80.5
Black Sea (2015) 71.0
Leviathan (2014) 89.0
Unbroken (2014) 60.5
The Imitation Game (2014) 91.0
Taken 3 (2015) 27.5
Ted 2 (2015) 52.0
Southpaw (2015) 69.5
Night at the Museum: Secret of the Tomb (2014) 54.0
Pixels (2015) 35.5
McFarland, USA (2015) 84.0
Insidious: Chapter 3 (2015) 57.5
The Man From U.N.C.L.E. (2015) 74.0
Run All Night (2015) 59.5
Trainwreck (2015) 79.5
Selma (2014) 92.5
Ex Machina (2015) 89.0
Still Alice (2015) 86.5
Wild Tales (2014) 94.0
The End of the Tour (2015) 90.5
...
Clouds of Sils Maria (2015) 78.0
Testament of Youth (2015) 80.0
Infinitely Polar Bear (2015) 78.0
Phoenix (2015) 90.0
The Wolfpack (2015) 78.5
The Stanford * Experiment (2015) 85.5
Tangerine (2015) 90.5
Magic Mike XXL (2015) 63.0
Home (2015) 55.0
The Wedding Ringer (2015) 46.5
Woman in Gold (2015) 66.5
The Last Five Years (2015) 60.0
Mission: Impossible – Rogue Nation (2015) 91.0
Amy (2015) 94.0
Jurassic World (2015) 76.0
Minions (2015) 53.0
Max (2015) 54.0
Paul Blart: Mall Cop 2 (2015) 20.5
The Longest Ride (2015) 52.0
The Lazarus Effect (2015) 18.5
The Woman In Black 2 Angel of Death (2015) 23.5
Danny Collins (2015) 76.0
Spare Parts (2015) 67.5
Serena (2015) 21.5
Inside Out (2015) 94.0
Mr. Holmes (2015) 82.5
'71 (2015) 89.5
Two Days, One Night (2014) 87.5
Gett: The Trial of Viviane Amsalem (2015) 90.5
Kumiko, The Treasure Hunter (2015) 75.0
Length: 146, dtype: float64
--------------------------
3-5:
import pandas as pd
# set_index returns a new DataFrame indexed by the values of the given
# column. drop=False keeps "FILM" as a regular column as well, which is why
# the title appears twice in the printed rows.
fandango = pd.read_csv("fandango_score_comparison.csv")
print(type(fandango))
fandango_films = fandango.set_index("FILM", drop=False)
# print(fandango_films.index)
print("-----------1-------------")

# Slice a range of rows by label, with plain brackets or with .loc[].
print(fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])
print(fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])
# One label selects a single film and returns a Series.
print(fandango_films.loc["Kumiko, The Treasure Hunter (2015)"])
# A list of labels selects several films and returns a DataFrame.
movies = ["Kumiko, The Treasure Hunter (2015)", "Do You Believe? (2015)", "Ant-Man (2015)"]
print(fandango_films.loc[movies])
print("-----------2-------------")
# DataFrame.apply() runs a function over each Series of the frame: each
# column by default, each row when axis=1 / "columns" is given.
import numpy as np
# dtypes is a Series mapping column name -> dtype.
types = fandango_films.dtypes
# print(types)
# Keep the names of the float64 columns, then select just those columns.
float_columns = types.index[types.values == "float64"]
float_df = fandango_films[float_columns]
# print(float_df)
# Standard deviation of every float column; `col` is one column as a Series.
# NOTE(review): the lambda wrapper is kept as written; presumably pandas could
# treat a bare np.std differently, so verify before simplifying it.
deviations = float_df.apply(lambda col: np.std(col))
print(deviations)
print("-----------3-------------")
# Row-wise: spread between the two normalised user scores of each film.
rt_mt_user = float_df[["RT_user_norm", "Metacritic_user_nom"]]
print(rt_mt_user.apply(lambda row: np.std(row), axis="columns"))
结果:
<class 'pandas.core.frame.DataFrame'>
-----------1-------------
FILM ... Fandango_Difference
FILM ...
Avengers: Age of Ultron (2015) Avengers: Age of Ultron (2015) ... 0.5
Cinderella (2015) Cinderella (2015) ... 0.5
Ant-Man (2015) Ant-Man (2015) ... 0.5
Do You Believe? (2015) Do You Believe? (2015) ... 0.5
Hot Tub Time Machine 2 (2015) Hot Tub Time Machine 2 (2015) ... 0.5
[5 rows x 22 columns]
FILM ... Fandango_Difference
FILM ...
Avengers: Age of Ultron (2015) Avengers: Age of Ultron (2015) ... 0.5
Cinderella (2015) Cinderella (2015) ... 0.5
Ant-Man (2015) Ant-Man (2015) ... 0.5
Do You Believe? (2015) Do You Believe? (2015) ... 0.5
Hot Tub Time Machine 2 (2015) Hot Tub Time Machine 2 (2015) ... 0.5
[5 rows x 22 columns]
FILM Kumiko, The Treasure Hunter (2015)
RottenTomatoes 87
RottenTomatoes_User 63
Metacritic 68
Metacritic_User 6.4
IMDB 6.7
Fandango_Stars 3.5
Fandango_Ratingvalue 3.5
RT_norm 4.35
RT_user_norm 3.15
Metacritic_norm 3.4
Metacritic_user_nom 3.2
IMDB_norm 3.35
RT_norm_round 4.5
RT_user_norm_round 3
Metacritic_norm_round 3.5
Metacritic_user_norm_round 3
IMDB_norm_round 3.5
Metacritic_user_vote_count 19
IMDB_user_vote_count 5289
Fandango_votes 41
Fandango_Difference 0
Name: Kumiko, The Treasure Hunter (2015), dtype: object
FILM ... Fandango_Difference
FILM ...
Kumiko, The Treasure Hunter (2015) Kumiko, The Treasure Hunter (2015) ... 0.0
Do You Believe? (2015) Do You Believe? (2015) ... 0.5
Ant-Man (2015) Ant-Man (2015) ... 0.5
[3 rows x 22 columns]
-----------2-------------
Metacritic_User 1.505529
IMDB 0.955447
Fandango_Stars 0.538532
Fandango_Ratingvalue 0.501106
RT_norm 1.503265
RT_user_norm 0.997787
Metacritic_norm 0.972522
Metacritic_user_nom 0.752765
IMDB_norm 0.477723
RT_norm_round 1.509404
RT_user_norm_round 1.003559
Metacritic_norm_round 0.987561
Metacritic_user_norm_round 0.785412
IMDB_norm_round 0.501043
Fandango_Difference 0.152141
dtype: float64
-----------3-------------
FILM
Avengers: Age of Ultron (2015) 0.375
Cinderella (2015) 0.125
Ant-Man (2015) 0.225
Do You Believe? (2015) 0.925
Hot Tub Time Machine 2 (2015) 0.150
The Water Diviner (2015) 0.150
Irrational Man (2015) 0.575
Top Five (2014) 0.100
Shaun the Sheep Movie (2015) 0.150
Love & Mercy (2015) 0.050
Far From The Madding Crowd (2015) 0.050
Black Sea (2015) 0.150
Leviathan (2014) 0.175
Unbroken (2014) 0.125
The Imitation Game (2014) 0.250
Taken 3 (2015) 0.000
Ted 2 (2015) 0.175
Southpaw (2015) 0.050
Night at the Museum: Secret of the Tomb (2014) 0.000
Pixels (2015) 0.025
McFarland, USA (2015) 0.425
Insidious: Chapter 3 (2015) 0.325
The Man From U.N.C.L.E. (2015) 0.025
Run All Night (2015) 0.350
Trainwreck (2015) 0.350
Selma (2014) 0.375
Ex Machina (2015) 0.175
Still Alice (2015) 0.175
Wild Tales (2014) 0.100
The End of the Tour (2015) 0.350
...
Clouds of Sils Maria (2015) 0.100
Testament of Youth (2015) 0.000
Infinitely Polar Bear (2015) 0.075
Phoenix (2015) 0.025
The Wolfpack (2015) 0.075
The Stanford * Experiment (2015) 0.050
Tangerine (2015) 0.325
Magic Mike XXL (2015) 0.250
Home (2015) 0.200
The Wedding Ringer (2015) 0.825
Woman in Gold (2015) 0.225
The Last Five Years (2015) 0.225
Mission: Impossible – Rogue Nation (2015) 0.250
Amy (2015) 0.075
Jurassic World (2015) 0.275
Minions (2015) 0.125
Max (2015) 0.350
Paul Blart: Mall Cop 2 (2015) 0.300
The Longest Ride (2015) 0.625
The Lazarus Effect (2015) 0.650
The Woman In Black 2 Angel of Death (2015) 0.475
Danny Collins (2015) 0.100
Spare Parts (2015) 0.300
Serena (2015) 0.700
Inside Out (2015) 0.025
Mr. Holmes (2015) 0.025
'71 (2015) 0.175
Two Days, One Night (2014) 0.250
Gett: The Trial of Viviane Amsalem (2015) 0.200
Kumiko, The Treasure Hunter (2015) 0.025
Length: 146, dtype: float64