这两天报名参加了阿里天池的“公交线路客流预测”赛,就顺便把以前看过的 Kaggle Titanic 训练赛代码拿出来,再熟悉一下数据的一些处理方法。题目要求根据 Titanic 乘客的信息来预测乘客的生还情况,给了 titanic_test.csv 和 titanic_train.csv 两张数据表。首先是表的一些字段说明:
-
PassengerId
-- A numerical id assigned to each passenger. -
Survived
-- Whether the passenger survived (1), or didn't (0). We'll be making predictions for this column. -
Pclass
-- The class the passenger was in -- first class (1), second class (2), or third class (3). -
Name
-- the name of the passenger. -
Sex
-- The gender of the passenger -- male or female. -
Age
-- The age of the passenger. Fractional. -
SibSp
-- The number of siblings and spouses the passenger had on board. -
Parch
-- The number of parents and children the passenger had on board. -
Ticket
-- The ticket number of the passenger. -
Fare
-- How much the passenger paid for the ticket. -
Cabin
-- Which cabin the passenger was in. -
Embarked
-- Where the passenger boarded the Titanic.
下面是python处理代码:
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# Load the training and test tables; Age is read as float64 in both.
train = pd.read_csv("titanic_train.csv", dtype={"Age": np.float64})
test = pd.read_csv("titanic_test.csv", dtype={"Age": np.float64})

print("\n\nTop of the training data:")
print(train.head())

print("\n\nSummary statistics of training data")
print(train.describe())

# train.to_csv('copy_of_the_training_data.csv', index=False)

# Missing ages are replaced by the sentinel value -1.
train["Age"] = train["Age"].fillna(-1)
test["Age"] = test["Age"].fillna(-1)

# Encode Sex as an integer: male -> 0, female -> 1.
train.loc[train["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
test.loc[test["Sex"] == "female", "Sex"] = 1

# Missing embarkation ports are filled with "S" before encoding.
print(train["Embarked"].unique())
train["Embarked"] = train["Embarked"].fillna("S")
test["Embarked"] = test["Embarked"].fillna("S")

# Encode Embarked as an integer: S -> 0, C -> 1, Q -> 2.
train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2

test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2

# Missing fares are filled with the median fare of the same table.
train["Fare"] = train["Fare"].fillna(train["Fare"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Generating a FamilySize column: siblings/spouses plus parents/children.
train["FamilySize"] = train["SibSp"] + train["Parch"]
# Both operands must come from the test table; mixing in train["SibSp"]
# would pair each test passenger with an unrelated training row.
test["FamilySize"] = test["SibSp"] + test["Parch"]

train["NameLength"] = train["Name"].apply(lambda x: len(x))
test["NameLength"] = test["Name"].apply(lambda x: len(x))

import re


def get_title(name):
    """Return the title embedded in a passenger name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Args:
        name: The passenger name string to search.

    Returns:
        The matched title without the trailing period, or an empty string
        when the pattern does not occur in ``name``.
    """
    # Raw string: "\." is not a valid string escape, so a plain literal
    # triggers an invalid-escape warning on current Python versions.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""


# Extract the title from every passenger name in both tables.
train_titles = train["Name"].apply(get_title)
test_titles = test["Name"].apply(get_title)

# Map each title to an integer. Some titles are very rare, and are compressed
# into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2, "Dona": 9}

# Translate every title in one pass. A title that is absent from the mapping
# (including the "" returned when no title is found) becomes code 0, so the
# column is always integer-typed and never carries leftover strings into the
# feature selection and classifier steps.
train_titles = train_titles.map(title_mapping).fillna(0).astype(int)
test_titles = test_titles.map(title_mapping).fillna(0).astype(int)

# Add in the title column.
train["Title"] = train_titles
test["Title"] = test_titles
# print(test["Title"])

import operator

# A dictionary mapping family name to id
family_id_mapping = {}  # "<last name><family size>" key -> integer id, filled by get_family_id


def get_family_id(row):
    """Return the integer family id for a passenger row, creating it if new.

    The family key is the text before the first comma in ``row["Name"]``
    followed by ``row["FamilySize"]``. Keys are stored in the module-level
    ``family_id_mapping``, so the same key yields the same id on every call.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) providing the
            "Name" and "FamilySize" entries.

    Returns:
        The integer id associated with the row's family key.
    """
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family id
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the id in the mapping
    if family_id not in family_id_mapping:
        # Ids are handed out consecutively starting at 1 and never removed,
        # so the next free id is the current entry count plus one. This
        # avoids rescanning the whole mapping for its maximum on each insert.
        family_id_mapping[family_id] = len(family_id_mapping) + 1
    return family_id_mapping[family_id]


# Get the family ids with the apply method
# Training table first, so its families receive the lowest ids.
train_family_ids = train.apply(get_family_id, axis=1)
# There are a lot of family ids, so every family with fewer than 3 members
# is collapsed into the single code -1.
is_small_train_family = train["FamilySize"] < 3
train_family_ids[is_small_train_family] = -1
train["FamilyId"] = train_family_ids

# Same treatment for the test table, reusing the ids already assigned.
test_family_ids = test.apply(get_family_id, axis=1)
is_small_test_family = test["FamilySize"] < 3
test_family_ids[is_small_test_family] = -1
test["FamilyId"] = test_family_ids

alg = AdaBoostClassifier()
# alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "FamilySize",
    "Title",
    "FamilyId",
]

# Perform feature selection: univariate F-test of each predictor against
# the Survived label.
selector = SelectKBest(f_classif, k=5)
selector.fit(train[predictors], train["Survived"])

# Turn the raw p-values into scores (a smaller p-value gives a larger score).
scores = -np.log10(selector.pvalues_)

# Plot one bar per predictor, labelled with the predictor name.
bar_positions = range(len(predictors))
plt.bar(bar_positions, scores)
plt.xticks(bar_positions, predictors, rotation='vertical')
plt.show()

print("#######")
# Final feature set used for the model.
predictors = ["Pclass", "Sex", "Fare", "Title"]

# Split into the training inputs/labels and the test inputs.
x_train, y_train = train[predictors], train['Survived']
x_test = test[predictors]

# Train on the full training table, then predict survival for the test table.
alg.fit(x_train, y_train)
predictions = alg.predict(x_test)

# Two-column result: passenger id and predicted label.
submission_columns = {
    "PassengerId": test["PassengerId"],
    "Survived": predictions,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission.csv', index=False)
顺便总结一下上面用到的格式化字符串方法 str.format() 的输出结果:
# coding=utf-8
# Demonstration of the str.format() method.
import math

# Auto-numbered '{}' placeholders are filled in argument order.
question = 'what\'s {},{}'
print(question.format('wrong', 'hong!'))

qq_line = '{},I\'m {},my qq is {}'
print(qq_line.format('hello', 'hong', ''))

mail_line = '{},I\'m {},my E-mail is {}'
print(mail_line.format('Hello', 'Hongten', 'hongtenzone@foxmail.com'))

# Indexed placeholders such as {0} and {1} pick arguments by position.
swapped = '{1},{0}'
print(swapped.format('hello', 'world'))

# Named '{name}' placeholders are filled from keyword arguments.
named = 'Hi,{name},{message}'
print(named.format(name="hongten", message='how are you?'))

# Format control: a spec after the colon sets precision and field width.
pi_line = 'The value of PI is approximately {0:.3f}.'
print(pi_line.format(math.pi))

table = {'Sjoerd': 4127, 'Jack': 4098, 'Dcab': 7678}
row_template = '{0:10}==>{1:10d}'
for name, phone in table.items():
    print(row_template.format(name, phone))
还有就是python中正则表达式的模块学习:
# coding=utf-8
# Demonstration of the re module. print() is called as a function so the
# snippet runs on Python 3, like the other scripts in this file.
import re

# re.match(): the pattern must match at the beginning of the string.
s = 'I like hongten! he is so cool!'
m = re.match(r'(\w+)\s', s)
if m:
    # group(0) is the whole match; parentheses in a pattern mark the
    # groups to extract, which m.groups() would return.
    print(m.group(0))
else:
    print('not match')

# re.search(): the pattern may match anywhere in the string.
text = "JGood is a handsome boy, he is cool, clever, and so on..."
m = re.search(r'\shan(ds)ome\s', text)
if m:
    print(m.groups())
else:
    print("No serch")

# re.sub(): replace every match of the pattern in the string.
text = "JGood is a handsome boy, he is cool, clever, and so on..."
print(re.sub(r'\s+', '-', text))

# Other functions: re.split, re.findall, re.compile.
# re.compile() builds a reusable pattern object.
re_telephone = re.compile(r'^(\d{3})-(\d{3,8})$')
print(re_telephone.match('010-8086').groups())