Overview
1. Problem Description
The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.
One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.
In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy.
2. Basic Approach
A quick read of the problem shows this is a classification task: given the personal information (features) of one batch of Titanic passengers together with whether they survived (labels), we must predict survival for another batch. The training set (train.csv) has the columns PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, and Embarked.
First, look at the data. The Age column has quite a few missing values that need some handling. The Name, Ticket, Cabin, and Embarked columns do not look useful at first glance, so a first attempt can simply drop them.
The useful features here are mostly discrete, so the intuitive choice is a decision tree, and scikit-learn provides one out of the box. Using the library's decision tree directly, dropping rows with a missing Age, and taking only the most basic features (Pclass, Sex, SibSp, Parch, and Fare, with missing fares set to 7), I trained a decision tree model. The offline test error rate was 19%; submitting online gave a final accuracy of 72%. The program is as follows:
import csv
from numpy import *
from sklearn import tree
from sklearn import cross_validation  # old-sklearn module; modern releases moved this to sklearn.model_selection

def total_test_online():
    train_data, train_label = load_data_set()
    train_feature0, new_label0 = feature_extraction(train_data, train_label)  # only rows that carry Age info
    model0 = build_model(train_feature0, new_label0)
    train_feature1 = feature_extraction1(train_data)  # features without Age
    model1 = build_model(train_feature1, train_label)
    test_data = load_test_set()
    result = test(test_data, model0, model1, train_label)
    gen_res_file(result)

def total_test_offline():
    data, label = load_data_set()
    train_data, train_label, test_data, test_label = pre_data(data, label)
    train_feature0, new_label0 = feature_extraction(train_data, train_label)  # only rows that carry Age info
    model0 = build_model(train_feature0, new_label0)
    train_feature1 = feature_extraction1(train_data)  # features without Age
    model1 = build_model(train_feature1, train_label)
    result = test(test_data, model0, model1, train_label)
    judge_off_result(result, test_label)
    # val_res = val_model(model1, train_feature, train_label)
def load_data_set():
    f = open('train.csv', 'rb')  # Python 2 csv idiom: binary mode
    lines = csv.reader(f)
    l = []; train_data = []; train_label = []
    for line in lines:
        l.append(line)
    del l[0]  # drop the header row
    for line in l:
        train_label.append(int(line[1]))  # Survived
        train_data.append(line[2:])       # Pclass onwards
    return train_data, train_label
def load_test_set():
    f = open('test.csv', 'rb')
    lines = csv.reader(f)
    l = []; test_data = []
    for line in lines:
        l.append(line)
    del l[0]  # drop the header row
    for line in l:
        test_data.append(line[1:])  # Pclass onwards (test.csv has no Survived column)
    return test_data
def pre_data(data, label):
    # hold out 20% of the training file as an offline test set
    train_data, test_data, train_label, test_label = cross_validation.train_test_split(
        data, label, test_size=0.2, random_state=0)
    return train_data, train_label, test_data, test_label

def judge_off_result(result, test_label):
    tlen = len(test_label)
    error = 0
    for i in range(tlen):
        if result[i][0] != test_label[i]:
            error = error + 1
    res = float(error) / tlen
    print "error rate is: %f" % res
def test(test_data, model0, model1, train_label):
    # note: model0 (the Age-aware model) is passed in but not used yet
    test_label = []
    for line in test_data:
        curr_fea = feature_extraction1([line])
        curr_label = model1.predict(curr_fea)
        test_label.append(curr_label)
    return test_label

def gen_res_file(result):
    f = open('result.csv', 'wb')
    my_save = csv.writer(f)
    my_save.writerow(['PassengerId', 'Survived'])
    cnt = 1
    for i in result:
        my_save.writerow([cnt + 891, i[0]])  # test-set PassengerId starts at 892
        cnt = cnt + 1
    f.close()
def feature_extraction1(train_data):
    # features without Age: Pclass, Sex, Fare, SibSp, Parch
    feature = []
    for line in train_data:
        curr_feature = []
        curr_feature.append(int(line[0]))  # Pclass
        # 'female' is longer than 'male', so compare string lengths: female -> 2, male -> 1
        if cmp(len(line[2]), len('male')) == 1:
            curr_feature.append(2)
        else:
            curr_feature.append(1)
        if cmp(line[7], ''):  # Fare present
            if line[7].find('.') == -1:
                curr_feature.append(int(line[7]))
            else:
                curr_feature.append(int(line[7].split('.')[0]))  # truncate the decimals
        else:
            curr_feature.append(7)  # default fare when missing
        curr_feature.append(int(line[4]))  # SibSp
        curr_feature.append(int(line[5]))  # Parch
        feature.append(curr_feature)
    return feature
def feature_extraction(train_data, train_label):
    # features including Age; rows with a missing Age are skipped,
    # so a filtered label list is returned alongside the features
    feature = []; new_label = []
    idx = -1
    for line in train_data:
        curr_feature = []
        idx = idx + 1
        curr_feature.append(int(line[0]))  # Pclass
        # female -> 1, male -> 0 (judged by string length)
        if cmp(len(line[2]), len('male')) == 1:
            curr_feature.append(1)
        else:
            curr_feature.append(0)
        if cmp(line[3], ''):  # Age present
            if line[3].find('.') == -1:
                curr_feature.append(int(line[3]))
            else:
                curr_feature.append(int(line[3].split('.')[0]))
        else:
            continue  # Age missing: skip this row entirely
        new_label.append(train_label[idx])
        curr_feature.append(int(line[4]))  # SibSp
        curr_feature.append(int(line[5]))  # Parch
        if cmp(line[7], ''):  # Fare present
            if line[7].find('.') == -1:
                curr_feature.append(int(line[7]))
            else:
                curr_feature.append(int(line[7].split('.')[0]))
        else:
            curr_feature.append(7)
        # curr_feature.append(int(line[9]))  # Embarked
        feature.append(curr_feature)
    return feature, new_label
def build_model(train_feature, train_label):
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_feature, train_label)
    return clf

def val_model(model, train_feature, train_label):
    res = model.predict(train_feature[4])  # sanity check on a single training sample
    return res
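For reference, a minimal entry point for the listing above (my sketch; the original post does not show how it is invoked) could be:

# hypothetical driver, assuming train.csv and test.csv sit in the working directory
if __name__ == '__main__':
    total_test_offline()    # prints the error rate on the 20% hold-out
    # total_test_online()   # uncomment to write result.csv for submission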
The same pipeline rewritten with the pandas library:
import numpy as np
import pandas as pd
import random as rd
from sklearn import tree
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier  # missing from the original listing

def load_data(file_name):
    input_df = pd.read_csv(file_name)  # the first row is used as the header
    fix_miss_value(input_df)
    convert_data(input_df)
    return input_df

def fix_miss_value(df):
    # chained assignment works on the old pandas this was written against;
    # modern pandas would want df.loc[...] instead
    df.Embarked[df.Embarked.isnull()] = df.Embarked.dropna().mode().values  # fill with the most common port
    df.Age[df.Age.isnull()] = df.Age.dropna().mean()  # fill with the mean age
    del df['Cabin']  # too sparse to keep

def convert_data(df):
    df.Sex[df['Sex'] == 'male'] = 1
    df.Sex[df['Sex'] == 'female'] = 0
    df['fare_bin'] = pd.qcut(df.Fare, 4)  # fare quartiles
    df['fare_id'] = pd.factorize(df.fare_bin)[0] + 1
    df['embarked_id'] = pd.factorize(df.Embarked)[0] + 1

def off_feature_extraction(df):
    new_df = df.drop(['fare_bin', 'Fare', 'PassengerId', 'Name', 'Ticket', 'Embarked'], axis=1)
    new_df.columns = [0, 1, 2, 3, 4, 5, 6, 7]
    label = new_df[0].values           # Survived
    feature = new_df.ix[:, 1:].values  # everything else
    return label, feature

def off_model_building(feature, label):
    train_feature, test_feature, train_label, test_label = cross_validation.train_test_split(
        feature, label, test_size=0.3, random_state=0)
    clf = RandomForestClassifier(random_state=1, n_estimators=150,
                                 min_samples_split=4, min_samples_leaf=2)
    clf = clf.fit(train_feature, train_label)
    predict_label = clf.predict(test_feature)
    res = predict_label ^ test_label  # XOR flags the mispredicted rows
    accuracy = 1 - float(sum(res)) / len(test_label)
    print "accuracy rate is: %f" % accuracy
The above is the most basic, straightforward approach. Preprocessing with pandas keeps the program concise and tidy. In a data-mining competition, picking the most basic features and calling a library model is enough to get an initial result, but the heart of the contest is the subsequent feature selection and model tuning. The rest of this post records my attempts at improvement.
3. Improvements
1. Add new features: Age and Fare. Since both fields vary over a wide range, I binned them coarsely: age into three groups (child, middle-aged, elderly) and fare into three bins (1, 2, 3). The aim was to shrink the candidate feature space and guard against overfitting, because the training set is genuinely small. I also switched to the better-performing random forest model, and the offline accuracy rose to 82.8%.
The online submission then scored 77.033%, a clear improvement. The offline accuracy moved only slightly while the online score jumped; I suspect the offline data volume is simply too small, and the offline validation split smaller still, so it cannot reflect how the model really behaves online.
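With this little data, a single hold-out split is noisy; k-fold cross-validation gives a steadier offline estimate. Below is a minimal sketch using the same era's sklearn cross_validation API, where the feature/label arrays and the 5-fold setting are my assumptions rather than anything from the original post:

# hypothetical sketch: average accuracy over 5 folds instead of one split
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1, n_estimators=150,
                             min_samples_split=4, min_samples_leaf=2)
scores = cross_validation.cross_val_score(clf, feature, label, cv=5)
print "mean cv accuracy: %f" % scores.mean()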
The feature-extraction program at this stage:
def feature_extraction1(train_data):
    feature = []
    for line in train_data:
        curr_feature = []
        curr_feature.append(int(line[0]))  # Pclass
        # female -> 2, male -> 1 (judged by string length)
        if cmp(len(line[2]), len('male')) == 1:
            curr_feature.append(2)
        else:
            curr_feature.append(1)
        if cmp(line[3], ''):  # Age present
            if line[3].find('.') == -1:
                tmp = int(line[3])
            else:
                tmp = int(line[3].split('.')[0])
            # age bins: child / middle-aged / elderly
            if tmp < 10:
                curr_feature.append(1)
            if tmp >= 10 and tmp <= 60:
                curr_feature.append(2)
            if tmp > 60:
                curr_feature.append(3)
        else:
            curr_feature.append(2)  # Age missing: default to the middle bin
        if cmp(line[7], ''):  # Fare present
            if line[7].find('.') == -1:
                tmp = int(line[7])
            else:
                tmp = int(line[7].split('.')[0])
            # fare bins: <50 / 50-100 / >100
            if tmp < 50:
                curr_feature.append(1)
            if tmp >= 50 and tmp <= 100:
                curr_feature.append(2)
            if tmp > 100:
                curr_feature.append(3)
        else:
            curr_feature.append(2)  # Fare missing: default to the middle bin
        curr_feature.append(int(line[4]))  # SibSp
        curr_feature.append(int(line[5]))  # Parch
        # Embarked: S -> 1, C -> 2, otherwise 3
        if line[9] == 'S':
            curr_feature.append(1)
        elif line[9] == 'C':
            curr_feature.append(2)
        else:
            curr_feature.append(3)
        '''
        if int(line[4]) > 0:
            curr_feature.append(1)  # has siblings/spouse aboard
        else:
            curr_feature.append(0)
        if int(line[5]) > 0:
            curr_feature.append(1)  # has parents/children aboard
        else:
            curr_feature.append(0)
        '''
        feature.append(curr_feature)
    return feature
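As an aside, the hand-rolled binning above can be written much more compactly with pandas. A rough sketch (my addition; the cut points mirror the code above, though pd.cut treats bin edges as right-inclusive, and the fill values for missing Age/Fare are arbitrary picks that land in the same default bins):

# hypothetical pandas equivalent of the manual binning
import pandas as pd

df = pd.read_csv('train.csv')
df['age_bin'] = pd.cut(df.Age.fillna(30), bins=[0, 10, 60, 200], labels=[1, 2, 3])
df['fare_bin'] = pd.cut(df.Fare.fillna(7), bins=[-1, 50, 100, 10000], labels=[1, 2, 3])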
Up to this point all of my features have been plain category IDs; I had not constructed any derived features such as survival rates. After reading some other blogs, several of which suggest adding ratio features like the per-sex survival rate, I gave it a try. Adding survival rates for Sex and Pclass raised the offline accuracy to 83.95%, but the online score fell to 76.077%. The program that builds the survival-rate dictionaries follows:
def radio_feature(train_data, train_label):
    # count survivors per Pclass and per Sex
    sex_dic = {}; pclass_dic = {}
    idx = 0
    for line in train_data:
        if int(line[0]) not in pclass_dic:
            if train_label[idx] == 1:
                pclass_dic[int(line[0])] = 1  # first survivor seen for this class
        else:
            if train_label[idx] == 1:
                pclass_dic[int(line[0])] = pclass_dic[int(line[0])] + 1
        if line[2] not in sex_dic:
            if train_label[idx] == 1:
                sex_dic[line[2]] = 1  # first survivor seen for this sex
        else:
            if train_label[idx] == 1:
                sex_dic[line[2]] = sex_dic[line[2]] + 1
        idx = idx + 1
    # note: dividing by the total passenger count yields the fraction
    # "survived and in this group", not the within-group survival rate
    tlen = len(train_data)
    for key in sex_dic.keys():
        sex_dic[key] = sex_dic[key] / float(tlen)
    for key in pclass_dic.keys():
        pclass_dic[key] = pclass_dic[key] / float(tlen)
    return pclass_dic, sex_dic
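For comparison, the true within-group survival rates are a one-liner per column with pandas groupby (my sketch, not part of the original program):

# hypothetical sketch: per-group survival rates via groupby
import pandas as pd

df = pd.read_csv('train.csv')
print df.groupby('Sex').Survived.mean()     # survival rate for each sex
print df.groupby('Pclass').Survived.mean()  # survival rate for each class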
I have run out of strong improvement ideas at this point, so the next step is to study other people's write-ups, e.g. http://www.cnblogs.com/north-north/p/4358084.html.