Overview
1. Problem Description
The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.
One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.
In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy.
2. Basic Approach
A quick read of the problem shows this is a classification task: given the personal information (features) of one batch of Titanic passengers together with whether they survived (labels), we must predict survival for another batch. The training set (train.csv) has the columns PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, and Embarked.
First, look at the data. The Age column has quite a few missing values that need some handling. The Name, Ticket, Cabin, and Embarked columns do not look useful at first glance, so a first attempt can simply drop them.
The useful features here are mostly discrete, so the intuitive choice is a decision tree, and scikit-learn provides one out of the box. Using the library's decision tree directly, dropping rows with a missing Age, and taking only the most basic features (Pclass, Sex, SibSp, Parch, and Fare, with missing fares set to 7), I trained a decision tree model. The offline test error rate was 19%; submitting online gave a final accuracy of 72%. The program is as follows:
import csv
from numpy import *
from sklearn import tree
from sklearn import cross_validation  # old-sklearn module; modern releases moved this to sklearn.model_selection

def total_test_online():
    train_data, train_label = load_data_set()
    train_feature0, new_label0 = feature_extraction(train_data, train_label)  # only rows that carry Age info
    model0 = build_model(train_feature0, new_label0)
    train_feature1 = feature_extraction1(train_data)  # features without Age
    model1 = build_model(train_feature1, train_label)
    test_data = load_test_set()
    result = test(test_data, model0, model1, train_label)
    gen_res_file(result)

def total_test_offline():
    data, label = load_data_set()
    train_data, train_label, test_data, test_label = pre_data(data, label)
    train_feature0, new_label0 = feature_extraction(train_data, train_label)  # only rows that carry Age info
    model0 = build_model(train_feature0, new_label0)
    train_feature1 = feature_extraction1(train_data)  # features without Age
    model1 = build_model(train_feature1, train_label)
    result = test(test_data, model0, model1, train_label)
    judge_off_result(result, test_label)
    # val_res = val_model(model1, train_feature, train_label)
def load_data_set():
    f = open('train.csv', 'rb')  # Python 2 csv idiom: binary mode
    lines = csv.reader(f)
    l = []; train_data = []; train_label = []
    for line in lines:
        l.append(line)
    del l[0]  # drop the header row
    for line in l:
        train_label.append(int(line[1]))  # Survived
        train_data.append(line[2:])       # Pclass onwards
    return train_data, train_label
def load_test_set():
    f = open('test.csv', 'rb')
    lines = csv.reader(f)
    l = []; test_data = []
    for line in lines:
        l.append(line)
    del l[0]  # drop the header row
    for line in l:
        test_data.append(line[1:])  # Pclass onwards (test.csv has no Survived column)
    return test_data
def pre_data(data, label):
    # hold out 20% of the training file as an offline test set
    train_data, test_data, train_label, test_label = cross_validation.train_test_split(
        data, label, test_size=0.2, random_state=0)
    return train_data, train_label, test_data, test_label

def judge_off_result(result, test_label):
    tlen = len(test_label)
    error = 0
    for i in range(tlen):
        if result[i][0] != test_label[i]:
            error = error + 1
    res = float(error) / tlen
    print "error rate is: %f" % res
def test(test_data, model0, model1, train_label):
    # note: model0 (the Age-aware model) is passed in but not used yet
    test_label = []
    for line in test_data:
        curr_fea = feature_extraction1([line])
        curr_label = model1.predict(curr_fea)
        test_label.append(curr_label)
    return test_label

def gen_res_file(result):
    f = open('result.csv', 'wb')
    my_save = csv.writer(f)
    my_save.writerow(['PassengerId', 'Survived'])
    cnt = 1
    for i in result:
        my_save.writerow([cnt + 891, i[0]])  # test-set PassengerId starts at 892
        cnt = cnt + 1
    f.close()
def feature_extraction1(train_data):
    # features without Age: Pclass, Sex, Fare, SibSp, Parch
    feature = []
    for line in train_data:
        curr_feature = []
        curr_feature.append(int(line[0]))  # Pclass
        # 'female' is longer than 'male', so compare string lengths: female -> 2, male -> 1
        if cmp(len(line[2]), len('male')) == 1:
            curr_feature.append(2)
        else:
            curr_feature.append(1)
        if cmp(line[7], ''):  # Fare present
            if line[7].find('.') == -1:
                curr_feature.append(int(line[7]))
            else:
                curr_feature.append(int(line[7].split('.')[0]))  # truncate the decimals
        else:
            curr_feature.append(7)  # default fare when missing
        curr_feature.append(int(line[4]))  # SibSp
        curr_feature.append(int(line[5]))  # Parch
        feature.append(curr_feature)
    return feature
def feature_extraction(train_data, train_label):
    # features including Age; rows with a missing Age are skipped,
    # so a filtered label list is returned alongside the features
    feature = []; new_label = []
    idx = -1
    for line in train_data:
        curr_feature = []
        idx = idx + 1
        curr_feature.append(int(line[0]))  # Pclass
        # female -> 1, male -> 0 (judged by string length)
        if cmp(len(line[2]), len('male')) == 1:
            curr_feature.append(1)
        else:
            curr_feature.append(0)
        if cmp(line[3], ''):  # Age present
            if line[3].find('.') == -1:
                curr_feature.append(int(line[3]))
            else:
                curr_feature.append(int(line[3].split('.')[0]))
        else:
            continue  # Age missing: skip this row entirely
        new_label.append(train_label[idx])
        curr_feature.append(int(line[4]))  # SibSp
        curr_feature.append(int(line[5]))  # Parch
        if cmp(line[7], ''):  # Fare present
            if line[7].find('.') == -1:
                curr_feature.append(int(line[7]))
            else:
                curr_feature.append(int(line[7].split('.')[0]))
        else:
            curr_feature.append(7)
        # curr_feature.append(int(line[9]))  # Embarked
        feature.append(curr_feature)
    return feature, new_label
def build_model(train_feature, train_label):
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_feature, train_label)
    return clf

def val_model(model, train_feature, train_label):
    res = model.predict(train_feature[4])  # sanity check on a single training sample
    return res
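For reference, a minimal entry point for the listing above (my sketch; the original post does not show how it is invoked) could be:

# hypothetical driver, assuming train.csv and test.csv sit in the working directory
if __name__ == '__main__':
    total_test_offline()    # prints the error rate on the 20% hold-out
    # total_test_online()   # uncomment to write result.csv for submission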
The same pipeline rewritten with the pandas library:
import numpy as np
import pandas as pd
import random as rd
from sklearn import tree
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier  # missing from the original listing

def load_data(file_name):
    input_df = pd.read_csv(file_name)  # the first row is used as the header
    fix_miss_value(input_df)
    convert_data(input_df)
    return input_df

def fix_miss_value(df):
    # chained assignment works on the old pandas this was written against;
    # modern pandas would want df.loc[...] instead
    df.Embarked[df.Embarked.isnull()] = df.Embarked.dropna().mode().values  # fill with the most common port
    df.Age[df.Age.isnull()] = df.Age.dropna().mean()  # fill with the mean age
    del df['Cabin']  # too sparse to keep

def convert_data(df):
    df.Sex[df['Sex'] == 'male'] = 1
    df.Sex[df['Sex'] == 'female'] = 0
    df['fare_bin'] = pd.qcut(df.Fare, 4)  # fare quartiles
    df['fare_id'] = pd.factorize(df.fare_bin)[0] + 1
    df['embarked_id'] = pd.factorize(df.Embarked)[0] + 1

def off_feature_extraction(df):
    new_df = df.drop(['fare_bin', 'Fare', 'PassengerId', 'Name', 'Ticket', 'Embarked'], axis=1)
    new_df.columns = [0, 1, 2, 3, 4, 5, 6, 7]
    label = new_df[0].values           # Survived
    feature = new_df.ix[:, 1:].values  # everything else
    return label, feature

def off_model_building(feature, label):
    train_feature, test_feature, train_label, test_label = cross_validation.train_test_split(
        feature, label, test_size=0.3, random_state=0)
    clf = RandomForestClassifier(random_state=1, n_estimators=150,
                                 min_samples_split=4, min_samples_leaf=2)
    clf = clf.fit(train_feature, train_label)
    predict_label = clf.predict(test_feature)
    res = predict_label ^ test_label  # XOR flags the mispredicted rows
    accuracy = 1 - float(sum(res)) / len(test_label)
    print "accuracy rate is: %f" % accuracy
The above is the most basic, straightforward approach. Preprocessing with pandas keeps the program concise and tidy. In a data-mining competition, picking the most basic features and calling a library model is enough to get an initial result, but the heart of the contest is the subsequent feature selection and model tuning. The rest of this post records my attempts at improvement.
3. Improvements
1. Add new features: Age and Fare. Since both fields vary over a wide range, I binned them coarsely: age into three groups (child, middle-aged, elderly) and fare into three bins (1, 2, 3). The aim was to shrink the candidate feature space and guard against overfitting, because the training set is genuinely small. I also switched to the better-performing random forest model, and the offline accuracy rose to 82.8%.
The online submission then scored 77.033%, a clear improvement. The offline accuracy moved only slightly while the online score jumped; I suspect the offline data volume is simply too small, and the offline validation split smaller still, so it cannot reflect how the model really behaves online.
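With this little data, a single hold-out split is noisy; k-fold cross-validation gives a steadier offline estimate. Below is a minimal sketch using the same era's sklearn cross_validation API, where the feature/label arrays and the 5-fold setting are my assumptions rather than anything from the original post:

# hypothetical sketch: average accuracy over 5 folds instead of one split
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1, n_estimators=150,
                             min_samples_split=4, min_samples_leaf=2)
scores = cross_validation.cross_val_score(clf, feature, label, cv=5)
print "mean cv accuracy: %f" % scores.mean()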
The feature-extraction program at this stage:
def feature_extraction1(train_data):
    feature = []
    for line in train_data:
        curr_feature = []
        curr_feature.append(int(line[0]))  # Pclass
        # female -> 2, male -> 1 (judged by string length)
        if cmp(len(line[2]), len('male')) == 1:
            curr_feature.append(2)
        else:
            curr_feature.append(1)
        if cmp(line[3], ''):  # Age present
            if line[3].find('.') == -1:
                tmp = int(line[3])
            else:
                tmp = int(line[3].split('.')[0])
            # age bins: child / middle-aged / elderly
            if tmp < 10:
                curr_feature.append(1)
            if tmp >= 10 and tmp <= 60:
                curr_feature.append(2)
            if tmp > 60:
                curr_feature.append(3)
        else:
            curr_feature.append(2)  # Age missing: default to the middle bin
        if cmp(line[7], ''):  # Fare present
            if line[7].find('.') == -1:
                tmp = int(line[7])
            else:
                tmp = int(line[7].split('.')[0])
            # fare bins: <50 / 50-100 / >100
            if tmp < 50:
                curr_feature.append(1)
            if tmp >= 50 and tmp <= 100:
                curr_feature.append(2)
            if tmp > 100:
                curr_feature.append(3)
        else:
            curr_feature.append(2)  # Fare missing: default to the middle bin
        curr_feature.append(int(line[4]))  # SibSp
        curr_feature.append(int(line[5]))  # Parch
        # Embarked: S -> 1, C -> 2, otherwise 3
        if line[9] == 'S':
            curr_feature.append(1)
        elif line[9] == 'C':
            curr_feature.append(2)
        else:
            curr_feature.append(3)
        '''
        if int(line[4]) > 0:
            curr_feature.append(1)  # has siblings/spouse aboard
        else:
            curr_feature.append(0)
        if int(line[5]) > 0:
            curr_feature.append(1)  # has parents/children aboard
        else:
            curr_feature.append(0)
        '''
        feature.append(curr_feature)
    return feature
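As an aside, the hand-rolled binning above can be written much more compactly with pandas. A rough sketch (my addition; the cut points mirror the code above, though pd.cut treats bin edges as right-inclusive, and the fill values for missing Age/Fare are arbitrary picks that land in the same default bins):

# hypothetical pandas equivalent of the manual binning
import pandas as pd

df = pd.read_csv('train.csv')
df['age_bin'] = pd.cut(df.Age.fillna(30), bins=[0, 10, 60, 200], labels=[1, 2, 3])
df['fare_bin'] = pd.cut(df.Fare.fillna(7), bins=[-1, 50, 100, 10000], labels=[1, 2, 3])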
Up to this point all of my features have been plain category IDs; I had not constructed any derived features such as survival rates. After reading some other blogs, several of which suggest adding ratio features like the per-sex survival rate, I gave it a try. Adding survival rates for Sex and Pclass raised the offline accuracy to 83.95%, but the online score fell to 76.077%. The program that builds the survival-rate dictionaries follows:
def radio_feature(train_data, train_label):
    # count survivors per Pclass and per Sex
    sex_dic = {}; pclass_dic = {}
    idx = 0
    for line in train_data:
        if int(line[0]) not in pclass_dic:
            if train_label[idx] == 1:
                pclass_dic[int(line[0])] = 1  # first survivor seen for this class
        else:
            if train_label[idx] == 1:
                pclass_dic[int(line[0])] = pclass_dic[int(line[0])] + 1
        if line[2] not in sex_dic:
            if train_label[idx] == 1:
                sex_dic[line[2]] = 1  # first survivor seen for this sex
        else:
            if train_label[idx] == 1:
                sex_dic[line[2]] = sex_dic[line[2]] + 1
        idx = idx + 1
    # note: dividing by the total passenger count yields the fraction
    # "survived and in this group", not the within-group survival rate
    tlen = len(train_data)
    for key in sex_dic.keys():
        sex_dic[key] = sex_dic[key] / float(tlen)
    for key in pclass_dic.keys():
        pclass_dic[key] = pclass_dic[key] / float(tlen)
    return pclass_dic, sex_dic
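For comparison, the true within-group survival rates are a one-liner per column with pandas groupby (my sketch, not part of the original program):

# hypothetical sketch: per-group survival rates via groupby
import pandas as pd

df = pd.read_csv('train.csv')
print df.groupby('Sex').Survived.mean()     # survival rate for each sex
print df.groupby('Pclass').Survived.mean()  # survival rate for each class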
I have run out of strong improvement ideas at this point, so the next step is to study other people's write-ups, e.g. http://www.cnblogs.com/north-north/p/4358084.html.