In [48]:
# Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import seaborn as sns
from scipy import stats

# Import Cross Validation Score
from sklearn.model_selection import cross_val_score

# Import split tool
from sklearn.model_selection import train_test_split

# Import GridSearch
from sklearn.model_selection import GridSearchCV

# Import Student's paired t-test
from scipy.stats import ttest_rel

# Import DummyClassifier
from sklearn.dummy import DummyClassifier

# Import the Random Forest package
from sklearn.ensemble import RandomForestClassifier

# Import the Support Vector Machines package
from sklearn import svm

# For .read_csv, always use header=0 when you know row 0 is the header row
train_df = pd.read_csv('train.csv', header=0)

# Examine the data - note that Age, Cabin and Embarked have missing values
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
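A quick cross-check on the missing values, rather than reading them off info() (a minimal sketch on the same dataframe; it should report 177 missing for Age, 687 for Cabin and 2 for Embarked, matching the counts above):
# Count missing values per column
train_df.isnull().sum()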
In [49]:
# Examine Cabin data
# Create a new column CabinLetter which is the first character of the Cabin string

train_df['CabinLetter'] = train_df['Cabin'].str[0]

plot_df = train_df[train_df['CabinLetter'].notnull()]

# Visualisation
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

# Plot count of passengers by CabinLetter
sns.countplot(x='CabinLetter', data=plot_df.sort_values(by='CabinLetter'), ax=axis1)

# Group by CabinLetter, and compute the mean survival rate for each cabin letter
cabin_perc = plot_df[['CabinLetter', 'Survived']].groupby(['CabinLetter'],as_index=False).mean()
sns.barplot(x='CabinLetter', y='Survived', data=cabin_perc.sort_values(by='CabinLetter'),ax=axis2)

# Plot count of CabinLetter by Passenger Class
sns.countplot(x='Pclass', hue='CabinLetter', data=plot_df.sort_values(by='CabinLetter'), ax=axis3)
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2da9a5f8>
In [50]:
# Survival rates vary somewhat by CabinLetter, but the missing values are hard to impute, e.g. from Passenger Class:
# for Passenger Class 1 alone, the CabinLetter can be anything from A-E
# So drop the CabinLetter column

train_df = train_df.drop(['CabinLetter'], axis=1)

train_df.head(5)
Out[50]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [51]:
# Examine Sex data

# Visualisation
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

# Plot count of Sex
sns.countplot(x='Sex', data=train_df, ax=axis1)

# Plot count of Sex by Survival status
sns.countplot(x='Survived', hue='Sex', data=train_df, order=[1,0], ax=axis2)

# Group by Sex, and compute the mean survival rate for each value of Sex
sex_perc = train_df[['Sex', 'Survived']].groupby(['Sex'],as_index=False).mean()
sns.barplot(x='Sex', y='Survived', data=sex_perc,order=['female','male'],ax=axis3)
Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2d88b7f0>
In [52]:
# Sex makes a significant difference to mean Survival rate, so keep the data

# Create a numeric representation for Sex to be able to use it for Machine Learning
# Call this new column Gender
train_df['Gender'] = train_df['Sex'].map( {'female': 0, 'male': 1} ).astype(float)

train_df.head(5)
Out[52]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0.0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 0.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 0.0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 1.0
In [53]:
# Examine Embarked data
plot_df = train_df[train_df['Embarked'].notnull()]

# Visualisation
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

# Plot count of port of Embarkation
sns.countplot(x='Embarked', data=plot_df, ax=axis1)

# Plot count of Embarked ports by Survival status
sns.countplot(x='Survived', hue='Embarked', data=plot_df, order=[1,0], ax=axis2)

# Group by Embarked, and compute the mean survival rate for each port
embark_perc = plot_df[['Embarked', 'Survived']].groupby(['Embarked'],as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3)
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2d7bb748>
In [54]:
# Create a numeric representation for Embarked - the missing values will be replaced with S (mapped to 0) as it is by far the most common port
train_df['Port'] = train_df['Embarked'].fillna('S').map({'S': 0.0, 'C': 0.5, 'Q': 1.0}).astype(float)

train_df.head(5)
Out[54]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender Port
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1.0 0.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0.0 0.5
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 0.0 0.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 0.0 0.0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 1.0 0.0
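An alternative to this ordinal 0/0.5/1 encoding is one-hot encoding, which avoids imposing an artificial ordering on the three ports. A minimal sketch, assuming the same S-filled Embarked column (the 'Port' prefix and the train_df_onehot name are illustrative):
# One-hot encode the port of embarkation into three 0/1 indicator columns
port_dummies = pd.get_dummies(train_df['Embarked'].fillna('S'), prefix='Port')
train_df_onehot = train_df.join(port_dummies)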
In [55]:
# Any relationship between fare and survival?

fare_bin_size = 5

bin_count, bin_edges, binnumber = stats.binned_statistic(train_df['Fare'], train_df['Fare'], statistic='count', bins=100//fare_bin_size, range=(0,100))
bin_means, bin_edges, binnumber = stats.binned_statistic(train_df['Fare'], train_df['Survived'], statistic='mean', bins=100//fare_bin_size, range=(0,100))

fare_survived_df = pd.DataFrame({'FareBin': bin_edges[1:], 'FareCount': bin_count, 'MeanSurv': bin_means})

# Visualisation
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(20,5))

axes = sns.barplot(x='FareBin', y='FareCount', data=fare_survived_df, ax=axis1)
axes.set(xlabel = 'Fare', ylabel = 'count')

axes = sns.barplot(x='FareBin', y='MeanSurv', data=fare_survived_df, ax=axis2)
axes.set(xlabel = 'Fare', ylabel = 'mean(Survived)')
Out[55]:
[<matplotlib.text.Text at 0x7fbe2d6540b8>,
 <matplotlib.text.Text at 0x7fbe2d647dd8>]
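The same binned counts and survival rates can be computed in pandas with pd.cut, which some readers may find more direct than stats.binned_statistic. A sketch under the same 5-unit bins:
# Bin fares into 5-unit intervals and aggregate count and mean survival per bin
fare_bins = pd.cut(train_df['Fare'], bins=np.arange(0, 105, fare_bin_size))
train_df.groupby(fare_bins)['Survived'].agg(['count', 'mean'])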
In [56]:
# Any relationship between age and survival?

age_bin_size = 5

bin_count, bin_edges, binnumber = stats.binned_statistic(train_df['Age'], train_df['Age'], statistic='count', bins=100//age_bin_size, range=(0,100))
bin_means, bin_edges, binnumber = stats.binned_statistic(train_df['Age'], train_df['Survived'], statistic='mean', bins=100//age_bin_size, range=(0,100))

age_survived_df = pd.DataFrame({'AgeBin': bin_edges[1:], 'AgeCount': bin_count, 'MeanSurv': bin_means})

# Visualisation
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(20,5))

axes = sns.barplot(x='AgeBin', y='AgeCount', data=age_survived_df, ax=axis1)
axes.set(xlabel = 'Age', ylabel = 'count')

axes = sns.barplot(x='AgeBin', y='MeanSurv', data=age_survived_df, ax=axis2)
axes.set(xlabel = 'Age', ylabel = 'mean(Survived)')
Out[56]:
[<matplotlib.text.Text at 0x7fbe2d45a9b0>,
 <matplotlib.text.Text at 0x7fbe2d450160>]
In [57]:
# Age appears to make a difference to mean Survival rate, so keep the data
# Process it for Machine Learning

# Examine the passengers with missing age details
agena_df = train_df[train_df['Age'].isnull()]
agena_df.head(5)
Out[57]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender Port
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q 1.0 1.0
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S 1.0 0.0
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C 0.0 0.5
26 27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C 1.0 0.5
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q 0.0 1.0
In [58]:
# Any relationship between age and passenger class?

# Subset of passengers where age is known
plot_df = train_df[train_df['Age'].notnull()]

# Distribution of Pclass values among passengers, where age is known
count_Pclass = plot_df[['Pclass', 'PassengerId']].groupby(['Pclass'],as_index=False).count()

# Distribution of Pclass values among passengers, where age is unknown
count_agena_Pclass = agena_df[['Pclass', 'PassengerId']].groupby(['Pclass'],as_index=False).count()

# Mean and median age of passengers by Pclass value
mean_age = plot_df[['Pclass', 'Age']].groupby(['Pclass'],as_index=False).mean()
median_age = plot_df[['Pclass', 'Age']].groupby(['Pclass'],as_index=False).median()

# Visualisation
fig, (axis1,axis2,axis3,axis4) = plt.subplots(1,4,figsize=(15,5))

axes = sns.barplot(x='Pclass', y='PassengerId', data=count_Pclass, ax=axis1)
axes.set(ylabel = 'count(Age known)')

axes = sns.barplot(x='Pclass', y='PassengerId', data=count_agena_Pclass, ax=axis2)
axes.set(ylabel = 'count(Age unknown)')

sns.barplot(x='Pclass', y='Age', data=mean_age, ax=axis3)

axes = sns.barplot(x='Pclass', y='Age', data=median_age, ax=axis4)
axes.set(ylabel = 'median(Age)')
Out[58]:
[<matplotlib.text.Text at 0x7fbe2d244dd8>]
In [59]:
# Any relationship between age and fare paid?

axes = sns.lmplot(x='Fare', y='Age', data=plot_df, fit_reg=False)

axes.set(xlabel="Fare", ylabel="Age")
Out[59]:
<seaborn.axisgrid.FacetGrid at 0x7fbe2d701518>
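The scatter suggests at most a weak linear relationship; a quick numeric check on the same subset (a minimal sketch; NaN ages are ignored by corr):
# Pearson correlation between Fare and Age
plot_df[['Fare', 'Age']].corr()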
In [60]:
# Any relationship between age and sex?

# Visualisation
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))

mean_age = plot_df[['Sex', 'Age']].groupby(['Sex'],as_index=False).mean()
median_age = plot_df[['Sex', 'Age']].groupby(['Sex'],as_index=False).median()
sns.barplot(x='Sex', y='Age', data=mean_age, ax=axis1)
axes = sns.barplot(x='Sex', y='Age', data=median_age, ax=axis2)
axes.set(ylabel = 'median(Age)')
Out[60]:
[<matplotlib.text.Text at 0x7fbe2d057e48>]
In [61]:
# Any relationship between age and sibling/spouse?

# Distribution of SibSp values among passengers, where age is known
count_sibsp = plot_df[['SibSp', 'PassengerId']].groupby(['SibSp'],as_index=False).count()

# Distribution of SibSp values among passengers, where age is unknown
count_agena_sibsp = agena_df[['SibSp', 'PassengerId']].groupby(['SibSp'],as_index=False).count()

# Mean and median age of passengers by SibSp value
mean_age = plot_df[['SibSp', 'Age']].groupby(['SibSp'],as_index=False).mean()
median_age = plot_df[['SibSp', 'Age']].groupby(['SibSp'],as_index=False).median()

# Visualisation
fig, (axis1,axis2,axis3,axis4) = plt.subplots(1,4,figsize=(15,5))

axes = sns.barplot(x='SibSp', y='PassengerId', data=count_sibsp, ax=axis1)
axes.set(ylabel = 'count(Age known)')

axes = sns.barplot(x='SibSp', y='PassengerId', data=count_agena_sibsp, ax=axis2)
axes.set(ylabel = 'count(Age unknown)')

sns.barplot(x='SibSp', y='Age', data=mean_age, ax=axis3)

axes = sns.barplot(x='SibSp', y='Age', data=median_age, ax=axis4)
axes.set(ylabel = 'median(Age)')
Out[61]:
[<matplotlib.text.Text at 0x7fbe2cf2ccc0>]
In [62]:
# Any relationship between age and parent/child?

# Distribution of Parch values among passengers, where age is known
count_Parch = plot_df[['Parch', 'PassengerId']].groupby(['Parch'],as_index=False).count()

# Distribution of Parch values among passengers, where age is unknown
count_agena_Parch = agena_df[['Parch', 'PassengerId']].groupby(['Parch'],as_index=False).count()

# Mean and median age of passengers by Parch value
mean_age = plot_df[['Parch', 'Age']].groupby(['Parch'],as_index=False).mean()
median_age = plot_df[['Parch', 'Age']].groupby(['Parch'],as_index=False).median()

# Visualisation
fig, (axis1,axis2,axis3,axis4) = plt.subplots(1,4,figsize=(15,5))

axes = sns.barplot(x='Parch', y='PassengerId', data=count_Parch, ax=axis1)
axes.set(ylabel = 'count(Age known)')

axes = sns.barplot(x='Parch', y='PassengerId', data=count_agena_Parch, ax=axis2)
axes.set(ylabel = 'count(Age unknown)')

sns.barplot(x='Parch', y='Age', data=mean_age, ax=axis3)

axes = sns.barplot(x='Parch', y='Age', data=median_age, ax=axis4)
axes.set(ylabel = 'median(Age)')
Out[62]:
[<matplotlib.text.Text at 0x7fbe2ccdf160>]
In [63]:
# Fill missing age values with the median age for each (Gender, Pclass) combination
median_ages = np.zeros((2,3))
for i in range(0, 2):
    for j in range(0, 3):
        median_ages[i,j] = train_df[(train_df['Gender'] == i) & (train_df['Pclass'] == j+1)]['Age'].dropna().median()
 
train_df['AgeFill'] = train_df['Age']

for i in range(0, 2):
    for j in range(0, 3):
        train_df.loc[ (train_df.Age.isnull()) & (train_df.Gender == i) & (train_df.Pclass == j+1),'AgeFill'] = median_ages[i,j]

# Scale from 0 to 1
train_df['AgeFill'] = train_df['AgeFill']/100

train_df[train_df['Age'].isnull()].head(5)
Out[63]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender Port AgeFill
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q 1.0 1.0 0.250
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S 1.0 0.0 0.300
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C 0.0 0.5 0.215
26 27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C 1.0 0.5 0.250
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q 0.0 1.0 0.215
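The two nested loops can also be written as a single groupby-transform, which computes the same per-(Gender, Pclass) medians in one pass. A minimal sketch, equivalent under the same grouping (AgeFill2 is an illustrative name, used here so the original AgeFill column is left untouched):
# Fill missing ages with the median age of each (Gender, Pclass) group, then scale to 0-1
group_median_age = train_df.groupby(['Gender', 'Pclass'])['Age'].transform('median')
train_df['AgeFill2'] = train_df['Age'].fillna(group_median_age)/100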
In [64]:
# Examine SibSp data

# Visualisation
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

# Plot count of passengers by SibSp
sns.countplot(x='SibSp', data=train_df, ax=axis1)

# Plot count of SibSp values by Survival status
sns.countplot(x='Survived', hue='SibSp', data=train_df, order=[1,0], ax=axis2)

# Group by SibSp, and compute the mean survival rate for each value of SibSp
sibsp_mean = train_df[['SibSp', 'Survived']].groupby(['SibSp'],as_index=False).mean()
sns.barplot(x='SibSp', y='Survived', data=sibsp_mean,ax=axis3)
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2cb1a828>
In [65]:
# Examine Parch data

# Visualisation
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

# Plot count of passengers by Parch
sns.countplot(x='Parch', data=train_df, ax=axis1)

# Plot count of Parch values by Survival status
sns.countplot(x='Survived', hue='Parch', data=train_df, order=[1,0], ax=axis2)

# Group by Parch, and compute the mean survival rate for each value of Parch
parch_mean = train_df[['Parch', 'Survived']].groupby(['Parch'],as_index=False).mean()
sns.barplot(x='Parch', y='Survived', data=parch_mean,ax=axis3)
Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2c9c3940>
In [66]:
# Both SibSp and Parch appear to make a difference to a passenger's chance of survival - for non-zero values the chances are higher
# However, relatively few passengers have each non-zero value. Try combining the two into a new column for total family size

train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch']

train_df.head(10)
Out[66]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender Port AgeFill FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1.0 0.0 0.22 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0.0 0.5 0.38 1
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 0.0 0.0 0.26 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 0.0 0.0 0.35 1
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 1.0 0.0 0.35 0
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q 1.0 1.0 0.25 0
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S 1.0 0.0 0.54 0
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S 1.0 0.0 0.02 4
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S 0.0 0.0 0.27 2
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C 0.0 0.5 0.14 1
In [67]:
# Examine FamilySize data

# Visualisation
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

# Plot count of passengers by FamilySize
sns.countplot(x='FamilySize', data=train_df, ax=axis1)

# Plot count of FamilySize values by Survival status
sns.countplot(x='Survived', hue='FamilySize', data=train_df, order=[1,0], ax=axis2)

# Group by FamilySize, and compute the mean survival rate for each value of FamilySize
familysize_mean = train_df[['FamilySize', 'Survived']].groupby(['FamilySize'],as_index=False).mean()
sns.barplot(x='FamilySize', y='Survived', data=familysize_mean,ax=axis3)
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2c7f0b38>
In [68]:
# Scale SibSp, Parch, FamilySize and Pclass

train_df['SibSpS'] = train_df['SibSp'].astype(float)/10
train_df['ParchS'] = train_df['Parch'].astype(float)/10
train_df['FamilySizeS'] = train_df['FamilySize'].astype(float)/10
train_df['PclassS'] = train_df['Pclass'].astype(float)/3

train_df
Out[68]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender Port AgeFill FamilySize SibSpS ParchS FamilySizeS PclassS
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1.0 0.0 0.220 1 0.1 0.0 0.1 1.000000
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0.0 0.5 0.380 1 0.1 0.0 0.1 0.333333
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 0.0 0.0 0.260 0 0.0 0.0 0.0 1.000000
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 0.0 0.0 0.350 1 0.1 0.0 0.1 0.333333
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 1.0 0.0 0.350 0 0.0 0.0 0.0 1.000000
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q 1.0 1.0 0.250 0 0.0 0.0 0.0 1.000000
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S 1.0 0.0 0.540 0 0.0 0.0 0.0 0.333333
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S 1.0 0.0 0.020 4 0.3 0.1 0.4 1.000000
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S 0.0 0.0 0.270 2 0.0 0.2 0.2 1.000000
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C 0.0 0.5 0.140 1 0.1 0.0 0.1 0.666667
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S 0.0 0.0 0.040 2 0.1 0.1 0.2 1.000000
11 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S 0.0 0.0 0.580 0 0.0 0.0 0.0 0.333333
12 13 0 3 Saundercock, Mr. William Henry male 20.0 0 0 A/5. 2151 8.0500 NaN S 1.0 0.0 0.200 0 0.0 0.0 0.0 1.000000
13 14 0 3 Andersson, Mr. Anders Johan male 39.0 1 5 347082 31.2750 NaN S 1.0 0.0 0.390 6 0.1 0.5 0.6 1.000000
14 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 350406 7.8542 NaN S 0.0 0.0 0.140 0 0.0 0.0 0.0 1.000000
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 NaN S 0.0 0.0 0.550 0 0.0 0.0 0.0 0.666667
16 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 NaN Q 1.0 1.0 0.020 5 0.4 0.1 0.5 1.000000
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S 1.0 0.0 0.300 0 0.0 0.0 0.0 0.666667
18 19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 345763 18.0000 NaN S 0.0 0.0 0.310 1 0.1 0.0 0.1 1.000000
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C 0.0 0.5 0.215 0 0.0 0.0 0.0 1.000000
20 21 0 2 Fynney, Mr. Joseph J male 35.0 0 0 239865 26.0000 NaN S 1.0 0.0 0.350 0 0.0 0.0 0.0 0.666667
21 22 1 2 Beesley, Mr. Lawrence male 34.0 0 0 248698 13.0000 D56 S 1.0 0.0 0.340 0 0.0 0.0 0.0 0.666667
22 23 1 3 McGowan, Miss. Anna "Annie" female 15.0 0 0 330923 8.0292 NaN Q 0.0 1.0 0.150 0 0.0 0.0 0.0 1.000000
23 24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.5000 A6 S 1.0 0.0 0.280 0 0.0 0.0 0.0 0.333333
24 25 0 3 Palsson, Miss. Torborg Danira female 8.0 3 1 349909 21.0750 NaN S 0.0 0.0 0.080 4 0.3 0.1 0.4 1.000000
25 26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1 5 347077 31.3875 NaN S 0.0 0.0 0.380 6 0.1 0.5 0.6 1.000000
26 27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C 1.0 0.5 0.250 0 0.0 0.0 0.0 1.000000
27 28 0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.0000 C23 C25 C27 S 1.0 0.0 0.190 5 0.3 0.2 0.5 0.333333
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q 0.0 1.0 0.215 0 0.0 0.0 0.0 1.000000
29 30 0 3 Todoroff, Mr. Lalio male NaN 0 0 349216 7.8958 NaN S 1.0 0.0 0.250 0 0.0 0.0 0.0 1.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
861 862 0 2 Giles, Mr. Frederick Edward male 21.0 1 0 28134 11.5000 NaN S 1.0 0.0 0.210 1 0.1 0.0 0.1 0.666667
862 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0 0 17466 25.9292 D17 S 0.0 0.0 0.480 0 0.0 0.0 0.0 0.333333
863 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female NaN 8 2 CA. 2343 69.5500 NaN S 0.0 0.0 0.215 10 0.8 0.2 1.0 1.000000
864 865 0 2 Gill, Mr. John William male 24.0 0 0 233866 13.0000 NaN S 1.0 0.0 0.240 0 0.0 0.0 0.0 0.666667
865 866 1 2 Bystrom, Mrs. (Karolina) female 42.0 0 0 236852 13.0000 NaN S 0.0 0.0 0.420 0 0.0 0.0 0.0 0.666667
866 867 1 2 Duran y More, Miss. Asuncion female 27.0 1 0 SC/PARIS 2149 13.8583 NaN C 0.0 0.5 0.270 1 0.1 0.0 0.1 0.666667
867 868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0 0 PC 17590 50.4958 A24 S 1.0 0.0 0.310 0 0.0 0.0 0.0 0.333333
868 869 0 3 van Melkebeke, Mr. Philemon male NaN 0 0 345777 9.5000 NaN S 1.0 0.0 0.250 0 0.0 0.0 0.0 1.000000
869 870 1 3 Johnson, Master. Harold Theodor male 4.0 1 1 347742 11.1333 NaN S 1.0 0.0 0.040 2 0.1 0.1 0.2 1.000000
870 871 0 3 Balkic, Mr. Cerin male 26.0 0 0 349248 7.8958 NaN S 1.0 0.0 0.260 0 0.0 0.0 0.0 1.000000
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 11751 52.5542 D35 S 0.0 0.0 0.470 2 0.1 0.1 0.2 0.333333
872 873 0 1 Carlsson, Mr. Frans Olof male 33.0 0 0 695 5.0000 B51 B53 B55 S 1.0 0.0 0.330 0 0.0 0.0 0.0 0.333333
873 874 0 3 Vander Cruyssen, Mr. Victor male 47.0 0 0 345765 9.0000 NaN S 1.0 0.0 0.470 0 0.0 0.0 0.0 1.000000
874 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 NaN C 0.0 0.5 0.280 1 0.1 0.0 0.1 0.666667
875 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.0 0 0 2667 7.2250 NaN C 0.0 0.5 0.150 0 0.0 0.0 0.0 1.000000
876 877 0 3 Gustafsson, Mr. Alfred Ossian male 20.0 0 0 7534 9.8458 NaN S 1.0 0.0 0.200 0 0.0 0.0 0.0 1.000000
877 878 0 3 Petroff, Mr. Nedelio male 19.0 0 0 349212 7.8958 NaN S 1.0 0.0 0.190 0 0.0 0.0 0.0 1.000000
878 879 0 3 Laleff, Mr. Kristo male NaN 0 0 349217 7.8958 NaN S 1.0 0.0 0.250 0 0.0 0.0 0.0 1.000000
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C 0.0 0.5 0.560 1 0.0 0.1 0.1 0.333333
880 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0 1 230433 26.0000 NaN S 0.0 0.0 0.250 1 0.0 0.1 0.1 0.666667
881 882 0 3 Markun, Mr. Johann male 33.0 0 0 349257 7.8958 NaN S 1.0 0.0 0.330 0 0.0 0.0 0.0 1.000000
882 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0 0 0 7552 10.5167 NaN S 0.0 0.0 0.220 0 0.0 0.0 0.0 1.000000
883 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 NaN S 1.0 0.0 0.280 0 0.0 0.0 0.0 0.666667
884 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 NaN S 1.0 0.0 0.250 0 0.0 0.0 0.0 1.000000
885 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 382652 29.1250 NaN Q 0.0 1.0 0.390 5 0.0 0.5 0.5 1.000000
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S 1.0 0.0 0.270 0 0.0 0.0 0.0 0.666667
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 0.0 0.0 0.190 0 0.0 0.0 0.0 0.333333
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S 0.0 0.0 0.215 3 0.1 0.2 0.3 1.000000
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 1.0 0.5 0.260 0 0.0 0.0 0.0 0.333333
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q 1.0 1.0 0.320 0 0.0 0.0 0.0 1.000000

891 rows × 20 columns
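The ad-hoc divisors above keep each feature roughly within [0, 1]; scikit-learn's MinMaxScaler does this exactly from the observed minima and maxima, and remembers the training ranges so the identical transform can later be applied to the test set. A sketch (the column list is illustrative):
# Scale selected columns to exactly [0, 1] based on their training-set min/max
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_cols = scaler.fit_transform(train_df[['SibSp', 'Parch', 'FamilySize', 'Pclass']])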

In [69]:
# Drop unused columns - this includes those that have been replaced

train_df = train_df.drop(['Age', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'PassengerId', 'SibSp', 'Parch', 'FamilySize', 'Pclass'], axis=1)
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived       891 non-null int64
Fare           891 non-null float64
Gender         891 non-null float64
Port           891 non-null float64
AgeFill        891 non-null float64
SibSpS         891 non-null float64
ParchS         891 non-null float64
FamilySizeS    891 non-null float64
PclassS        891 non-null float64
dtypes: float64(8), int64(1)
memory usage: 62.7 KB
In [70]:
# TEST DATA
test_df = pd.read_csv('test.csv', header=0)        # Load the test file into a dataframe

# Repeat the data processing of the training data on the test data, so that the columns match

# Create a numeric representation for Sex
test_df['Gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(float)

# Create a numeric representation for Embarked port
test_df['Port'] = test_df['Embarked'].fillna('S').map({'S': 0.0, 'C': 0.5, 'Q': 1.0}).astype(float)

# Fill missing age values with the median age for each (Gender, Pclass) combination
median_ages = np.zeros((2,3))
for i in range(0, 2):
    for j in range(0, 3):
        median_ages[i,j] = test_df[(test_df['Gender'] == i) & (test_df['Pclass'] == j+1)]['Age'].dropna().median()
 
test_df['AgeFill'] = test_df['Age']

for i in range(0, 2):
    for j in range(0, 3):
        test_df.loc[ (test_df.Age.isnull()) & (test_df.Gender == i) & (test_df.Pclass == j+1),'AgeFill'] = median_ages[i,j]
        
# Scale from 0 to 1
test_df['AgeFill'] = test_df['AgeFill']/100
        
# Add FamilySize column

test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch']

# Fill the missing Fares with the median fare of the corresponding passenger class
if len(test_df.Fare[ test_df.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0, 3):
        median_fare[f] = test_df[ test_df.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0, 3):
        test_df.loc[ (test_df.Fare.isnull()) & (test_df.Pclass == f+1 ), 'Fare'] = median_fare[f]

# Collect the test data's PassengerIds before dropping the column
ids = test_df['PassengerId'].values

# Scaling
test_df['SibSpS'] = test_df['SibSp'].astype(float)/10
test_df['ParchS'] = test_df['Parch'].astype(float)/10
test_df['FamilySizeS'] = test_df['FamilySize'].astype(float)/10
test_df['PclassS'] = test_df['Pclass'].astype(float)/3

# Remove the unused/replaced columns
test_df = test_df.drop(['Age', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'PassengerId', 'SibSp', 'Parch', 'FamilySize', 'Pclass'], axis=1) 

# The data is now ready. Fit on the training set, then predict on the test set.
# Convert the dataframes to numpy arrays
train_data = train_df.values
test_data = test_df.values

test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Fare           418 non-null float64
Gender         418 non-null float64
Port           418 non-null float64
AgeFill        418 non-null float64
SibSpS         418 non-null float64
ParchS         418 non-null float64
FamilySizeS    418 non-null float64
PclassS        418 non-null float64
dtypes: float64(8)
memory usage: 26.2 KB
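One caveat: the cell above recomputes the median ages and fares from the test set itself. A stricter approach reuses the statistics learned on the training data, so the test set never influences its own preprocessing. A minimal sketch of the age step (assuming median_ages is the array computed from train_df in In [63], and that this runs before the Age/Gender/Pclass columns are dropped):
# Fill missing test-set ages with the medians learned from the training set
for i in range(0, 2):
    for j in range(0, 3):
        test_df.loc[(test_df.Age.isnull()) & (test_df.Gender == i) & (test_df.Pclass == j+1), 'AgeFill'] = median_ages[i,j]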
In [71]:
# Control classifier - always chooses the most frequent class

control = DummyClassifier(strategy='most_frequent')

# Use stratified k-fold cross-validation with 10 folds

control_cv_scores = pd.DataFrame({"accuracy": cross_val_score(control, train_data[:,1:], y=train_data[:,0], cv=10, scoring="accuracy"),
                                 "precision": cross_val_score(control, train_data[:,1:], y=train_data[:,0], cv=10, scoring="precision"),
                                 "recall": cross_val_score(control, train_data[:,1:], y=train_data[:,0], cv=10, scoring="recall"),
                                 "f1": cross_val_score(control, train_data[:,1:], y=train_data[:,0], cv=10, scoring="f1")
                                 })

print('Control Classifier - Cross-validated')
Control Classifier - Cross-validated
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
In [72]:
# Display all the measure scores for each fold
# (the warnings above arise because the most-frequent strategy never predicts the positive class,
# so precision and f1 are undefined and reported as 0)

control_cv_scores.to_csv("control_cv_scores.csv")

control_cv_scores
Out[72]:
accuracy f1 precision recall
0 0.611111 0.0 0.0 0.0
1 0.611111 0.0 0.0 0.0
2 0.617978 0.0 0.0 0.0
3 0.617978 0.0 0.0 0.0
4 0.617978 0.0 0.0 0.0
5 0.617978 0.0 0.0 0.0
6 0.617978 0.0 0.0 0.0
7 0.617978 0.0 0.0 0.0
8 0.617978 0.0 0.0 0.0
9 0.613636 0.0 0.0 0.0
In [73]:
# Mean values of measure scores

control_cv_scores.mean()
Out[73]:
accuracy     0.61617
f1           0.00000
precision    0.00000
recall       0.00000
dtype: float64
In [74]:
# Standard deviation values of measure scores

control_cv_scores.std()
In [75]:
# Grid search for Random Forest parameter optimisation

params = [ {'n_estimators': [100, 200, 500], 'max_features': ['sqrt', None]} ]

rf = RandomForestClassifier()

grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, scoring='f1', verbose=3)

grid.fit(train_data[:,1:], train_data[:,0])

print(grid.best_params_)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] .... max_features=sqrt, n_estimators=100, score=0.719424 -   0.0s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] .... max_features=sqrt, n_estimators=100, score=0.727273 -   0.0s
[CV] max_features=sqrt, n_estimators=100 .............................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[CV] .... max_features=sqrt, n_estimators=100, score=0.802920 -   0.0s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] .... max_features=sqrt, n_estimators=100, score=0.661157 -   0.0s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] .... max_features=sqrt, n_estimators=100, score=0.797101 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.737589 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.723404 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.811594 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.672131 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.780142 -   0.0s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.746479 -   0.1s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.746269 -   0.1s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.802920 -   0.1s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.688525 -   0.1s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.771429 -   0.1s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.700730 -   0.0s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.728571 -   0.0s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.800000 -   0.0s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.688525 -   0.0s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.776978 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.700730 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.746269 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.824427 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.693548 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.762590 -   0.0s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.700000 -   0.1s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.729927 -   0.1s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.800000 -   0.1s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.666667 -   0.1s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.780142 -   0.1s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   16.1s finished
{'max_features': 'sqrt', 'n_estimators': 500}
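Since GridSearchCV refits the best configuration on the full training set by default (refit=True), the winning model can be reused directly instead of re-typing its parameters. A minimal sketch:
# The best estimator, already refit on all of the training data
best_forest = grid.best_estimator_
print(grid.best_score_, grid.best_params_)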
In [76]:
# Random Forest
# (note: the grid search above favoured n_estimators=500 with max_features='sqrt';
# this cell trains a different, hand-picked configuration)

forest = RandomForestClassifier(n_estimators=100, max_features=None)
forest = forest.fit( train_data[:,1:], train_data[:,0] )

print('Random Forest - Trained')
Random Forest - Trained
In [77]:
output = forest.predict(test_data).astype(int)

# Write the predictions to a Kaggle submission file (newline='' avoids blank lines on Windows)
with open("randomforest.csv", "w", newline="") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))

print('Random Forest - Predicted')
Random Forest - Predicted
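For reference, the same submission file can be produced with pandas rather than the csv module (a sketch; the filename is illustrative):
# Equivalent submission written via a DataFrame
submission = pd.DataFrame({'PassengerId': ids, 'Survived': output})
submission.to_csv('randomforest_pd.csv', index=False)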
In [78]:
# cross_val_score uses stratified k-fold cross-validation with 10 folds

forest_cv_scores = pd.DataFrame({"accuracy": cross_val_score(forest, train_data[:,1:], y=train_data[:,0], cv=10, scoring="accuracy"),
                                 "precision": cross_val_score(forest, train_data[:,1:], y=train_data[:,0], cv=10, scoring="precision"),
                                 "recall": cross_val_score(forest, train_data[:,1:], y=train_data[:,0], cv=10, scoring="recall"),
                                 "f1": cross_val_score(forest, train_data[:,1:], y=train_data[:,0], cv=10, scoring="f1")
                                 })

print('Random Forest - Cross-validated')
Random Forest - Cross-validated
In [79]:
# Display all the measure scores for each fold

forest_cv_scores.to_csv("forest_cv_scores.csv")

forest_cv_scores
Out[79]:
accuracy f1 precision recall
0 0.744444 0.676056 0.648649 0.685714
1 0.800000 0.738462 0.766667 0.628571
2 0.775281 0.644068 0.689655 0.588235
3 0.831461 0.773333 0.731707 0.852941
4 0.842697 0.811594 0.848485 0.852941
5 0.842697 0.764706 0.787879 0.764706
6 0.831461 0.774194 0.857143 0.676471
7 0.752809 0.677419 0.666667 0.588235
8 0.831461 0.800000 0.756757 0.852941
9 0.863636 0.823529 0.848485 0.823529
In [80]:
# Mean values of measure scores

forest_cv_scores.mean()
Out[80]:
accuracy     0.811595
f1           0.748336
precision    0.760209
recall       0.731429
dtype: float64
In [81]:
# Standard Deviations of measure scores

forest_cv_scores.std()
Out[81]:
accuracy     0.041157
f1           0.062458
precision    0.076585
recall       0.110847
dtype: float64
In [82]:
# 95% Confidence Interval on f1 scores (normal approximation)

stats.norm.interval(0.95, loc=forest_cv_scores['f1'].mean(), scale=forest_cv_scores['f1'].std())
Out[82]:
(0.62592046904632481, 0.87075181228871823)
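The interval above is a normal approximation that uses the raw standard deviation of the ten fold scores as its scale, so it describes the spread of individual fold scores. A t-based interval for the mean f1 uses the standard error instead and is correspondingly tighter (a sketch):
# 95% t-interval for the mean f1 across the 10 folds
f1 = forest_cv_scores['f1']
stats.t.interval(0.95, df=len(f1)-1, loc=f1.mean(), scale=f1.sem())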
In [83]:
# Grid search for Support Vector Machine parameter optimisation

c_list = [8.845, 8.847, 8.85, 8.852, 8.855]
gamma_list = [1.5135, 1.5137, 1.514, 1.5142, 1.5145]
params = [ {'C': c_list, 'gamma': gamma_list} ]

grid = GridSearchCV(estimator=svm.SVC(kernel='rbf', cache_size=500, class_weight='balanced'), param_grid=params, cv=3, scoring='f1', verbose=3)

grid.fit(train_data[:,1:], train_data[:,0])

print(grid.best_params_)
Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] gamma=1.5135, C=8.845 ...........................................
[CV] .................. gamma=1.5135, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.845 ...........................................
[CV] .................. gamma=1.5135, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.5135, C=8.845 ...........................................
[CV] .................. gamma=1.5135, C=8.845, score=0.755556 -   0.0s
[CV] gamma=1.5137, C=8.845 ...........................................
[CV] .................. gamma=1.5137, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.845 ...........................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[CV] .................. gamma=1.5137, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.5137, C=8.845 ...........................................
[CV] .................. gamma=1.5137, C=8.845, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.845 ............................................
[CV] ................... gamma=1.514, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.845 ............................................
[CV] ................... gamma=1.514, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.514, C=8.845 ............................................
[CV] ................... gamma=1.514, C=8.845, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.845 ...........................................
[CV] .................. gamma=1.5142, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.845 ...........................................
[CV] .................. gamma=1.5142, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.5142, C=8.845 ...........................................
[CV] .................. gamma=1.5142, C=8.845, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.845 ...........................................
[CV] .................. gamma=1.5145, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.845 ...........................................
[CV] .................. gamma=1.5145, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.5145, C=8.845 ...........................................
[CV] .................. gamma=1.5145, C=8.845, score=0.761062 -   0.0s
[CV] gamma=1.5135, C=8.847 ...........................................
[CV] .................. gamma=1.5135, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.847 ...........................................
[CV] .................. gamma=1.5135, C=8.847, score=0.759184 -   0.0s
[CV] gamma=1.5135, C=8.847 ...........................................
[CV] .................. gamma=1.5135, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.5137, C=8.847 ...........................................
[CV] .................. gamma=1.5137, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.847 ...........................................
[CV] .................. gamma=1.5137, C=8.847, score=0.759184 -   0.0s
[CV] gamma=1.5137, C=8.847 ...........................................
[CV] .................. gamma=1.5137, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.847 ............................................
[CV] ................... gamma=1.514, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.847 ............................................
[CV] ................... gamma=1.514, C=8.847, score=0.756098 -   0.0s
[CV] gamma=1.514, C=8.847 ............................................
[CV] ................... gamma=1.514, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.847 ...........................................
[CV] .................. gamma=1.5142, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.847 ...........................................
[CV] .................. gamma=1.5142, C=8.847, score=0.759184 -   0.0s
[CV] gamma=1.5142, C=8.847 ...........................................
[CV] .................. gamma=1.5142, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.847 ...........................................
[CV] .................. gamma=1.5145, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.847 ...........................................
[CV] .................. gamma=1.5145, C=8.847, score=0.756098 -   0.0s
[CV] gamma=1.5145, C=8.847 ...........................................
[CV] .................. gamma=1.5145, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.5135, C=8.85 ............................................
[CV] ................... gamma=1.5135, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.85 ............................................
[CV] ................... gamma=1.5135, C=8.85, score=0.759184 -   0.0s
[CV] gamma=1.5135, C=8.85 ............................................
[CV] ................... gamma=1.5135, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.5137, C=8.85 ............................................
[CV] ................... gamma=1.5137, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.85 ............................................
[CV] ................... gamma=1.5137, C=8.85, score=0.756098 -   0.0s
[CV] gamma=1.5137, C=8.85 ............................................
[CV] ................... gamma=1.5137, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.85 .............................................
[CV] .................... gamma=1.514, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.85 .............................................
[CV] .................... gamma=1.514, C=8.85, score=0.759184 -   0.0s
[CV] gamma=1.514, C=8.85 .............................................
[CV] .................... gamma=1.514, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.85 ............................................
[CV] ................... gamma=1.5142, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.85 ............................................
[CV] ................... gamma=1.5142, C=8.85, score=0.759184 -   0.0s
[CV] gamma=1.5142, C=8.85 ............................................
[CV] ................... gamma=1.5142, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.85 ............................................
[CV] ................... gamma=1.5145, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.85 ............................................
[CV] ................... gamma=1.5145, C=8.85, score=0.759184 -   0.0s
[CV] gamma=1.5145, C=8.85 ............................................
[CV] ................... gamma=1.5145, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.5135, C=8.852 ...........................................
[CV] .................. gamma=1.5135, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.852 ...........................................
[CV] .................. gamma=1.5135, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.5135, C=8.852 ...........................................
[CV] .................. gamma=1.5135, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.5137, C=8.852 ...........................................
[CV] .................. gamma=1.5137, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.852 ...........................................
[CV] .................. gamma=1.5137, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.5137, C=8.852 ...........................................
[CV] .................. gamma=1.5137, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.852 ............................................
[CV] ................... gamma=1.514, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.852 ............................................
[CV] ................... gamma=1.514, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.514, C=8.852 ............................................
[CV] ................... gamma=1.514, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.852 ...........................................
[CV] .................. gamma=1.5142, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.852 ...........................................
[CV] .................. gamma=1.5142, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.5142, C=8.852 ...........................................
[CV] .................. gamma=1.5142, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.852 ...........................................
[CV] .................. gamma=1.5145, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.852 ...........................................
[CV] .................. gamma=1.5145, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.5145, C=8.852 ...........................................
[CV] .................. gamma=1.5145, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.5135, C=8.855 ...........................................
[CV] .................. gamma=1.5135, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.855 ...........................................
[CV] .................. gamma=1.5135, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.5135, C=8.855 ...........................................
[CV] .................. gamma=1.5135, C=8.855, score=0.761062 -   0.0s
[CV] gamma=1.5137, C=8.855 ...........................................
[CV] .................. gamma=1.5137, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.855 ...........................................
[CV] .................. gamma=1.5137, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.5137, C=8.855 ...........................................
[CV] .................. gamma=1.5137, C=8.855, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.855 ............................................
[CV] ................... gamma=1.514, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.855 ............................................
[CV] ................... gamma=1.514, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.514, C=8.855 ............................................
[CV] ................... gamma=1.514, C=8.855, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.855 ...........................................
[CV] .................. gamma=1.5142, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.855 ...........................................
[CV] .................. gamma=1.5142, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.5142, C=8.855 ...........................................
[CV] .................. gamma=1.5142, C=8.855, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.855 ...........................................
[CV] .................. gamma=1.5145, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.855 ...........................................
[CV] .................. gamma=1.5145, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.5145, C=8.855 ...........................................
[CV] .................. gamma=1.5145, C=8.855, score=0.761062 -   0.0s
{'gamma': 1.5135, 'C': 8.847}
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    3.2s finished
In [84]:
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.to_csv("svmachine_gridsearch.csv")
grid_df
Out[84]:
mean_fit_time mean_score_time mean_test_score mean_train_score param_C param_gamma params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.026519 0.005270 0.740058 0.88829 8.845 1.5135 {'gamma': 1.5135, 'C': 8.845} 25 0.70852 0.900222 0.756098 0.877729 0.755556 0.886918 0.003043 0.000311 0.022302 0.009234
1 0.029775 0.005481 0.741893 0.88829 8.845 1.5137 {'gamma': 1.5137, 'C': 8.845} 18 0.70852 0.900222 0.756098 0.877729 0.761062 0.886918 0.003933 0.000385 0.023685 0.009234
2 0.024944 0.004726 0.741893 0.88829 8.845 1.514 {'gamma': 1.514, 'C': 8.845} 18 0.70852 0.900222 0.756098 0.877729 0.761062 0.886918 0.002504 0.000115 0.023685 0.009234
3 0.028615 0.004974 0.741893 0.88829 8.845 1.5142 {'gamma': 1.5142, 'C': 8.845} 18 0.70852 0.900222 0.756098 0.877729 0.761062 0.886918 0.004213 0.000342 0.023685 0.009234
4 0.028352 0.006070 0.741893 0.88829 8.845 1.5145 {'gamma': 1.5145, 'C': 8.845} 18 0.70852 0.900222 0.756098 0.877729 0.761062 0.886918 0.003812 0.001969 0.023685 0.009234
5 0.029189 0.004819 0.742922 0.88829 8.847 1.5135 {'gamma': 1.5135, 'C': 8.847} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.005666 0.000163 0.024338 0.009234
6 0.025874 0.006638 0.742922 0.88829 8.847 1.5137 {'gamma': 1.5137, 'C': 8.847} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.003883 0.002887 0.024338 0.009234
7 0.026419 0.005511 0.741893 0.88829 8.847 1.514 {'gamma': 1.514, 'C': 8.847} 18 0.70852 0.900222 0.756098 0.877729 0.761062 0.886918 0.002627 0.000643 0.023685 0.009234
8 0.026388 0.004622 0.742922 0.88829 8.847 1.5142 {'gamma': 1.5142, 'C': 8.847} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.002897 0.000022 0.024338 0.009234
9 0.026506 0.004623 0.741893 0.88829 8.847 1.5145 {'gamma': 1.5145, 'C': 8.847} 18 0.70852 0.900222 0.756098 0.877729 0.761062 0.886918 0.002926 0.000066 0.023685 0.009234
10 0.029469 0.004860 0.742922 0.88829 8.85 1.5135 {'gamma': 1.5135, 'C': 8.85} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.004484 0.000307 0.024338 0.009234
11 0.029518 0.004609 0.741893 0.88829 8.85 1.5137 {'gamma': 1.5137, 'C': 8.85} 18 0.70852 0.900222 0.756098 0.877729 0.761062 0.886918 0.003012 0.000063 0.023685 0.009234
12 0.028544 0.004685 0.742922 0.88829 8.85 1.514 {'gamma': 1.514, 'C': 8.85} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.004921 0.000098 0.024338 0.009234
13 0.027720 0.004616 0.742922 0.88829 8.85 1.5142 {'gamma': 1.5142, 'C': 8.85} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.003298 0.000088 0.024338 0.009234
14 0.027366 0.004920 0.742922 0.88829 8.85 1.5145 {'gamma': 1.5145, 'C': 8.85} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.004772 0.000181 0.024338 0.009234
15 0.031986 0.004831 0.742922 0.88829 8.852 1.5135 {'gamma': 1.5135, 'C': 8.852} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.010387 0.000032 0.024338 0.009234
16 0.027930 0.004705 0.742922 0.88829 8.852 1.5137 {'gamma': 1.5137, 'C': 8.852} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.003607 0.000108 0.024338 0.009234
17 0.026094 0.007387 0.742922 0.88829 8.852 1.514 {'gamma': 1.514, 'C': 8.852} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.002769 0.003723 0.024338 0.009234
18 0.026732 0.004754 0.742922 0.88829 8.852 1.5142 {'gamma': 1.5142, 'C': 8.852} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.004105 0.000158 0.024338 0.009234
19 0.026130 0.006103 0.742922 0.88829 8.852 1.5145 {'gamma': 1.5145, 'C': 8.852} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.002678 0.001976 0.024338 0.009234
20 0.027911 0.004642 0.742922 0.88829 8.855 1.5135 {'gamma': 1.5135, 'C': 8.855} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.003728 0.000124 0.024338 0.009234
21 0.028212 0.005137 0.742922 0.88829 8.855 1.5137 {'gamma': 1.5137, 'C': 8.855} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.002840 0.000454 0.024338 0.009234
22 0.029147 0.004723 0.742922 0.88829 8.855 1.514 {'gamma': 1.514, 'C': 8.855} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.004498 0.000132 0.024338 0.009234
23 0.026079 0.004897 0.742922 0.88829 8.855 1.5142 {'gamma': 1.5142, 'C': 8.855} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.002749 0.000382 0.024338 0.009234
24 0.024683 0.004708 0.742922 0.88829 8.855 1.5145 {'gamma': 1.5145, 'C': 8.855} 1 0.70852 0.900222 0.759184 0.877729 0.761062 0.886918 0.004048 0.000112 0.024338 0.009234
In [85]:
# Support Vector Machine, trained with the best parameters found by the grid search

svmachine = svm.SVC(C=8.847, gamma=1.5135, kernel='rbf', cache_size=500, class_weight='balanced')
svmachine = svmachine.fit( train_data[:,1:], train_data[:,0] )

print('Support Vector Machine - Trained')
Support Vector Machine - Trained
In [86]:
output = svmachine.predict(test_data).astype(int)

# Write the predictions to a Kaggle submission file (newline='' avoids blank lines on Windows)
with open("svmachine.csv", "w", newline="") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))

print('Support Vector Machine - Predicted')
Support Vector Machine - Predicted
In [87]:
# cross_val_score uses stratified k-fold cross-validation with 10 folds

svmachine_cv_scores = pd.DataFrame({"accuracy": cross_val_score(svmachine, train_data[:,1:], y=train_data[:,0], cv=10, scoring="accuracy"),
                                 "precision": cross_val_score(svmachine, train_data[:,1:], y=train_data[:,0], cv=10, scoring="precision"),
                                 "recall": cross_val_score(svmachine, train_data[:,1:], y=train_data[:,0], cv=10, scoring="recall"),
                                 "f1": cross_val_score(svmachine, train_data[:,1:], y=train_data[:,0], cv=10, scoring="f1")
                                 })

print('Support Vector Machine - Cross-validated')
Support Vector Machine - Cross-validated
In [88]:
# Display all the measure scores for each fold

svmachine_cv_scores.to_csv("svmachine_cv_scores.csv")

svmachine_cv_scores
Out[88]:
accuracy f1 precision recall
0 0.800000 0.742857 0.742857 0.742857
1 0.822222 0.764706 0.787879 0.742857
2 0.764045 0.644068 0.760000 0.558824
3 0.752809 0.717949 0.636364 0.823529
4 0.898876 0.869565 0.857143 0.882353
5 0.853933 0.816901 0.783784 0.852941
6 0.831461 0.761905 0.827586 0.705882
7 0.853933 0.811594 0.800000 0.823529
8 0.808989 0.746269 0.757576 0.735294
9 0.795455 0.727273 0.750000 0.705882
In [89]:
# Mean values of measure scores

svmachine_cv_scores.mean()
Out[89]:
accuracy     0.818172
f1           0.760309
precision    0.770319
recall       0.757395
dtype: float64
In [90]:
# Standard Deviations of measure scores

svmachine_cv_scores.std()
Out[90]:
accuracy     0.043965
f1           0.062085
precision    0.059225
recall       0.093773
dtype: float64
In [91]:
# 95% Confidence Interval on f1 scores (normal approximation)

stats.norm.interval(0.95, loc=svmachine_cv_scores['f1'].mean(), scale=svmachine_cv_scores['f1'].std())
Out[91]:
(0.63862384460437693, 0.88199345827631082)
In [92]:
# Student's paired t-test on the f1 scores of the two classifiers
# The high p-value (~0.60) means the difference in mean f1 between the forest and the SVM is not statistically significant

t_stat, p_val = ttest_rel(forest_cv_scores['f1'], svmachine_cv_scores['f1'])
print(t_stat, p_val)
-0.54661666845 0.597930735175
In [ ]: