# Import required packages
import pandas as pd
import numpy as np
import pylab as plt
import csv as csv
import seaborn as sns
from scipy import stats

# Import Cross Validation Score
from sklearn.model_selection import cross_val_score

# Import split tool
from sklearn.model_selection import train_test_split

# Import GridSearch
from sklearn.model_selection import GridSearchCV

# Import Student t-test
from scipy.stats import ttest_rel

# Import DummyClassifier
from sklearn.dummy import DummyClassifier

# Import the Random Forest package
from sklearn.ensemble import RandomForestClassifier

# Import the Support Vector Machines package
from sklearn import svm

# Load the training set into a DataFrame.
# For .read_csv, always use header=0 when you know row 0 is the header row
train_df = pd.read_csv('train.csv', header=0)

# Examine the data (dtypes and non-null counts per column) - note that Age,
# Cabin and Embarked have missing values
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

# Examine Cabin data
# CabinLetter is the first character of the Cabin string
cabin_column = train_df['Cabin']
train_df['CabinLetter'] = cabin_column.str[0]

# Only passengers with a CabinLetter are plotted
has_cabin_letter = train_df['CabinLetter'].notnull()
plot_df = train_df[has_cabin_letter]
cabins_sorted = plot_df.sort_values(by='CabinLetter')

# Visualisation: three panels side by side
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Panel 1: number of passengers per CabinLetter
sns.countplot(data=cabins_sorted, x='CabinLetter', ax=axis1)

# Panel 2: mean of Survived for each CabinLetter
cabin_survival = plot_df[['CabinLetter', 'Survived']]
cabin_perc = cabin_survival.groupby('CabinLetter', as_index=False).mean()
sns.barplot(data=cabin_perc.sort_values(by='CabinLetter'), x='CabinLetter', y='Survived', ax=axis2)

# Panel 3: number of passengers per Passenger Class, split by CabinLetter
sns.countplot(data=cabins_sorted, x='Pclass', hue='CabinLetter', ax=axis3)

<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2da9a5f8>

# Survival rate varies somewhat with CabinLetter, but the missing values are hard
# to deduce from other columns: e.g. for Passenger Class 1 the CabinLetter can be
# anything from A-E.
# The CabinLetter column is therefore dropped again.

train_df = train_df.drop('CabinLetter', axis='columns')

train_df.head(n=5)

# Examine Sex data

# Visualisation: three panels side by side
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Panel 1: number of passengers of each Sex
sns.countplot(data=train_df, x='Sex', ax=axis1)

# Panel 2: passengers per Survived value (1 first, then 0), split by Sex
sns.countplot(data=train_df, x='Survived', hue='Sex', order=[1, 0], ax=axis2)

# Panel 3: mean of Survived for each Sex
sex_survival = train_df[['Sex', 'Survived']]
sex_perc = sex_survival.groupby('Sex', as_index=False).mean()
sns.barplot(data=sex_perc, x='Sex', y='Survived', order=['female', 'male'], ax=axis3)

<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2d88b7f0>

# Sex makes a significant difference to mean Survival rate, so keep the data

# Machine Learning needs numbers, so Sex is encoded into a new float column
# called Gender: female -> 0, male -> 1
gender_codes = {'female': 0, 'male': 1}
train_df['Gender'] = train_df['Sex'].map(gender_codes).astype(float)

train_df.head(n=5)

# Examine Embarked data
# Only passengers with a known port of embarkation are plotted
has_port = train_df['Embarked'].notnull()
plot_df = train_df[has_port]

# Visualisation: three panels side by side
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Panel 1: number of passengers per port of Embarkation
sns.countplot(data=plot_df, x='Embarked', ax=axis1)

# Panel 2: passengers per Survived value (1 first, then 0), split by port
sns.countplot(data=plot_df, x='Survived', hue='Embarked', order=[1, 0], ax=axis2)

# Panel 3: mean of Survived for each port, shown in the order S, C, Q
port_survival = plot_df[['Embarked', 'Survived']]
embark_perc = port_survival.groupby('Embarked', as_index=False).mean()
sns.barplot(data=embark_perc, x='Embarked', y='Survived', order=['S', 'C', 'Q'], ax=axis3)

<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2d7bb748>

# Numeric representation for Embarked, stored in the new column Port.
# Missing values are replaced with S (mapped to 0) as it is by far the most common port
port_codes = {'S': 0.0, 'C': 0.5, 'Q': 1.0}
embarked_filled = train_df['Embarked'].fillna('S')
train_df['Port'] = embarked_filled.map(port_codes).astype(float)

train_df.head(n=5)

# Any relationship between fare and survival?

fare_bin_size = 5

# Number of equal-width bins covering fares 0-100. Integer division keeps the
# bin count an int (100/fare_bin_size would be the float 20.0 under Python 3).
fare_bin_count = 100 // fare_bin_size

# Passengers per fare bin (fares outside range=(0,100) are not counted)
bin_count, bin_edges, binnumber = stats.binned_statistic(train_df['Fare'], train_df['Fare'], statistic='count', bins=fare_bin_count, range=(0,100))
# Mean of Survived per fare bin
bin_means, bin_edges, binnumber = stats.binned_statistic(train_df['Fare'], train_df['Survived'], statistic='mean', bins=fare_bin_count, range=(0,100))

# One row per bin, labelled by the bin's upper edge
fare_survived_df = pd.DataFrame({'FareBin': bin_edges[1:], 'FareCount': bin_count, 'MeanSurv': bin_means})

# Visualisation: count per bin (left) and mean survival per bin (right)
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(20,5))

axes = sns.barplot(x='FareBin', y='FareCount', data=fare_survived_df, ax=axis1)
axes.set(xlabel = 'Fare', ylabel = 'count')

axes = sns.barplot(x='FareBin', y='MeanSurv', data=fare_survived_df, ax=axis2)
axes.set(xlabel = 'Fare', ylabel = 'mean(Survived)')

[<matplotlib.text.Text at 0x7fbe2d6540b8>,
 <matplotlib.text.Text at 0x7fbe2d647dd8>]

# Any relationship between age and survival?

age_bin_size = 5

# Number of equal-width bins covering ages 0-100. Integer division keeps the
# bin count an int (100/age_bin_size would be the float 20.0 under Python 3).
age_bin_count = 100 // age_bin_size

# Only passengers with a recorded Age can be binned. Missing ages are NaN,
# which binned_statistic may reject as non-finite input.
age_known_df = train_df[train_df['Age'].notnull()]

# Passengers per age bin
bin_count, bin_edges, binnumber = stats.binned_statistic(age_known_df['Age'], age_known_df['Age'], statistic='count', bins=age_bin_count, range=(0,100))
# Mean of Survived per age bin
bin_means, bin_edges, binnumber = stats.binned_statistic(age_known_df['Age'], age_known_df['Survived'], statistic='mean', bins=age_bin_count, range=(0,100))

# One row per bin, labelled by the bin's upper edge
age_survived_df = pd.DataFrame({'AgeBin': bin_edges[1:], 'AgeCount': bin_count, 'MeanSurv': bin_means})

# Visualisation: count per bin (left) and mean survival per bin (right)
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(20,5))

axes = sns.barplot(x='AgeBin', y='AgeCount', data=age_survived_df, ax=axis1)
axes.set(xlabel = 'Age', ylabel = 'count')

axes = sns.barplot(x='AgeBin', y='MeanSurv', data=age_survived_df, ax=axis2)
axes.set(xlabel = 'Age', ylabel = 'mean(Survived)')

[<matplotlib.text.Text at 0x7fbe2d45a9b0>,
 <matplotlib.text.Text at 0x7fbe2d450160>]

# Age appears to make a difference to mean Survival rate, so keep the data
# Process it for Machine Learning

# Collect the passengers whose Age is missing and look at the first few
age_is_missing = train_df['Age'].isnull()
agena_df = train_df[age_is_missing]
agena_df.head(n=5)

# Any relationship between age and passenger class?

# Passengers whose Age is recorded. plot_df must not be used for this: it was
# last filtered on Embarked, so it still contains rows with a missing Age.
age_known_df = train_df[train_df['Age'].notnull()]

# Distribution of Pclass values among passengers, where age is known
count_Pclass = age_known_df[['Pclass', 'PassengerId']].groupby(['Pclass'],as_index=False).count()

# Distribution of Pclass values among passengers, where age is unknown
count_agena_Pclass = agena_df[['Pclass', 'PassengerId']].groupby(['Pclass'],as_index=False).count()

# Mean and median age of passengers by Pclass value
mean_age = age_known_df[['Pclass', 'Age']].groupby(['Pclass'],as_index=False).mean()
median_age = age_known_df[['Pclass', 'Age']].groupby(['Pclass'],as_index=False).median()

# Visualisation: known-age counts, unknown-age counts, mean age, median age
fig, (axis1,axis2,axis3,axis4) = plt.subplots(1,4,figsize=(15,5))

axes = sns.barplot(x='Pclass', y='PassengerId', data=count_Pclass, ax=axis1)
axes.set(ylabel = 'count(Age known)')

axes = sns.barplot(x='Pclass', y='PassengerId', data=count_agena_Pclass, ax=axis2)
axes.set(ylabel = 'count(Age unknown)')

# Label the mean panel explicitly so it can be told apart from the median panel
axes = sns.barplot(x='Pclass', y='Age', data=mean_age, ax=axis3)
axes.set(ylabel = 'mean(Age)')

axes = sns.barplot(x='Pclass', y='Age', data=median_age, ax=axis4)
axes.set(ylabel = 'median(Age)')

[<matplotlib.text.Text at 0x7fbe2d244dd8>]

# Any relationship between age and fare paid?

# Scatter plot (no regression line) of Age against Fare for passengers whose
# Age is recorded. x and y are passed by keyword because current seaborn
# releases no longer accept them positionally.
age_known_df = train_df[train_df['Age'].notnull()]
axes = sns.lmplot(x='Fare', y='Age', data=age_known_df, fit_reg=False)

axes.set(xlabel="Fare", ylabel="Age")

<seaborn.axisgrid.FacetGrid at 0x7fbe2d701518>

# Any relationship between age and sex?

# Passengers whose Age is recorded. plot_df must not be used for this: it was
# last filtered on Embarked, so it drops passengers with a missing port.
age_known_df = train_df[train_df['Age'].notnull()]

# Visualisation: mean age (left) and median age (right) for each Sex
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))

mean_age = age_known_df[['Sex', 'Age']].groupby(['Sex'],as_index=False).mean()
median_age = age_known_df[['Sex', 'Age']].groupby(['Sex'],as_index=False).median()

# Label the mean panel explicitly so it can be told apart from the median panel
axes = sns.barplot(x='Sex', y='Age', data=mean_age, ax=axis1)
axes.set(ylabel = 'mean(Age)')

axes = sns.barplot(x='Sex', y='Age', data=median_age, ax=axis2)
axes.set(ylabel = 'median(Age)')

[<matplotlib.text.Text at 0x7fbe2d057e48>]

# Any relationship between age and sibling/spouse?

# Passengers whose Age is recorded. plot_df must not be used for this: it was
# last filtered on Embarked, so it still contains rows with a missing Age.
age_known_df = train_df[train_df['Age'].notnull()]

# Distribution of SibSp values among passengers, where age is known
count_sibsp = age_known_df[['SibSp', 'PassengerId']].groupby(['SibSp'],as_index=False).count()

# Distribution of SibSp values among passengers, where age is unknown
count_agena_sibsp = agena_df[['SibSp', 'PassengerId']].groupby(['SibSp'],as_index=False).count()

# Mean and median age of passengers by SibSp value
mean_age = age_known_df[['SibSp', 'Age']].groupby(['SibSp'],as_index=False).mean()
median_age = age_known_df[['SibSp', 'Age']].groupby(['SibSp'],as_index=False).median()

# Visualisation: known-age counts, unknown-age counts, mean age, median age
fig, (axis1,axis2,axis3,axis4) = plt.subplots(1,4,figsize=(15,5))

axes = sns.barplot(x='SibSp', y='PassengerId', data=count_sibsp, ax=axis1)
axes.set(ylabel = 'count(Age known)')

axes = sns.barplot(x='SibSp', y='PassengerId', data=count_agena_sibsp, ax=axis2)
axes.set(ylabel = 'count(Age unknown)')

# Label the mean panel explicitly so it can be told apart from the median panel
axes = sns.barplot(x='SibSp', y='Age', data=mean_age, ax=axis3)
axes.set(ylabel = 'mean(Age)')

axes = sns.barplot(x='SibSp', y='Age', data=median_age, ax=axis4)
axes.set(ylabel = 'median(Age)')

[<matplotlib.text.Text at 0x7fbe2cf2ccc0>]

# Any relationship between age and parent/child?

# Passengers whose Age is recorded. plot_df must not be used for this: it was
# last filtered on Embarked, so it still contains rows with a missing Age.
age_known_df = train_df[train_df['Age'].notnull()]

# Distribution of Parch values among passengers, where age is known
count_Parch = age_known_df[['Parch', 'PassengerId']].groupby(['Parch'],as_index=False).count()

# Distribution of Parch values among passengers, where age is unknown
count_agena_Parch = agena_df[['Parch', 'PassengerId']].groupby(['Parch'],as_index=False).count()

# Mean and median age of passengers by Parch value
mean_age = age_known_df[['Parch', 'Age']].groupby(['Parch'],as_index=False).mean()
median_age = age_known_df[['Parch', 'Age']].groupby(['Parch'],as_index=False).median()

# Visualisation: known-age counts, unknown-age counts, mean age, median age
fig, (axis1,axis2,axis3,axis4) = plt.subplots(1,4,figsize=(15,5))

axes = sns.barplot(x='Parch', y='PassengerId', data=count_Parch, ax=axis1)
axes.set(ylabel = 'count(Age known)')

axes = sns.barplot(x='Parch', y='PassengerId', data=count_agena_Parch, ax=axis2)
axes.set(ylabel = 'count(Age unknown)')

# Label the mean panel explicitly so it can be told apart from the median panel
axes = sns.barplot(x='Parch', y='Age', data=mean_age, ax=axis3)
axes.set(ylabel = 'mean(Age)')

axes = sns.barplot(x='Parch', y='Age', data=median_age, ax=axis4)
axes.set(ylabel = 'median(Age)')

[<matplotlib.text.Text at 0x7fbe2ccdf160>]

# Missing ages are filled with the median age of passengers that share the same
# Gender and Passenger Class
gender_values = (0, 1)
class_values = (1, 2, 3)

# median_ages[g, c-1] holds the median known Age for Gender g and Pclass c
median_ages = np.zeros((len(gender_values), len(class_values)))
for g in gender_values:
    for c in class_values:
        same_group = (train_df['Gender'] == g) & (train_df['Pclass'] == c)
        median_ages[g, c - 1] = train_df.loc[same_group, 'Age'].dropna().median()

# AgeFill starts as a copy of Age; only the missing entries are overwritten
train_df['AgeFill'] = train_df['Age']
age_is_missing = train_df['Age'].isnull()

for g in gender_values:
    for c in class_values:
        same_group = (train_df['Gender'] == g) & (train_df['Pclass'] == c)
        train_df.loc[age_is_missing & same_group, 'AgeFill'] = median_ages[g, c - 1]

# Scale from 0 to 1 (ages are divided by 100)
scaled_age = train_df['AgeFill'] / 100
train_df['AgeFill'] = scaled_age

# Show the first rows whose original Age was missing
train_df[train_df['Age'].isnull()].head(n=5)

# Examine SibSp data

# Visualisation: three panels side by side
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Panel 1: number of passengers per SibSp value
sns.countplot(data=train_df, x='SibSp', ax=axis1)

# Panel 2: passengers per Survived value (1 first, then 0), split by SibSp
sns.countplot(data=train_df, x='Survived', hue='SibSp', order=[1, 0], ax=axis2)

# Panel 3: mean of Survived for each value in SibSp
sibsp_survival = train_df[['SibSp', 'Survived']]
sibsp_mean = sibsp_survival.groupby('SibSp', as_index=False).mean()
sns.barplot(data=sibsp_mean, x='SibSp', y='Survived', ax=axis3)

<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2cb1a828>

# Examine Parch data

# Visualisation: three panels side by side
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Panel 1: number of passengers per Parch value
sns.countplot(data=train_df, x='Parch', ax=axis1)

# Panel 2: passengers per Survived value (1 first, then 0), split by Parch
sns.countplot(data=train_df, x='Survived', hue='Parch', order=[1, 0], ax=axis2)

# Panel 3: mean of Survived for each value in Parch
parch_survival = train_df[['Parch', 'Survived']]
parch_mean = parch_survival.groupby('Parch', as_index=False).mean()
sns.barplot(data=parch_mean, x='Parch', y='Survived', ax=axis3)

<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2c9c3940>

# Both SibSp and Parch appear to make a difference to a passenger's chance of
# surviving - for non-zero values the chances are higher.
# However, there are not very many non-zero values of each, so the two are
# combined into a new column for total family size.

siblings_spouses = train_df['SibSp']
parents_children = train_df['Parch']
train_df['FamilySize'] = siblings_spouses + parents_children

train_df.head(n=10)

# Examine FamilySize data

# Visualisation: three panels side by side
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Panel 1: number of passengers per FamilySize value
sns.countplot(data=train_df, x='FamilySize', ax=axis1)

# Panel 2: passengers per Survived value (1 first, then 0), split by FamilySize
sns.countplot(data=train_df, x='Survived', hue='FamilySize', order=[1, 0], ax=axis2)

# Panel 3: mean of Survived for each value in FamilySize
familysize_survival = train_df[['FamilySize', 'Survived']]
familysize_mean = familysize_survival.groupby('FamilySize', as_index=False).mean()
sns.barplot(data=familysize_mean, x='FamilySize', y='Survived', ax=axis3)

<matplotlib.axes._subplots.AxesSubplot at 0x7fbe2c7f0b38>

# Scale SibSp, Parch, FamilySize and Pclass
# Each scaled copy is stored under the source column's name with an 'S' suffix

for source_column, divisor in (('SibSp', 10), ('Parch', 10), ('FamilySize', 10), ('Pclass', 3)):
    train_df[source_column + 'S'] = train_df[source_column].astype(float) / divisor

train_df

# Drop unused columns - this includes those that have been replaced

unused_columns = ['Age', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'PassengerId', 'SibSp', 'Parch', 'FamilySize', 'Pclass']
train_df = train_df.drop(unused_columns, axis='columns')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived       891 non-null int64
Fare           891 non-null float64
Gender         891 non-null float64
Port           891 non-null float64
AgeFill        891 non-null float64
SibSpS         891 non-null float64
ParchS         891 non-null float64
FamilySizeS    891 non-null float64
PclassS        891 non-null float64
dtypes: float64(8), int64(1)
memory usage: 62.7 KB

# TEST DATA
# Load the test file into a dataframe
test_df = pd.read_csv('test.csv', header=0)

# Repeat the data processing of the training data with the test data, so that the columns are the same

# Numeric representation for Sex: female -> 0, male -> 1
gender_codes = {'female': 0, 'male': 1}
test_df['Gender'] = test_df['Sex'].map(gender_codes).astype(float)

# Numeric representation for Embarked port; missing ports are treated as S
port_codes = {'S': 0.0, 'C': 0.5, 'Q': 1.0}
test_df['Port'] = test_df['Embarked'].fillna('S').map(port_codes).astype(float)

# Missing ages are filled with the median age of passengers that share the same
# Gender and Passenger Class. The medians are computed from test_df itself.
gender_values = (0, 1)
class_values = (1, 2, 3)

# median_ages[g, c-1] holds the median known Age for Gender g and Pclass c
median_ages = np.zeros((len(gender_values), len(class_values)))
for g in gender_values:
    for c in class_values:
        same_group = (test_df['Gender'] == g) & (test_df['Pclass'] == c)
        median_ages[g, c - 1] = test_df.loc[same_group, 'Age'].dropna().median()

# AgeFill starts as a copy of Age; only the missing entries are overwritten
test_df['AgeFill'] = test_df['Age']
age_is_missing = test_df['Age'].isnull()

for g in gender_values:
    for c in class_values:
        same_group = (test_df['Gender'] == g) & (test_df['Pclass'] == c)
        test_df.loc[age_is_missing & same_group, 'AgeFill'] = median_ages[g, c - 1]

# Scale from 0 to 1 (ages are divided by 100)
scaled_age = test_df['AgeFill'] / 100
test_df['AgeFill'] = scaled_age
        
# Add FamilySize column (SibSp plus Parch)

siblings_spouses = test_df['SibSp']
parents_children = test_df['Parch']
test_df['FamilySize'] = siblings_spouses + parents_children

# All the missing Fares -> assume median of their respective class
fare_is_missing = test_df['Fare'].isnull()
if fare_is_missing.any():
    median_fare = np.zeros(3)
    # First pass: median known Fare for Pclass 1, 2 and 3
    for f in range(3):
        median_fare[f] = test_df.loc[test_df['Pclass'] == f + 1, 'Fare'].dropna().median()
    # Second pass: write the class median into each missing Fare
    for f in range(3):
        test_df.loc[fare_is_missing & (test_df['Pclass'] == f + 1), 'Fare'] = median_fare[f]

# Keep the test PassengerIds: the column itself is dropped further down
ids = test_df['PassengerId'].values

# Scaling
# Each scaled copy is stored under the source column's name with an 'S' suffix
for source_column, divisor in (('SibSp', 10), ('Parch', 10), ('FamilySize', 10), ('Pclass', 3)):
    test_df[source_column + 'S'] = test_df[source_column].astype(float) / divisor

# Remove the unused/replaced columns
unused_columns = ['Age', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'PassengerId', 'SibSp', 'Parch', 'FamilySize', 'Pclass']
test_df = test_df.drop(unused_columns, axis='columns')

# Both frames are now fully processed, so the models can be fitted on the
# training data and then used to predict on the test data.
# Convert each DataFrame back to a numpy array
train_data, test_data = train_df.values, test_df.values

test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Fare           418 non-null float64
Gender         418 non-null float64
Port           418 non-null float64
AgeFill        418 non-null float64
SibSpS         418 non-null float64
ParchS         418 non-null float64
FamilySizeS    418 non-null float64
PclassS        418 non-null float64
dtypes: float64(8)
memory usage: 26.2 KB

# Control classifier - always chooses the most frequent class

control = DummyClassifier(strategy='most_frequent')

# Column 0 of train_data is the target; the remaining columns are the features
X_train = train_data[:, 1:]
y_train = train_data[:, 0]

# Use Stratified k-fold cross-validation for 10 folds, run once per metric

control_cv_scores = pd.DataFrame({
    metric: cross_val_score(control, X_train, y=y_train, cv=10, scoring=metric)
    for metric in ("accuracy", "precision", "recall", "f1")
})

print('Control Classifier - Cross-validated')

Control Classifier - Cross-validated

/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/projects/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)

# Display all the measure scores for each fold

# Save the per-fold scores to a CSV file as well
control_cv_scores.to_csv("control_cv_scores.csv")

control_cv_scores

# Mean values of measure scores (one mean per metric column)

control_cv_scores.mean()

accuracy     0.61617
f1           0.00000
precision    0.00000
recall       0.00000
dtype: float64

# Standard deviation values of measure scores
# (.std() is the spread across folds; .mean() would only repeat the means)

control_cv_scores.std()

accuracy     0.61617
f1           0.00000
precision    0.00000
recall       0.00000
dtype: float64

# Grid search for Random Forest parameter optimisation

# Candidate values: 3 forest sizes x 2 max_features settings
rf_candidates = {'n_estimators': [100, 200, 500], 'max_features': ['sqrt', None]}
params = [rf_candidates]

rf = RandomForestClassifier()

# 5-fold cross-validated search, scored on F1
grid = GridSearchCV(estimator=rf, param_grid=params, scoring='f1', cv=5, verbose=3)

# Column 0 of train_data is the target; the remaining columns are the features
X_train = train_data[:, 1:]
y_train = train_data[:, 0]
grid.fit(X_train, y_train)

print(grid.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] .... max_features=sqrt, n_estimators=100, score=0.719424 -   0.0s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] .... max_features=sqrt, n_estimators=100, score=0.727273 -   0.0s
[CV] max_features=sqrt, n_estimators=100 .............................

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s

[CV] .... max_features=sqrt, n_estimators=100, score=0.802920 -   0.0s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] .... max_features=sqrt, n_estimators=100, score=0.661157 -   0.0s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] .... max_features=sqrt, n_estimators=100, score=0.797101 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.737589 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.723404 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.811594 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.672131 -   0.0s
[CV] max_features=sqrt, n_estimators=200 .............................
[CV] .... max_features=sqrt, n_estimators=200, score=0.780142 -   0.0s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.746479 -   0.1s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.746269 -   0.1s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.802920 -   0.1s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.688525 -   0.1s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] .... max_features=sqrt, n_estimators=500, score=0.771429 -   0.1s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.700730 -   0.0s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.728571 -   0.0s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.800000 -   0.0s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.688525 -   0.0s
[CV] max_features=None, n_estimators=100 .............................
[CV] .... max_features=None, n_estimators=100, score=0.776978 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.700730 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.746269 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.824427 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.693548 -   0.0s
[CV] max_features=None, n_estimators=200 .............................
[CV] .... max_features=None, n_estimators=200, score=0.762590 -   0.0s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.700000 -   0.1s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.729927 -   0.1s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.800000 -   0.1s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.666667 -   0.1s
[CV] max_features=None, n_estimators=500 .............................
[CV] .... max_features=None, n_estimators=500, score=0.780142 -   0.1s

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   16.1s finished

{'max_features': 'sqrt', 'n_estimators': 500}

# Random Forest

# Fit the forest on every training row: column 0 of train_data is the target,
# the remaining columns are the features.
# NOTE(review): n_estimators and max_features are hard-coded here rather than
# read from the preceding grid search (grid.best_params_) - confirm this is intended.
X_train = train_data[:, 1:]
y_train = train_data[:, 0]
forest = RandomForestClassifier(max_features=None, n_estimators=100).fit(X_train, y_train)

print('Random Forest - Trained')

Random Forest - Trained

# Predict the test passengers with the fitted forest, as integers
output = forest.predict(test_data).astype(int)

# Write the predictions to randomforest.csv, one (PassengerId, Survived) row per
# test passenger. The with-block closes the file even if a write fails, and
# newline="" lets the csv module control line endings itself (without it, extra
# blank rows can appear on platforms that translate newlines).
with open("randomforest.csv", "w", newline="") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))

print('Random Forest - Predicted')

Random Forest - Predicted

# Cross Validator uses Stratified k-fold cross-validation for 10 folds
# A separate cross-validation is run for each metric

# Column 0 of train_data is the target; the remaining columns are the features
X_train = train_data[:, 1:]
y_train = train_data[:, 0]

forest_cv_scores = pd.DataFrame({
    metric: cross_val_score(forest, X_train, y=y_train, cv=10, scoring=metric)
    for metric in ("accuracy", "precision", "recall", "f1")
})

print('Random Forest - Cross-validated')

Random Forest - Cross-validated

# Display all the measure scores for each fold

# Save the per-fold scores to a CSV file as well
forest_cv_scores.to_csv("forest_cv_scores.csv")

forest_cv_scores

# Mean values of measure scores (one mean per metric column)

forest_cv_scores.mean()

accuracy     0.811595
f1           0.748336
precision    0.760209
recall       0.731429
dtype: float64

# Standard Deviations of measure scores (spread across the folds, per metric)

forest_cv_scores.std()

accuracy     0.041157
f1           0.062458
precision    0.076585
recall       0.110847
dtype: float64

# 95% Confidence Interval for the mean F1 score across the folds
# The interval is built from the standard error of the mean (std / sqrt(n)) and
# a Student t distribution with n-1 degrees of freedom. Using the raw standard
# deviation as the scale would describe the spread of single-fold scores
# instead of the uncertainty of the mean.

f1_scores = forest_cv_scores['f1']
stats.t.interval(0.95, len(f1_scores) - 1, loc=f1_scores.mean(), scale=stats.sem(f1_scores))

(0.62592046904632481, 0.87075181228871823)

# Grid search for Support Vector Machine parameter optimisation

# Candidate values: 5 values of C x 5 values of gamma
c_list = [8.845, 8.847, 8.85, 8.852, 8.855]
gamma_list = [1.5135, 1.5137, 1.514, 1.5142, 1.5145]
svm_candidates = {'C': c_list, 'gamma': gamma_list}
params = [svm_candidates]

# RBF-kernel classifier with balanced class weights; 3-fold search scored on F1
rbf_svc = svm.SVC(kernel='rbf', cache_size=500, class_weight='balanced')
grid = GridSearchCV(estimator=rbf_svc, param_grid=params, scoring='f1', cv=3, verbose=3)

# Column 0 of train_data is the target; the remaining columns are the features
X_train = train_data[:, 1:]
y_train = train_data[:, 0]
grid.fit(X_train, y_train)

print(grid.best_params_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] gamma=1.5135, C=8.845 ...........................................
[CV] .................. gamma=1.5135, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.845 ...........................................
[CV] .................. gamma=1.5135, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.5135, C=8.845 ...........................................
[CV] .................. gamma=1.5135, C=8.845, score=0.755556 -   0.0s
[CV] gamma=1.5137, C=8.845 ...........................................
[CV] .................. gamma=1.5137, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.845 ...........................................

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s

[CV] .................. gamma=1.5137, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.5137, C=8.845 ...........................................
[CV] .................. gamma=1.5137, C=8.845, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.845 ............................................
[CV] ................... gamma=1.514, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.845 ............................................
[CV] ................... gamma=1.514, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.514, C=8.845 ............................................
[CV] ................... gamma=1.514, C=8.845, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.845 ...........................................
[CV] .................. gamma=1.5142, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.845 ...........................................
[CV] .................. gamma=1.5142, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.5142, C=8.845 ...........................................
[CV] .................. gamma=1.5142, C=8.845, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.845 ...........................................
[CV] .................. gamma=1.5145, C=8.845, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.845 ...........................................
[CV] .................. gamma=1.5145, C=8.845, score=0.756098 -   0.0s
[CV] gamma=1.5145, C=8.845 ...........................................
[CV] .................. gamma=1.5145, C=8.845, score=0.761062 -   0.0s
[CV] gamma=1.5135, C=8.847 ...........................................
[CV] .................. gamma=1.5135, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.847 ...........................................
[CV] .................. gamma=1.5135, C=8.847, score=0.759184 -   0.0s
[CV] gamma=1.5135, C=8.847 ...........................................
[CV] .................. gamma=1.5135, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.5137, C=8.847 ...........................................
[CV] .................. gamma=1.5137, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.847 ...........................................
[CV] .................. gamma=1.5137, C=8.847, score=0.759184 -   0.0s
[CV] gamma=1.5137, C=8.847 ...........................................
[CV] .................. gamma=1.5137, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.847 ............................................
[CV] ................... gamma=1.514, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.847 ............................................
[CV] ................... gamma=1.514, C=8.847, score=0.756098 -   0.0s
[CV] gamma=1.514, C=8.847 ............................................
[CV] ................... gamma=1.514, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.847 ...........................................
[CV] .................. gamma=1.5142, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.847 ...........................................
[CV] .................. gamma=1.5142, C=8.847, score=0.759184 -   0.0s
[CV] gamma=1.5142, C=8.847 ...........................................
[CV] .................. gamma=1.5142, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.847 ...........................................
[CV] .................. gamma=1.5145, C=8.847, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.847 ...........................................
[CV] .................. gamma=1.5145, C=8.847, score=0.756098 -   0.0s
[CV] gamma=1.5145, C=8.847 ...........................................
[CV] .................. gamma=1.5145, C=8.847, score=0.761062 -   0.0s
[CV] gamma=1.5135, C=8.85 ............................................
[CV] ................... gamma=1.5135, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.85 ............................................
[CV] ................... gamma=1.5135, C=8.85, score=0.759184 -   0.0s
[CV] gamma=1.5135, C=8.85 ............................................
[CV] ................... gamma=1.5135, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.5137, C=8.85 ............................................
[CV] ................... gamma=1.5137, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.85 ............................................
[CV] ................... gamma=1.5137, C=8.85, score=0.756098 -   0.0s
[CV] gamma=1.5137, C=8.85 ............................................
[CV] ................... gamma=1.5137, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.85 .............................................
[CV] .................... gamma=1.514, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.85 .............................................
[CV] .................... gamma=1.514, C=8.85, score=0.759184 -   0.0s
[CV] gamma=1.514, C=8.85 .............................................
[CV] .................... gamma=1.514, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.85 ............................................
[CV] ................... gamma=1.5142, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.85 ............................................
[CV] ................... gamma=1.5142, C=8.85, score=0.759184 -   0.0s
[CV] gamma=1.5142, C=8.85 ............................................
[CV] ................... gamma=1.5142, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.85 ............................................
[CV] ................... gamma=1.5145, C=8.85, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.85 ............................................
[CV] ................... gamma=1.5145, C=8.85, score=0.759184 -   0.0s
[CV] gamma=1.5145, C=8.85 ............................................
[CV] ................... gamma=1.5145, C=8.85, score=0.761062 -   0.0s
[CV] gamma=1.5135, C=8.852 ...........................................
[CV] .................. gamma=1.5135, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.852 ...........................................
[CV] .................. gamma=1.5135, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.5135, C=8.852 ...........................................
[CV] .................. gamma=1.5135, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.5137, C=8.852 ...........................................
[CV] .................. gamma=1.5137, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.852 ...........................................
[CV] .................. gamma=1.5137, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.5137, C=8.852 ...........................................
[CV] .................. gamma=1.5137, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.852 ............................................
[CV] ................... gamma=1.514, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.852 ............................................
[CV] ................... gamma=1.514, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.514, C=8.852 ............................................
[CV] ................... gamma=1.514, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.852 ...........................................
[CV] .................. gamma=1.5142, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.852 ...........................................
[CV] .................. gamma=1.5142, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.5142, C=8.852 ...........................................
[CV] .................. gamma=1.5142, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.852 ...........................................
[CV] .................. gamma=1.5145, C=8.852, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.852 ...........................................
[CV] .................. gamma=1.5145, C=8.852, score=0.759184 -   0.0s
[CV] gamma=1.5145, C=8.852 ...........................................
[CV] .................. gamma=1.5145, C=8.852, score=0.761062 -   0.0s
[CV] gamma=1.5135, C=8.855 ...........................................
[CV] .................. gamma=1.5135, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.5135, C=8.855 ...........................................
[CV] .................. gamma=1.5135, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.5135, C=8.855 ...........................................
[CV] .................. gamma=1.5135, C=8.855, score=0.761062 -   0.0s
[CV] gamma=1.5137, C=8.855 ...........................................
[CV] .................. gamma=1.5137, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.5137, C=8.855 ...........................................
[CV] .................. gamma=1.5137, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.5137, C=8.855 ...........................................
[CV] .................. gamma=1.5137, C=8.855, score=0.761062 -   0.0s
[CV] gamma=1.514, C=8.855 ............................................
[CV] ................... gamma=1.514, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.514, C=8.855 ............................................
[CV] ................... gamma=1.514, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.514, C=8.855 ............................................
[CV] ................... gamma=1.514, C=8.855, score=0.761062 -   0.0s
[CV] gamma=1.5142, C=8.855 ...........................................
[CV] .................. gamma=1.5142, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.5142, C=8.855 ...........................................
[CV] .................. gamma=1.5142, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.5142, C=8.855 ...........................................
[CV] .................. gamma=1.5142, C=8.855, score=0.761062 -   0.0s
[CV] gamma=1.5145, C=8.855 ...........................................
[CV] .................. gamma=1.5145, C=8.855, score=0.708520 -   0.0s
[CV] gamma=1.5145, C=8.855 ...........................................
[CV] .................. gamma=1.5145, C=8.855, score=0.759184 -   0.0s
[CV] gamma=1.5145, C=8.855 ...........................................
[CV] .................. gamma=1.5145, C=8.855, score=0.761062 -   0.0s
{'gamma': 1.5135, 'C': 8.847}

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    3.2s finished

# Tabulate the per-candidate grid-search results and keep a copy on disk
grid_df = pd.DataFrame(data=grid.cv_results_)
grid_df.to_csv(path_or_buf="svmachine_gridsearch.csv")

# Bare expression: presumably rendered as the cell output when run in a notebook
grid_df

# Support Vector Machine
# Build the class-balanced RBF-kernel SVC and fit it on every row of train_data
# (column 0 is the target, the remaining columns are the features).
# NOTE(review): C and gamma are hard-coded here rather than read from
# grid.best_params_ -- keep them in sync if the grid search is re-run.
svmachine = svm.SVC(
    kernel='rbf',
    C=8.847,
    gamma=1.5135,
    class_weight='balanced',
    cache_size=500,
).fit(train_data[:, 1:], train_data[:, 0])  # fit() returns the estimator itself

print('Support Vector Machine - Trained')

Support Vector Machine - Trained

# Predict on the test set and cast the predicted labels to int before writing
output = svmachine.predict(test_data).astype(int)

# Write the predictions as a two-column CSV (PassengerId, Survived).
# - The with-block guarantees the file is closed even if a write raises.
# - newline="" is what the csv module requires for files handed to
#   csv.writer; without it, extra blank rows appear on platforms that
#   translate "\n" to "\r\n".
with open("svmachine.csv", "w", newline="") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    # ids and output are paired positionally, one row per test passenger
    open_file_object.writerows(zip(ids, output))

print('Support Vector Machine - Predicted')

Support Vector Machine - Predicted

# Score the fitted SVM configuration with 10-fold cross-validation, building
# one column of per-fold scores for each metric.
# With an integer cv and a classifier, cross_val_score uses stratified k-fold.
svmachine_cv_scores = pd.DataFrame({
    metric: cross_val_score(svmachine, train_data[:, 1:], y=train_data[:, 0], cv=10, scoring=metric)
    for metric in ("accuracy", "precision", "recall", "f1")
})

print('Support Vector Machine - Cross-validated')

Support Vector Machine - Cross-validated

# Persist the per-fold metric table, then display it

svmachine_cv_scores.to_csv(path_or_buf="svmachine_cv_scores.csv")

svmachine_cv_scores

# Average of each metric across the folds

svmachine_cv_scores.mean(axis=0)

accuracy     0.818172
f1           0.760309
precision    0.770319
recall       0.757395
dtype: float64

# Spread of each metric across the folds (sample standard deviation, ddof=1)

svmachine_cv_scores.std(axis=0, ddof=1)

accuracy     0.043965
f1           0.062085
precision    0.059225
recall       0.093773
dtype: float64

# 95% interval on the f1 scores under a normal approximation, centred on the
# mean of the per-fold f1 values.
# NOTE(review): scale is the sample std of the fold scores, not the standard
# error of their mean, so this is wider than a confidence interval on the
# mean f1 would be -- confirm which of the two is intended.

stats.norm(
    loc=svmachine_cv_scores['f1'].mean(),
    scale=svmachine_cv_scores['f1'].std(),
).interval(0.95)

(0.63862384460437693, 0.88199345827631082)

# Paired Student t-test comparing the per-fold f1 scores in forest_cv_scores
# against those of the SVM.
# NOTE(review): the pairing assumes both score tables were produced on the
# same cross-validation folds, in the same order -- confirm.

t_stat, p_val = ttest_rel(a=forest_cv_scores['f1'], b=svmachine_cv_scores['f1'])
print(t_stat, p_val)

-0.54661666845 0.597930735175

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Gender
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S	1.0
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C	0.0
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S	0.0
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S	0.0
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S	1.0

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Gender	Port	AgeFill	FamilySize
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S	1.0	0.0	0.22	1
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C	0.0	0.5	0.38	1
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S	0.0	0.0	0.26	0
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S	0.0	0.0	0.35	1
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S	1.0	0.0	0.35	0
5	6	0	3	Moran, Mr. James	male	NaN	0	0	330877	8.4583	NaN	Q	1.0	1.0	0.25	0
6	7	0	1	McCarthy, Mr. Timothy J	male	54.0	0	0	17463	51.8625	E46	S	1.0	0.0	0.54	0
7	8	0	3	Palsson, Master. Gosta Leonard	male	2.0	3	1	349909	21.0750	NaN	S	1.0	0.0	0.02	4
8	9	1	3	Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)	female	27.0	0	2	347742	11.1333	NaN	S	0.0	0.0	0.27	2
9	10	1	2	Nasser, Mrs. Nicholas (Adele Achem)	female	14.0	1	0	237736	30.0708	NaN	C	0.0	0.5	0.14	1

	accuracy	f1	precision	recall
0	0.744444	0.676056	0.648649	0.685714
1	0.800000	0.738462	0.766667	0.628571
2	0.775281	0.644068	0.689655	0.588235
3	0.831461	0.773333	0.731707	0.852941
4	0.842697	0.811594	0.848485	0.852941
5	0.842697	0.764706	0.787879	0.764706
6	0.831461	0.774194	0.857143	0.676471
7	0.752809	0.677419	0.666667	0.588235
8	0.831461	0.800000	0.756757	0.852941
9	0.863636	0.823529	0.848485	0.823529

	mean_fit_time	mean_score_time	mean_test_score	mean_train_score	param_C	param_gamma	params	rank_test_score	split0_test_score	split0_train_score	split1_test_score	split1_train_score	split2_test_score	split2_train_score	std_fit_time	std_score_time	std_test_score	std_train_score
0	0.026519	0.005270	0.740058	0.88829	8.845	1.5135	{'gamma': 1.5135, 'C': 8.845}	25	0.70852	0.900222	0.756098	0.877729	0.755556	0.886918	0.003043	0.000311	0.022302	0.009234
1	0.029775	0.005481	0.741893	0.88829	8.845	1.5137	{'gamma': 1.5137, 'C': 8.845}	18	0.70852	0.900222	0.756098	0.877729	0.761062	0.886918	0.003933	0.000385	0.023685	0.009234
2	0.024944	0.004726	0.741893	0.88829	8.845	1.514	{'gamma': 1.514, 'C': 8.845}	18	0.70852	0.900222	0.756098	0.877729	0.761062	0.886918	0.002504	0.000115	0.023685	0.009234
3	0.028615	0.004974	0.741893	0.88829	8.845	1.5142	{'gamma': 1.5142, 'C': 8.845}	18	0.70852	0.900222	0.756098	0.877729	0.761062	0.886918	0.004213	0.000342	0.023685	0.009234
4	0.028352	0.006070	0.741893	0.88829	8.845	1.5145	{'gamma': 1.5145, 'C': 8.845}	18	0.70852	0.900222	0.756098	0.877729	0.761062	0.886918	0.003812	0.001969	0.023685	0.009234
5	0.029189	0.004819	0.742922	0.88829	8.847	1.5135	{'gamma': 1.5135, 'C': 8.847}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.005666	0.000163	0.024338	0.009234
6	0.025874	0.006638	0.742922	0.88829	8.847	1.5137	{'gamma': 1.5137, 'C': 8.847}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.003883	0.002887	0.024338	0.009234
7	0.026419	0.005511	0.741893	0.88829	8.847	1.514	{'gamma': 1.514, 'C': 8.847}	18	0.70852	0.900222	0.756098	0.877729	0.761062	0.886918	0.002627	0.000643	0.023685	0.009234
8	0.026388	0.004622	0.742922	0.88829	8.847	1.5142	{'gamma': 1.5142, 'C': 8.847}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.002897	0.000022	0.024338	0.009234
9	0.026506	0.004623	0.741893	0.88829	8.847	1.5145	{'gamma': 1.5145, 'C': 8.847}	18	0.70852	0.900222	0.756098	0.877729	0.761062	0.886918	0.002926	0.000066	0.023685	0.009234
10	0.029469	0.004860	0.742922	0.88829	8.85	1.5135	{'gamma': 1.5135, 'C': 8.85}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.004484	0.000307	0.024338	0.009234
11	0.029518	0.004609	0.741893	0.88829	8.85	1.5137	{'gamma': 1.5137, 'C': 8.85}	18	0.70852	0.900222	0.756098	0.877729	0.761062	0.886918	0.003012	0.000063	0.023685	0.009234
12	0.028544	0.004685	0.742922	0.88829	8.85	1.514	{'gamma': 1.514, 'C': 8.85}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.004921	0.000098	0.024338	0.009234
13	0.027720	0.004616	0.742922	0.88829	8.85	1.5142	{'gamma': 1.5142, 'C': 8.85}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.003298	0.000088	0.024338	0.009234
14	0.027366	0.004920	0.742922	0.88829	8.85	1.5145	{'gamma': 1.5145, 'C': 8.85}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.004772	0.000181	0.024338	0.009234
15	0.031986	0.004831	0.742922	0.88829	8.852	1.5135	{'gamma': 1.5135, 'C': 8.852}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.010387	0.000032	0.024338	0.009234
16	0.027930	0.004705	0.742922	0.88829	8.852	1.5137	{'gamma': 1.5137, 'C': 8.852}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.003607	0.000108	0.024338	0.009234
17	0.026094	0.007387	0.742922	0.88829	8.852	1.514	{'gamma': 1.514, 'C': 8.852}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.002769	0.003723	0.024338	0.009234
18	0.026732	0.004754	0.742922	0.88829	8.852	1.5142	{'gamma': 1.5142, 'C': 8.852}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.004105	0.000158	0.024338	0.009234
19	0.026130	0.006103	0.742922	0.88829	8.852	1.5145	{'gamma': 1.5145, 'C': 8.852}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.002678	0.001976	0.024338	0.009234
20	0.027911	0.004642	0.742922	0.88829	8.855	1.5135	{'gamma': 1.5135, 'C': 8.855}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.003728	0.000124	0.024338	0.009234
21	0.028212	0.005137	0.742922	0.88829	8.855	1.5137	{'gamma': 1.5137, 'C': 8.855}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.002840	0.000454	0.024338	0.009234
22	0.029147	0.004723	0.742922	0.88829	8.855	1.514	{'gamma': 1.514, 'C': 8.855}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.004498	0.000132	0.024338	0.009234
23	0.026079	0.004897	0.742922	0.88829	8.855	1.5142	{'gamma': 1.5142, 'C': 8.855}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.002749	0.000382	0.024338	0.009234
24	0.024683	0.004708	0.742922	0.88829	8.855	1.5145	{'gamma': 1.5145, 'C': 8.855}	1	0.70852	0.900222	0.759184	0.877729	0.761062	0.886918	0.004048	0.000112	0.024338	0.009234

	accuracy	f1	precision	recall
0	0.611111	0.0	0.0	0.0
1	0.611111	0.0	0.0	0.0
2	0.617978	0.0	0.0	0.0
3	0.617978	0.0	0.0	0.0
4	0.617978	0.0	0.0	0.0
5	0.617978	0.0	0.0	0.0
6	0.617978	0.0	0.0	0.0
7	0.617978	0.0	0.0	0.0
8	0.617978	0.0	0.0	0.0
9	0.613636	0.0	0.0	0.0

	accuracy	f1	precision	recall
0	0.800000	0.742857	0.742857	0.742857
1	0.822222	0.764706	0.787879	0.742857
2	0.764045	0.644068	0.760000	0.558824
3	0.752809	0.717949	0.636364	0.823529
4	0.898876	0.869565	0.857143	0.882353
5	0.853933	0.816901	0.783784	0.852941
6	0.831461	0.761905	0.827586	0.705882
7	0.853933	0.811594	0.800000	0.823529
8	0.808989	0.746269	0.757576	0.735294
9	0.795455	0.727273	0.750000	0.705882