# Import required packages
import pandas as pd
import numpy as np
import pylab as plt
import csv as csv
import seaborn as sns
from scipy import stats
# Import Cross Validation Score
from sklearn.model_selection import cross_val_score
# Import split tool
from sklearn.model_selection import train_test_split
# Import GridSearch
from sklearn.model_selection import GridSearchCV
# Import Student t-test
from scipy.stats import ttest_rel
# Import DummyClassifier
from sklearn.dummy import DummyClassifier
# Import the Random Forest package
from sklearn.ensemble import RandomForestClassifier
# Import the Support Vector Machines package
from sklearn import svm
# Load the training set; header=0 marks row 0 of the CSV as the header row
train_df = pd.read_csv('train.csv', header=0)
# Print the column summary - note that Age, Cabin and Embarked have missing values
train_df.info()
# Examine Cabin data
# CabinLetter is the first character of the Cabin string
train_df['CabinLetter'] = train_df['Cabin'].str.get(0)
# Keep only the rows that have a cabin letter, and order them by letter for plotting
plot_df = train_df[train_df['CabinLetter'].notnull()]
cabin_sorted_df = plot_df.sort_values(by='CabinLetter')
# Visualisation: one row of three panels
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
# Panel 1: passenger count per CabinLetter
sns.countplot(x='CabinLetter', data=cabin_sorted_df, ax=axis1)
# Panel 2: mean of Survived for each CabinLetter value
cabin_survival = plot_df[['CabinLetter', 'Survived']]
cabin_perc = cabin_survival.groupby('CabinLetter', as_index=False).mean()
sns.barplot(x='CabinLetter', y='Survived', data=cabin_perc.sort_values(by='CabinLetter'), ax=axis2)
# Panel 3: passenger count per Passenger Class, split by CabinLetter
sns.countplot(x='Pclass', hue='CabinLetter', data=cabin_sorted_df, ax=axis3)
# Some variation in survival rates by CabinLetter, but hard to deduce the missing values e.g. from Passenger Class
# For Passenger Class 1, the CabinLetter can be anything from A-E
# So the helper column is removed again
train_df = train_df.drop('CabinLetter', axis=1)
train_df.head(5)
# Examine Sex data
# Visualisation: one row of three panels
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
# Panel 1: passenger count per Sex
sns.countplot(x='Sex', data=train_df, ax=axis1)
# Panel 2: passenger count per Survived value (1 first, then 0), split by Sex
sns.countplot(x='Survived', hue='Sex', data=train_df, order=[1, 0], ax=axis2)
# Panel 3: mean of Survived for each Sex value
sex_survival = train_df[['Sex', 'Survived']]
sex_perc = sex_survival.groupby('Sex', as_index=False).mean()
sns.barplot(x='Sex', y='Survived', data=sex_perc, order=['female', 'male'], ax=axis3)
# Sex makes a significant difference to mean Survival rate, so keep the data
# Machine Learning needs numbers: the new Gender column encodes female as 0.0 and male as 1.0
gender_codes = {'female': 0, 'male': 1}
train_df['Gender'] = train_df['Sex'].map(gender_codes).astype(float)
train_df.head(5)
# Examine Embarked data
# Only rows with a known port of embarkation are plotted
plot_df = train_df[train_df['Embarked'].notnull()]
# Visualisation: one row of three panels
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
# Panel 1: passenger count per port of embarkation
sns.countplot(x='Embarked', data=plot_df, ax=axis1)
# Panel 2: passenger count per Survived value (1 first, then 0), split by port
sns.countplot(x='Survived', hue='Embarked', data=plot_df, order=[1, 0], ax=axis2)
# Panel 3: mean of Survived for each port, shown in the order S, C, Q
embark_survival = plot_df[['Embarked', 'Survived']]
embark_perc = embark_survival.groupby('Embarked', as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc, order=['S', 'C', 'Q'], ax=axis3)
# Numeric representation for Embarked: missing values are first replaced with S
# (by far the most common port), then S/C/Q are mapped to 0.0/0.5/1.0
port_codes = {'S': 0.0, 'C': 0.5, 'Q': 1.0}
embarked_filled = train_df['Embarked'].fillna('S')
train_df['Port'] = embarked_filled.map(port_codes).astype(float)
train_df.head(5)
# Any relationship between fare and survival?
# Fares in the range 0-100 are grouped into bins of width fare_bin_size
fare_bin_size = 5
# Floor division keeps the bin count an integer; '/' produces a float on
# Python 3, which is not a valid number of bins
fare_bin_total = 100 // fare_bin_size
# Number of passengers in each fare bin
bin_count, bin_edges, binnumber = stats.binned_statistic(train_df['Fare'], train_df['Fare'], statistic='count', bins=fare_bin_total, range=(0, 100))
# Mean of Survived in each fare bin
bin_means, bin_edges, binnumber = stats.binned_statistic(train_df['Fare'], train_df['Survived'], statistic='mean', bins=fare_bin_total, range=(0, 100))
# Each bin is labelled by its upper edge
fare_survived_df = pd.DataFrame({'FareBin': bin_edges[1:], 'FareCount': bin_count, 'MeanSurv': bin_means})
# Visualisation: count per bin on the left, mean survival per bin on the right
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(20, 5))
axes = sns.barplot(x='FareBin', y='FareCount', data=fare_survived_df, ax=axis1)
axes.set(xlabel='Fare', ylabel='count')
axes = sns.barplot(x='FareBin', y='MeanSurv', data=fare_survived_df, ax=axis2)
axes.set(xlabel='Fare', ylabel='mean(Survived)')
# Any relationship between age and survival?
# Ages in the range 0-100 are grouped into bins of width age_bin_size
age_bin_size = 5
# Floor division keeps the bin count an integer; '/' produces a float on
# Python 3, which is not a valid number of bins
age_bin_total = 100 // age_bin_size
# Rows with a missing Age belong to no bin, so they are left out before binning
# NOTE(review): recent SciPy releases appear to reject non-finite values here - confirm
known_age_df = train_df[train_df['Age'].notnull()]
# Number of passengers in each age bin
bin_count, bin_edges, binnumber = stats.binned_statistic(known_age_df['Age'], known_age_df['Age'], statistic='count', bins=age_bin_total, range=(0, 100))
# Mean of Survived in each age bin
bin_means, bin_edges, binnumber = stats.binned_statistic(known_age_df['Age'], known_age_df['Survived'], statistic='mean', bins=age_bin_total, range=(0, 100))
# Each bin is labelled by its upper edge
age_survived_df = pd.DataFrame({'AgeBin': bin_edges[1:], 'AgeCount': bin_count, 'MeanSurv': bin_means})
# Visualisation: count per bin on the left, mean survival per bin on the right
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(20, 5))
axes = sns.barplot(x='AgeBin', y='AgeCount', data=age_survived_df, ax=axis1)
axes.set(xlabel='Age', ylabel='count')
axes = sns.barplot(x='AgeBin', y='MeanSurv', data=age_survived_df, ax=axis2)
axes.set(xlabel='Age', ylabel='mean(Survived)')
# Age appears to make a difference to mean Survival rate, so keep the data
# Process it for Machine Learning
# Examine the passengers with missing age details
agena_df = train_df[train_df['Age'].isnull()]
agena_df.head(5)
# Any relationship between age and passenger class?
# plot_df previously held the rows with a known Embarked value; the age
# comparisons need the rows with a known Age instead, so rebind it here
plot_df = train_df[train_df['Age'].notnull()]
# Distribution of Pclass values among passengers, where age is known
count_Pclass = plot_df[['Pclass', 'PassengerId']].groupby(['Pclass'], as_index=False).count()
# Distribution of Pclass values among passengers, where age is unknown
count_agena_Pclass = agena_df[['Pclass', 'PassengerId']].groupby(['Pclass'], as_index=False).count()
# Mean and median age of passengers by Pclass value
mean_age = plot_df[['Pclass', 'Age']].groupby(['Pclass'], as_index=False).mean()
median_age = plot_df[['Pclass', 'Age']].groupby(['Pclass'], as_index=False).median()
# Visualisation: known-age counts, unknown-age counts, mean age, median age
fig, (axis1, axis2, axis3, axis4) = plt.subplots(1, 4, figsize=(15, 5))
axes = sns.barplot(x='Pclass', y='PassengerId', data=count_Pclass, ax=axis1)
axes.set(ylabel='count(Age known)')
axes = sns.barplot(x='Pclass', y='PassengerId', data=count_agena_Pclass, ax=axis2)
axes.set(ylabel='count(Age unknown)')
axes = sns.barplot(x='Pclass', y='Age', data=mean_age, ax=axis3)
axes.set(ylabel='mean(Age)')
axes = sns.barplot(x='Pclass', y='Age', data=median_age, ax=axis4)
axes.set(ylabel='median(Age)')
# Any relationship between age and fare paid?
# Scatter only the passengers whose age is known
plot_df = train_df[train_df['Age'].notnull()]
# x and y are passed by keyword: current seaborn no longer accepts them positionally
# fit_reg=False draws the points without a regression line
axes = sns.lmplot(x='Fare', y='Age', data=plot_df, fit_reg=False)
axes.set(xlabel="Fare", ylabel="Age")
# Any relationship between age and sex?
# Use the passengers whose age is known
plot_df = train_df[train_df['Age'].notnull()]
# Visualisation: mean age per Sex on the left, median age per Sex on the right
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 5))
mean_age = plot_df[['Sex', 'Age']].groupby(['Sex'], as_index=False).mean()
median_age = plot_df[['Sex', 'Age']].groupby(['Sex'], as_index=False).median()
axes = sns.barplot(x='Sex', y='Age', data=mean_age, ax=axis1)
axes.set(ylabel='mean(Age)')
axes = sns.barplot(x='Sex', y='Age', data=median_age, ax=axis2)
axes.set(ylabel='median(Age)')
# Any relationship between age and sibling/spouse?
# The "age known" statistics must come from rows with a non-null Age
plot_df = train_df[train_df['Age'].notnull()]
# Distribution of SibSp values among passengers, where age is known
count_sibsp = plot_df[['SibSp', 'PassengerId']].groupby(['SibSp'], as_index=False).count()
# Distribution of SibSp values among passengers, where age is unknown
count_agena_sibsp = agena_df[['SibSp', 'PassengerId']].groupby(['SibSp'], as_index=False).count()
# Mean and median age of passengers by SibSp value
mean_age = plot_df[['SibSp', 'Age']].groupby(['SibSp'], as_index=False).mean()
median_age = plot_df[['SibSp', 'Age']].groupby(['SibSp'], as_index=False).median()
# Visualisation: known-age counts, unknown-age counts, mean age, median age
fig, (axis1, axis2, axis3, axis4) = plt.subplots(1, 4, figsize=(15, 5))
axes = sns.barplot(x='SibSp', y='PassengerId', data=count_sibsp, ax=axis1)
axes.set(ylabel='count(Age known)')
axes = sns.barplot(x='SibSp', y='PassengerId', data=count_agena_sibsp, ax=axis2)
axes.set(ylabel='count(Age unknown)')
axes = sns.barplot(x='SibSp', y='Age', data=mean_age, ax=axis3)
axes.set(ylabel='mean(Age)')
axes = sns.barplot(x='SibSp', y='Age', data=median_age, ax=axis4)
axes.set(ylabel='median(Age)')
# Any relationship between age and parent/child?
# The "age known" statistics must come from rows with a non-null Age
plot_df = train_df[train_df['Age'].notnull()]
# Distribution of Parch values among passengers, where age is known
count_Parch = plot_df[['Parch', 'PassengerId']].groupby(['Parch'], as_index=False).count()
# Distribution of Parch values among passengers, where age is unknown
count_agena_Parch = agena_df[['Parch', 'PassengerId']].groupby(['Parch'], as_index=False).count()
# Mean and median age of passengers by Parch value
mean_age = plot_df[['Parch', 'Age']].groupby(['Parch'], as_index=False).mean()
median_age = plot_df[['Parch', 'Age']].groupby(['Parch'], as_index=False).median()
# Visualisation: known-age counts, unknown-age counts, mean age, median age
fig, (axis1, axis2, axis3, axis4) = plt.subplots(1, 4, figsize=(15, 5))
axes = sns.barplot(x='Parch', y='PassengerId', data=count_Parch, ax=axis1)
axes.set(ylabel='count(Age known)')
axes = sns.barplot(x='Parch', y='PassengerId', data=count_agena_Parch, ax=axis2)
axes.set(ylabel='count(Age unknown)')
axes = sns.barplot(x='Parch', y='Age', data=mean_age, ax=axis3)
axes.set(ylabel='mean(Age)')
axes = sns.barplot(x='Parch', y='Age', data=median_age, ax=axis4)
axes.set(ylabel='median(Age)')
# Missing ages are set to the median age of the passengers that share the
# same Gender (0 or 1) and passenger class (1, 2 or 3)
# median_ages[i, j] holds the median for Gender == i and Pclass == j + 1
median_ages = np.zeros((2, 3))
for i in range(0, 2):
    for j in range(0, 3):
        group_rows = (train_df['Gender'] == i) & (train_df['Pclass'] == j + 1)
        median_ages[i, j] = train_df[group_rows]['Age'].dropna().median()
# AgeFill starts as a copy of Age; only the rows with a null Age are overwritten
train_df['AgeFill'] = train_df['Age']
for i in range(0, 2):
    for j in range(0, 3):
        missing_in_group = (train_df.Age.isnull()) & (train_df.Gender == i) & (train_df.Pclass == j + 1)
        train_df.loc[missing_in_group, 'AgeFill'] = median_ages[i, j]
# Scale by dividing by 100 (ages were binned over 0-100 above, giving roughly 0 to 1)
train_df['AgeFill'] = train_df['AgeFill'] / 100
# Show some of the rows whose age was filled in
train_df[train_df['Age'].isnull()].head(5)
# Examine SibSp data
# Visualisation: one row of three panels
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
# Panel 1: passenger count per SibSp value
sns.countplot(x='SibSp', data=train_df, ax=axis1)
# Panel 2: passenger count per Survived value (1 first, then 0), split by SibSp
sns.countplot(x='Survived', hue='SibSp', data=train_df, order=[1, 0], ax=axis2)
# Panel 3: mean of Survived for each SibSp value
sibsp_survival = train_df[['SibSp', 'Survived']]
sibsp_mean = sibsp_survival.groupby('SibSp', as_index=False).mean()
sns.barplot(x='SibSp', y='Survived', data=sibsp_mean, ax=axis3)
# Examine Parch data
# Visualisation: one row of three panels
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
# Panel 1: passenger count per Parch value
sns.countplot(x='Parch', data=train_df, ax=axis1)
# Panel 2: passenger count per Survived value (1 first, then 0), split by Parch
sns.countplot(x='Survived', hue='Parch', data=train_df, order=[1, 0], ax=axis2)
# Panel 3: mean of Survived for each Parch value
parch_survival = train_df[['Parch', 'Survived']]
parch_mean = parch_survival.groupby('Parch', as_index=False).mean()
sns.barplot(x='Parch', y='Survived', data=parch_mean, ax=axis3)
# Both SibSp and Parch appear to make a difference to a passenger's chance of surviving - for non-zero values the chances are higher
# However, there are not very many non-zero values of each, so the two are summed into a total family size column
family_size = train_df['SibSp'] + train_df['Parch']
train_df['FamilySize'] = family_size
train_df.head(10)
# Examine FamilySize data
# Visualisation: one row of three panels
fig, (axis1, axis2, axis3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
# Panel 1: passenger count per FamilySize value
sns.countplot(x='FamilySize', data=train_df, ax=axis1)
# Panel 2: passenger count per Survived value (1 first, then 0), split by FamilySize
sns.countplot(x='Survived', hue='FamilySize', data=train_df, order=[1, 0], ax=axis2)
# Panel 3: mean of Survived for each FamilySize value
familysize_survival = train_df[['FamilySize', 'Survived']]
familysize_mean = familysize_survival.groupby('FamilySize', as_index=False).mean()
sns.barplot(x='FamilySize', y='Survived', data=familysize_mean, ax=axis3)
# Scale SibSp, Parch, FamilySize and Pclass
# Each scaled copy is stored under the source column name plus an 'S' suffix:
# the family counts are divided by 10 and the passenger class by 3
for source_column, divisor in (('SibSp', 10), ('Parch', 10), ('FamilySize', 10), ('Pclass', 3)):
    train_df[source_column + 'S'] = train_df[source_column].astype(float) / divisor
train_df
# Drop unused columns - this includes those that have been replaced
unused_columns = ['Age', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'PassengerId', 'SibSp', 'Parch', 'FamilySize', 'Pclass']
train_df = train_df.drop(unused_columns, axis=1)
train_df.info()
# TEST DATA
# Load the test file into a dataframe; row 0 is the header row
test_df = pd.read_csv('test.csv', header=0)
# Repeat the data processing of the training data with the test data, so that the columns are the same
# Numeric representation for Sex: female -> 0.0, male -> 1.0
gender_codes = {'female': 0, 'male': 1}
test_df['Gender'] = test_df['Sex'].map(gender_codes).astype(float)
# Numeric representation for Embarked port: missing values become S, then S/C/Q -> 0.0/0.5/1.0
port_codes = {'S': 0.0, 'C': 0.5, 'Q': 1.0}
test_embarked_filled = test_df['Embarked'].fillna('S')
test_df['Port'] = test_embarked_filled.map(port_codes).astype(float)
# Missing ages are set to the median age of the test passengers that share
# the same Gender (0 or 1) and passenger class (1, 2 or 3)
# median_ages[i, j] holds the median for Gender == i and Pclass == j + 1
# NOTE(review): these medians are recomputed from the test set rather than
# reused from the training set - confirm that this is intended
median_ages = np.zeros((2, 3))
for i in range(0, 2):
    for j in range(0, 3):
        group_rows = (test_df['Gender'] == i) & (test_df['Pclass'] == j + 1)
        median_ages[i, j] = test_df[group_rows]['Age'].dropna().median()
# AgeFill starts as a copy of Age; only the rows with a null Age are overwritten
test_df['AgeFill'] = test_df['Age']
for i in range(0, 2):
    for j in range(0, 3):
        missing_in_group = (test_df.Age.isnull()) & (test_df.Gender == i) & (test_df.Pclass == j + 1)
        test_df.loc[missing_in_group, 'AgeFill'] = median_ages[i, j]
# Scale by dividing by 100, matching the training data
test_df['AgeFill'] = test_df['AgeFill'] / 100
# Add FamilySize column
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch']
# All the missing Fares -> assume median of their respective class
if test_df.Fare.isnull().any():
    # median_fare[f] holds the median Fare for Pclass == f + 1
    median_fare = np.zeros(3)
    for f in range(0, 3):
        median_fare[f] = test_df[test_df.Pclass == f + 1]['Fare'].dropna().median()
    for f in range(0, 3):
        missing_in_class = (test_df.Fare.isnull()) & (test_df.Pclass == f + 1)
        test_df.loc[missing_in_class, 'Fare'] = median_fare[f]
# Keep the test data's PassengerIds, because the column is dropped below
ids = test_df['PassengerId'].values
# Scaling: the same divisors as for the training data, stored under an 'S' suffix
for source_column, divisor in (('SibSp', 10), ('Parch', 10), ('FamilySize', 10), ('Pclass', 3)):
    test_df[source_column + 'S'] = test_df[source_column].astype(float) / divisor
# Remove the unused/replaced columns
unused_columns = ['Age', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'PassengerId', 'SibSp', 'Parch', 'FamilySize', 'Pclass']
test_df = test_df.drop(unused_columns, axis=1)
# The data is now ready to go. So lets fit to the train, then predict to the test!
# Both dataframes are converted to numpy arrays for the classifiers
train_data = train_df.values
test_data = test_df.values
test_df.info()
# Control classifier - always chooses the most frequent class
control = DummyClassifier(strategy='most_frequent')
# Column 0 of train_data is the target; the remaining columns are the features
control_features = train_data[0::, 1::]
control_target = train_data[0::, 0]
# 10-fold cross-validation, run once per measure
# (scikit-learn documents integer cv as stratified k-fold for classifiers)
control_cv_scores = pd.DataFrame({
    measure: cross_val_score(control, control_features, y=control_target, cv=10, scoring=measure)
    for measure in ("accuracy", "precision", "recall", "f1")
})
print('Control Classifier - Cross-validated')
# Save and display all the measure scores for each fold
control_cv_scores.to_csv("control_cv_scores.csv")
control_cv_scores
# Mean values of measure scores
control_cv_scores.mean()
# Standard deviation values of measure scores
control_cv_scores.std()
# Grid search for Random Forest parameter optimisation
# Candidates: three forest sizes combined with two max_features settings
params = [{'n_estimators': [100, 200, 500], 'max_features': ['sqrt', None]}]
rf = RandomForestClassifier()
# Each candidate is scored by f1 over 5 cross-validation folds
grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, scoring='f1', verbose=3)
# Column 0 of train_data is the target; the remaining columns are the features
grid.fit(train_data[:, 1:], train_data[:, 0])
print(grid.best_params_)
# Random Forest
forest = RandomForestClassifier(n_estimators=100, max_features=None)
# Column 0 of train_data is the target; the remaining columns are the features
forest = forest.fit(train_data[0::, 1::], train_data[0::, 0])
print('Random Forest - Trained')
# Predict the test set and cast the labels to int for the output file
output = forest.predict(test_data).astype(int)
# The with-block closes the file even if writing fails part-way
with open("randomforest.csv", "w") as predictions_file:
    predictions_writer = csv.writer(predictions_file)
    predictions_writer.writerow(["PassengerId", "Survived"])
    # One row per test passenger: its id and the predicted label
    predictions_writer.writerows(zip(ids, output))
print('Random Forest - Predicted')
# 10-fold cross-validation of the Random Forest, run once per measure
# (scikit-learn documents integer cv as stratified k-fold for classifiers)
# Column 0 of train_data is the target; the remaining columns are the features
forest_features = train_data[:, 1:]
forest_target = train_data[:, 0]
forest_cv_scores = pd.DataFrame({
    measure: cross_val_score(forest, forest_features, y=forest_target, cv=10, scoring=measure)
    for measure in ("accuracy", "precision", "recall", "f1")
})
print('Random Forest - Cross-validated')
# Save and display all the measure scores for each fold
forest_cv_scores.to_csv("forest_cv_scores.csv")
forest_cv_scores
# Mean values of measure scores
forest_cv_scores.mean()
# Standard Deviations of measure scores
forest_cv_scores.std()
# 95% interval of a normal distribution centred on the mean f1 score,
# with the standard deviation of the fold scores as its scale
forest_f1_scores = forest_cv_scores['f1']
stats.norm.interval(0.95, loc=forest_f1_scores.mean(), scale=forest_f1_scores.std())
# Grid search for Support Vector Machine parameter optimisation
# Every C value is combined with every gamma value
c_list = [8.845, 8.847, 8.85, 8.852, 8.855]
gamma_list = [1.5135, 1.5137, 1.514, 1.5142, 1.5145]
params = [{'C': c_list, 'gamma': gamma_list}]
svm_candidate = svm.SVC(kernel='rbf', cache_size=500, class_weight='balanced')
# Each candidate is scored by f1 over 3 cross-validation folds
grid = GridSearchCV(estimator=svm_candidate, param_grid=params, cv=3, scoring='f1', verbose=3)
# Column 0 of train_data is the target; the remaining columns are the features
grid.fit(train_data[:, 1:], train_data[:, 0])
print(grid.best_params_)
# Save and display the full table of search results
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.to_csv("svmachine_gridsearch.csv")
grid_df
# Support Vector Machine
svmachine = svm.SVC(C=8.847, gamma=1.5135, kernel='rbf', cache_size=500, class_weight='balanced')
# Column 0 of train_data is the target; the remaining columns are the features
svmachine = svmachine.fit(train_data[0::, 1::], train_data[0::, 0])
print('Support Vector Machine - Trained')
# Predict the test set and cast the labels to int for the output file
output = svmachine.predict(test_data).astype(int)
# The with-block closes the file even if writing fails part-way
with open("svmachine.csv", "w") as predictions_file:
    predictions_writer = csv.writer(predictions_file)
    predictions_writer.writerow(["PassengerId", "Survived"])
    # One row per test passenger: its id and the predicted label
    predictions_writer.writerows(zip(ids, output))
print('Support Vector Machine - Predicted')
# 10-fold cross-validation of the Support Vector Machine, run once per measure
# (scikit-learn documents integer cv as stratified k-fold for classifiers)
# Column 0 of train_data is the target; the remaining columns are the features
svmachine_features = train_data[:, 1:]
svmachine_target = train_data[:, 0]
svmachine_cv_scores = pd.DataFrame({
    measure: cross_val_score(svmachine, svmachine_features, y=svmachine_target, cv=10, scoring=measure)
    for measure in ("accuracy", "precision", "recall", "f1")
})
print('Support Vector Machine - Cross-validated')
# Save and display all the measure scores for each fold
svmachine_cv_scores.to_csv("svmachine_cv_scores.csv")
svmachine_cv_scores
# Mean values of measure scores
svmachine_cv_scores.mean()
# Standard Deviations of measure scores
svmachine_cv_scores.std()
# 95% interval of a normal distribution centred on the mean f1 score,
# with the standard deviation of the fold scores as its scale
svmachine_f1_scores = svmachine_cv_scores['f1']
stats.norm.interval(0.95, loc=svmachine_f1_scores.mean(), scale=svmachine_f1_scores.std())
# Student paired t-test on the per-fold f1 scores of the two classifiers
forest_f1 = forest_cv_scores['f1']
svmachine_f1 = svmachine_cv_scores['f1']
t_stat, p_val = ttest_rel(forest_f1, svmachine_f1)
# Report the test statistic and its p-value
print(t_stat, p_val)