GitHub Repository: AllenDowney/bayesian-analysis-recipes
Path: blob/master/incubator/multiclass-classification-neural-network.ipynb
Kernel: bayesian
%load_ext autoreload
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pymc3 as pm
import numpy as np
import theano.tensor as tt
import theano
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from pymc3.backends import HDF5, text

%autoreload 2
theano.__version__

Introduction

I'd like to try doing multiclass classification with a deep neural network. The architecture is a feed-forward network with two hidden layers between the input and output; the number of units per hidden layer, n_hidden, is a hyperparameter.
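
To make the architecture concrete before writing it as a PyMC3 model, here is a minimal numpy sketch of the forward pass; the dimensions and names below are made up purely for illustration.

def forward_pass_sketch(X, w1, w2, w_out):
    """Toy forward pass: two tanh hidden layers followed by a softmax output."""
    h1 = np.tanh(X @ w1)      # (n_obs, n_hidden)
    h2 = np.tanh(h1 @ w2)     # (n_obs, n_hidden)
    logits = h2 @ w_out       # (n_obs, n_classes)
    exp = np.exp(logits - logits.max(axis=1, keepdims=True))
    return exp / exp.sum(axis=1, keepdims=True)  # each row sums to 1

# Made-up dimensions, just to check that the shapes line up.
n_obs, n_feats, n_hidden, n_classes = 10, 4, 5, 3
probs = forward_pass_sketch(
    np.random.rand(n_obs, n_feats),
    np.random.randn(n_feats, n_hidden),
    np.random.randn(n_hidden, n_hidden),
    np.random.randn(n_hidden, n_classes),
)
probs.shape  # (10, 3)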

Model Specification

Because the neural network is the highlight here, I will do the model specification up front.

def make_nn(ann_input, ann_output, n_hidden):
    """
    Makes a feed-forward neural network with two hidden layers of n_hidden
    units each, for doing multi-class classification.

    Feed-forward networks are easy to define, so I have not relied on any
    other Deep Learning frameworks to define the neural network here.
    """
    init_1 = np.random.randn(ann_input.shape[1], n_hidden)
    init_2 = np.random.randn(n_hidden, n_hidden)
    init_out = np.random.randn(n_hidden, ann_output.shape[1])

    with pm.Model() as nn_model:
        # Define weights
        weights_1 = pm.Normal(
            "w_1", mu=0, sd=1, shape=(ann_input.shape[1], n_hidden), testval=init_1
        )
        weights_2 = pm.Normal(
            "w_2", mu=0, sd=1, shape=(n_hidden, n_hidden), testval=init_2
        )
        weights_out = pm.Normal(
            "w_out", mu=0, sd=1, shape=(n_hidden, ann_output.shape[1]), testval=init_out
        )

        # Define activations
        acts_1 = pm.Deterministic(
            "activations_1", tt.tanh(tt.dot(ann_input, weights_1))
        )
        acts_2 = pm.Deterministic(
            "activations_2", tt.tanh(tt.dot(acts_1, weights_2))
        )
        acts_out = pm.Deterministic(
            "activations_out", tt.nnet.softmax(tt.dot(acts_2, weights_out))
        )  # noqa

        # Define likelihood
        out = pm.Multinomial("likelihood", n=1, p=acts_out, observed=ann_output)

    return nn_model
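
Before touching the real data, a quick smoke test on tiny synthetic inputs (all shapes and names here are made up) checks that the function builds a model whose log-probability is finite at the test point:

# Smoke test on tiny synthetic data, not the forest-cover data.
n_obs, n_feats, n_classes = 50, 8, 3
X_toy = pm.floatX(np.random.rand(n_obs, n_feats))
labels = np.random.randint(0, n_classes, size=n_obs)
Y_toy = pm.floatX(np.eye(n_classes)[labels])  # one-hot targets

toy_model = make_nn(X_toy, Y_toy, n_hidden=5)
toy_model.check_test_point()  # all log-probabilities should be finite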

Preprocess Data

Basic Cleaning

Now, let's read in the dataset. There's a bunch of preprocessing that has to happen. I happened to have this code written for the IACS 2017 data science bootcamp and copied it over from there. It's commented out because it takes some time to execute, and once it has been run, it doesn't need to be run again.

# df = pd.read_csv('datasets/covtype.data', header=None)
# columns = [
#     'Elevation',
#     'Aspect',
#     'Slope',
#     'Horizontal_Distance_To_Hydrology',
#     'Vertical_Distance_To_Hydrology',
#     'Horizontal_Distance_To_Roadways',
#     'Hillshade_9am',
#     'Hillshade_Noon',
#     'Hillshade_3pm',
#     'Horizontal_Distance_To_Fire_Points'
# ]

# # Add in wilderness area data (binary)
# for i in range(1, 5):
#     columns.append('Wilderness_Area_{0}'.format(i))

# # Add in soil type data (binary)
# for i in range(1, 41):
#     columns.append('Soil_Type_{0}'.format(i))

# # Add in soil cover type
# columns.append('Cover_Type')
# df.columns = columns

# # Add in soil codes. These were downloaded from the UCI repository.
# soil_codes = pd.read_csv('datasets/climatic_geologic_zone.csv')
# soil_dict = soil_codes.set_index('soil_type').to_dict()

# # Add geologic and climatic zone code to soil type
# for i in range(1, 41):
#     df.loc[df['Soil_Type_{i}'.format(i=i)] == 1, 'Climatic_Zone'] = soil_dict['climatic_zone'][i]
#     df.loc[df['Soil_Type_{i}'.format(i=i)] == 1, 'Geologic_Zone'] = soil_dict['geologic_zone'][i]

# # Encode one-of-K for the geologic zone, climatic zone, and cover_type encodings.
# # This is important because the geologic and climatic zones aren't ordinal - they are strictly categorical.
# enc = OneHotEncoder()
# clm_zone_enc = enc.fit_transform(df['Climatic_Zone'].values.reshape(-1, 1)).toarray()
# geo_zone_enc = enc.fit_transform(df['Geologic_Zone'].values.reshape(-1, 1)).toarray()
# cov_type_enc = enc.fit_transform(df['Cover_Type'].values.reshape(-1, 1)).toarray()

# for i in range(clm_zone_enc.shape[1]):
#     df['Climatic_Zone_{i}'.format(i=i)] = clm_zone_enc[:, i]
# for i in range(geo_zone_enc.shape[1]):
#     df['Geologic_Zone_{i}'.format(i=i)] = geo_zone_enc[:, i]

# del df['Climatic_Zone']
# del df['Geologic_Zone']

# df.to_csv('datasets/covtype_preprocess.csv')
df = pd.read_csv("../datasets/covtype_preprocess.csv", index_col=0)
df.shape

Let's now make the X and Y matrices. We use patsy, which gives us a quick way to turn categorical variables into one-of-K columns.
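
As a toy illustration (made-up values), the "- 1" in a patsy formula drops the intercept so that every level of a categorical variable gets its own indicator column:

import patsy
import pandas as pd

toy = pd.DataFrame({"Cover_Type": ["1", "2", "2", "3"]})
patsy.dmatrix("Cover_Type - 1", data=toy, return_type="dataframe")
# columns: Cover_Type[1], Cover_Type[2], Cover_Type[3] -- one indicator per level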

df["Cover_Type"] = df["Cover_Type"].apply(lambda x: str(x))
output_col = "Cover_Type"
input_cols = [c for c in df.columns if c != output_col]
input_formula = "".join(c + " + " for c in input_cols)
input_formula = input_formula + "-1"

import patsy
from sklearn.preprocessing import scale, normalize

X = patsy.dmatrix(formula_like=input_formula, data=df, return_type="dataframe")
# X = normalize(X)
Y = patsy.dmatrix(formula_like="Cover_Type -1", data=df, return_type="dataframe")
print(X.shape, Y.shape)
Y.head()

Balance Classes

We will balance the classes by downsampling each of the seven cover types to 2747 rows, the size of the smallest class, so that the classes are evenly represented.

from sklearn.preprocessing import MinMaxScaler

downsampled_targets = []
for i in range(1, 7 + 1):
    # print(f'target[{i}]')
    target = Y[Y["Cover_Type[{i}]".format(i=i)] == 1]
    # print(len(target))
    downsampled_targets.append(target.sample(2747))

mms = MinMaxScaler()
X_tfm = pm.floatX(mms.fit_transform(X[input_cols]))

Y_downsampled = pd.concat(downsampled_targets)
Y_downsamp = pm.floatX(Y_downsampled)

X_downsampled = X_tfm[Y_downsampled.index]
X_downsamp = pm.floatX(X_downsampled)
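
As a quick sanity check on the downsampling (nothing new here, just verifying the arrays built above), every class should now contribute exactly 2747 rows, and the inputs and targets should line up:

print(Y_downsampled.sum(axis=0))  # each column should sum to 2747
assert X_downsamp.shape[0] == Y_downsamp.shape[0]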
X_downsamp.shape
from models.feedforward import ForestCoverModel

fcm = ForestCoverModel(n_hidden=20)
fcm.fit(X_downsamp, Y_downsamp)
X_downsampled.shape
X_downsamp.dtype
n_hidden = 20 # define the number of hidden units

Model Execution

We now make the model with n_hidden = 20 hidden units per layer.

model = make_nn(X_downsamp, Y_downsamp, n_hidden=n_hidden)
with model:
    # s = theano.shared(pm.floatX(1.1))
    # inference = pm.ADVI(cost_part_grad_scale=s)
    approx = pm.fit(
        500000,
        callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-1)],
    )
plt.plot(approx.hist)
plt.yscale("log")
with model:
    trace = approx.sample(1000)
# pm.traceplot(trace, varnames=['w_1'])
# pm.traceplot(trace, varnames=['w_2'])
# pm.traceplot(trace, varnames=['w_out'])
with model:
    samp_ppc = pm.sample_ppc(trace, samples=1000)

Results

# Mean of the posterior predictive one-hot draws, i.e. predicted class probabilities.
preds_proba = samp_ppc["likelihood"].mean(axis=0)
# Hard predictions: mark the highest-probability class in each row with a 1.
preds = (preds_proba == np.max(preds_proba, axis=1, keepdims=True)) * 1

plt.pcolor(preds)
plt.savefig("figures/class_predictions.png", dpi=600)
plt.pcolor(preds_proba)
plt.savefig("figures/class_probabilities.png", dpi=600)
plt.pcolor(samp_ppc["likelihood"].std(axis=0))
plt.savefig("figures/class_uncertainties.png", dpi=600)
from sklearn.metrics import classification_report

print(classification_report(Y_downsamp, preds))
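
The same predictions can also be scored in label space. As a cross-check (equivalent to the equality-with-max trick above, just via argmax), overall accuracy can be computed like this:

from sklearn.metrics import accuracy_score

y_true = np.argmax(np.asarray(Y_downsamp), axis=1)
y_pred = np.argmax(preds_proba, axis=1)
print(accuracy_score(y_true, y_pred))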

Compared to the logistic regression notebook, we have higher performance!

Fit sklearn-like estimator