GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/misc/dropout_MLP_torch.ipynb
Kernel: Python 3

Open In Colab

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(seed=1)
import math
import torch
from torch import nn
from torch.nn import functional as F

!mkdir figures  # for saving plots
!wget https://raw.githubusercontent.com/d2l-ai/d2l-en/master/d2l/torch.py -q -O d2l.py
import d2l

Add a dropout layer by hand to an MLP
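With dropout probability p, each activation h is independently zeroed out, and the surviving activations are rescaled by 1/(1 - p) ("inverted dropout"). This keeps the expected activation unchanged, E[h'] = p * 0 + (1 - p) * h / (1 - p) = h, so no rescaling is needed at test time.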

def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1
    # In this case, all elements are dropped out
    if dropout == 1:
        return torch.zeros_like(X)
    # In this case, all elements are kept
    if dropout == 0:
        return X
    mask = (torch.Tensor(X.shape).uniform_(0, 1) > dropout).float()
    return mask * X / (1.0 - dropout)
# quick test
torch.manual_seed(0)
X = torch.arange(16, dtype=torch.float32).reshape((2, 8))
print(X)
print(dropout_layer(X, 0.0))
print(dropout_layer(X, 0.5))
print(dropout_layer(X, 1.0))
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.,  2.,  0.,  0.,  0., 10.,  0., 14.],
        [ 0., 18.,  0.,  0.,  0.,  0.,  0., 30.]])
tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
# A common trend is to set a lower dropout probability closer to the input layer
class Net(nn.Module):
    def __init__(
        self, num_inputs, num_outputs, num_hiddens1, num_hiddens2,
        is_training=True, dropout1=0.2, dropout2=0.5
    ):
        super(Net, self).__init__()
        self.dropout1 = dropout1
        self.dropout2 = dropout2
        self.num_inputs = num_inputs
        self.training = is_training
        self.lin1 = nn.Linear(num_inputs, num_hiddens1)
        self.lin2 = nn.Linear(num_hiddens1, num_hiddens2)
        self.lin3 = nn.Linear(num_hiddens2, num_outputs)
        self.relu = nn.ReLU()

    def forward(self, X):
        H1 = self.relu(self.lin1(X.reshape((-1, self.num_inputs))))
        # Use dropout only when training the model
        if self.training:
            # Add a dropout layer after the first fully connected layer
            H1 = dropout_layer(H1, self.dropout1)
        H2 = self.relu(self.lin2(H1))
        if self.training:
            # Add a dropout layer after the second fully connected layer
            H2 = dropout_layer(H2, self.dropout2)
        out = self.lin3(H2)
        return out
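A quick shape check on a dummy batch (illustrative): the network should map flattened 28x28 images to 10 logits per example.

# Sanity check on a dummy batch of two flattened 28x28 images
net_check = Net(num_inputs=784, num_outputs=10, num_hiddens1=256, num_hiddens2=256)
X_check = torch.randn(2, 784)
print(net_check(X_check).shape)  # torch.Size([2, 10])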

Fit to FashionMNIST

Uses the d2l.load_data_fashion_mnist function.

train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=256)
[Output: downloads the four Fashion-MNIST archives (train/test images and labels) from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/ and extracts them to ../data/FashionMNIST/raw.]
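For reference, d2l.load_data_fashion_mnist is essentially a thin wrapper around torchvision's FashionMNIST dataset and DataLoader. A minimal sketch (the real helper also supports image resizing and uses multiple workers; the function name here is illustrative):

import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

def load_fashion_mnist_sketch(batch_size, root="../data"):
    # Convert PIL images to float32 tensors in [0, 1]
    trans = transforms.ToTensor()
    train = torchvision.datasets.FashionMNIST(root=root, train=True, transform=trans, download=True)
    test = torchvision.datasets.FashionMNIST(root=root, train=False, transform=trans, download=True)
    return (DataLoader(train, batch_size, shuffle=True),
            DataLoader(test, batch_size, shuffle=False))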

Fit the model using SGD, via the d2l.train_ch3 function.

torch.manual_seed(0)
# We pick a wide model to cause overfitting without dropout
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256
net = Net(num_inputs, num_outputs, num_hiddens1, num_hiddens2, dropout1=0.5, dropout2=0.5)
loss = nn.CrossEntropyLoss()
lr = 0.5
trainer = torch.optim.SGD(net.parameters(), lr=lr)
num_epochs = 10
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
[Figure: training loss and train/test accuracy over 10 epochs with dropout enabled.]
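Under the hood, d2l.train_ch3 runs a standard classification training loop (plus live plotting of the metrics shown above). A minimal sketch of one epoch, assuming net, loss, and trainer as defined above:

def train_epoch_sketch(net, train_iter, loss, trainer):
    # One pass over the training data; returns mean loss and accuracy
    total_loss, total_correct, n = 0.0, 0, 0
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y)
        trainer.zero_grad()
        l.backward()
        trainer.step()
        total_loss += float(l) * y.numel()
        total_correct += int((y_hat.argmax(dim=1) == y).sum())
        n += y.numel()
    return total_loss / n, total_correct / n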

When we turn dropout off, we see a slightly larger gap between train and test accuracy, a sign of overfitting.

torch.manual_seed(0)
net = Net(num_inputs, num_outputs, num_hiddens1, num_hiddens2, dropout1=0.0, dropout2=0.0)
loss = nn.CrossEntropyLoss()
trainer = torch.optim.SGD(net.parameters(), lr=lr)
num_epochs = 10
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
[Figure: training loss and train/test accuracy over 10 epochs with dropout disabled.]

Dropout using the built-in PyTorch layer

dropout1 = 0.5
dropout2 = 0.5
net = nn.Sequential(
    nn.Flatten(),
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    # Add a dropout layer after the first fully connected layer
    nn.Dropout(dropout1),
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    # Add a dropout layer after the second fully connected layer
    nn.Dropout(dropout2),
    nn.Linear(num_hiddens2, num_outputs),
)

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

torch.manual_seed(0)
net.apply(init_weights);
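One caveat: nn.Dropout only drops activations while the module is in training mode; after calling .eval() it acts as the identity. A quick illustration:

m = nn.Dropout(0.5)
x = torch.ones(1, 8)
m.train()
print(m(x))  # about half the entries zeroed, survivors scaled to 2
m.eval()
print(m(x))  # identity: all ones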
trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
[Figure: training loss and train/test accuracy over 10 epochs using nn.Dropout.]

Visualize some predictions

def display_predictions(net, test_iter, n=6):
    # Extract first batch from iterator
    for X, y in test_iter:
        break
    # Get labels
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(d2l.argmax(net(X), axis=1))
    # Plot
    titles = [true + "\n" + pred for true, pred in zip(trues, preds)]
    d2l.show_images(d2l.reshape(X[0:n], (n, 28, 28)), 1, n, titles=titles[0:n])
# d2l.predict_ch3(net, test_iter)
display_predictions(net, test_iter)
[Figure: the first six test images with true (top) and predicted (bottom) labels.]
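Because this nn.Sequential model contains nn.Dropout layers, predictions should be generated in evaluation mode so dropout is disabled, e.g.:

net.eval()  # disable dropout for deterministic predictions
display_predictions(net, test_iter)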