
Automatic differentiation using PyTorch

We show how to do automatic differentiation using PyTorch.

import sklearn
import scipy
import scipy.optimize
import matplotlib.pyplot as plt
import itertools
import time
from functools import partial
import os
import numpy as np
from scipy.special import logsumexp

np.set_printoptions(precision=3)
import torch
import torch.nn as nn
import torchvision

print("torch version {}".format(torch.__version__))
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    print("current device {}".format(torch.cuda.current_device()))
else:
    print("Torch cannot find GPU")

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch version 1.8.0+cu101
Tesla P100-PCIE-16GB
current device 0

Example: binary logistic regression

The objective is the negative log likelihood (NLL) for binary logistic regression.
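In symbols, writing $\mu_n = \sigma(\mathbf{w}^\top \mathbf{x}_n)$ for the predicted probability that example $n$ belongs to class 1, the objective is

$$\mathrm{NLL}(\mathbf{w}) = -\frac{1}{N} \sum_{n=1}^{N} \left[ y_n \log \mu_n + (1 - y_n) \log(1 - \mu_n) \right].$$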

# Fit the model using sklearn
import sklearn.datasets
from sklearn.model_selection import train_test_split

iris = sklearn.datasets.load_iris()
X = iris["data"]
y = (iris["target"] == 2).astype(int)  # 1 if Iris-Virginica, else 0
N, D = X.shape  # 150, 4
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.linear_model import LogisticRegression

# We set C to a large number to turn off regularization.
# We don't fit the bias term to simplify the comparison below.
log_reg = LogisticRegression(solver="lbfgs", C=1e5, fit_intercept=False)
log_reg.fit(X_train, y_train)
w_mle_sklearn = np.ravel(log_reg.coef_)
print(w_mle_sklearn)
[-4.414 -9.111 6.539 12.686]

Computing gradients by hand
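For this objective the gradient has the standard closed form

$$\nabla_{\mathbf{w}} \mathrm{NLL}(\mathbf{w}) = \frac{1}{N} \sum_{n=1}^{N} (\mu_n - y_n)\,\mathbf{x}_n,$$

which is what NLL_grad implements below.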

# Binary cross entropy
def BCE_with_logits(logits, targets):
    N = logits.shape[0]
    logits = logits.reshape(N, 1)
    logits_plus = np.hstack([np.zeros((N, 1)), logits])  # e^0=1
    logits_minus = np.hstack([np.zeros((N, 1)), -logits])
    logp1 = -logsumexp(logits_minus, axis=1)
    logp0 = -logsumexp(logits_plus, axis=1)
    logprobs = logp1 * targets + logp0 * (1 - targets)
    return -np.sum(logprobs) / N


# Compute using numpy
def sigmoid(x):
    return 0.5 * (np.tanh(x / 2.0) + 1)


def predict_logit(weights, inputs):
    return np.dot(inputs, weights)  # Already vectorized


def predict_np(weights, inputs):
    return sigmoid(predict_logit(weights, inputs))


def NLL(weights, batch):
    X, y = batch
    logits = predict_logit(weights, X)
    return BCE_with_logits(logits, y)


def NLL_grad(weights, batch):
    X, y = batch
    N = X.shape[0]
    mu = predict_np(weights, X)
    g = np.sum(np.dot(np.diag(mu - y), X), axis=0) / N
    return g
w_np = w_mle_sklearn
y_pred = predict_np(w_np, X_test)
loss_np = NLL(w_np, (X_test, y_test))
grad_np = NLL_grad(w_np, (X_test, y_test))
print("params {}".format(w_np))
# print("pred {}".format(y_pred))
print("loss {}".format(loss_np))
print("grad {}".format(grad_np))
params [-4.414 -9.111 6.539 12.686]
loss 0.1182400709961879
grad [-0.235 -0.122 -0.198 -0.064]
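As a sanity check, we can compare the manual gradient against a finite-difference approximation. A minimal sketch using scipy.optimize.check_grad (this check is not in the original notebook):

from scipy.optimize import check_grad

# check_grad returns the 2-norm of the difference between our analytic
# gradient and a finite-difference approximation; it should be close to zero.
err = check_grad(NLL, NLL_grad, w_np, (X_test, y_test))
print("finite-difference gradient error {}".format(err))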

PyTorch code

To compute the gradient using torch, we proceed as follows.

  • declare all the variables that you want to take derivatives with respect to, using the requires_grad=True argument.

  • define the (scalar-output) objective function you want to differentiate in terms of these variables, and evaluate it at a point. This generates a computation graph and stores the intermediate tensors.

  • call objective.backward() to trigger backpropagation (chain rule) on this graph.

  • extract the gradients from each variable using the variable.grad field. (These will be torch tensors.)
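First, a minimal scalar sketch of these four steps (an illustrative addition, not part of the original notebook):

x = torch.tensor(2.0, requires_grad=True)  # 1. declare the variable
f = x**2 + 3 * x                           # 2. evaluate; this builds the graph
f.backward()                               # 3. backpropagate
print(x.grad)                              # 4. read off df/dx = 2x + 3 = 7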

See the logistic regression example below.

# data. By default, numpy uses double but torch uses float
X_train_t = torch.tensor(X_train, dtype=torch.float)
y_train_t = torch.tensor(y_train, dtype=torch.float)
X_test_t = torch.tensor(X_test, dtype=torch.float)
y_test_t = torch.tensor(y_test, dtype=torch.float)
# parameters
W = np.reshape(w_mle_sklearn, [D, 1])  # convert 1d vector to 2d matrix
w_torch = torch.tensor(W, requires_grad=True, dtype=torch.float)
# w_torch.requires_grad_()


# binary logistic regression in one line of Pytorch
def predict(X, w):
    y_pred = torch.sigmoid(torch.matmul(X, w))[:, 0]
    return y_pred


y_pred = predict(X_test_t, w_torch)  # vector of N probabilities

# loss function is average NLL
criterion = torch.nn.BCELoss(reduction="mean")
loss_torch = criterion(y_pred, y_test_t)
print(loss_torch)

# Backprop
loss_torch.backward()
print(w_torch.grad)

# convert to numpy. We have to "detach" the tensors from the graph to stop gradient tracking
loss_torch = loss_torch.detach().numpy()
grad_torch = w_torch.grad[:, 0].detach().numpy()
tensor(0.1182, grad_fn=<BinaryCrossEntropyBackward>)
tensor([[-0.2353],
        [-0.1223],
        [-0.1976],
        [-0.0638]])
# Test
assert np.allclose(loss_np, loss_torch)
assert np.allclose(grad_np, grad_torch)
print("loss {}".format(loss_torch))
print("grad {}".format(grad_torch))
loss 0.11824005842208862
grad [-0.235 -0.122 -0.198 -0.064]
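Having verified the gradient, here is a sketch (not in the original notebook) of how it is typically consumed: a plain gradient-descent loop on the training set. Note that backward() accumulates into .grad, so the gradient must be zeroed after each step.

w = torch.zeros((D, 1), requires_grad=True)
for step in range(500):
    loss = criterion(predict(X_train_t, w), y_train_t)
    loss.backward()
    with torch.no_grad():  # update the parameters outside the graph
        w -= 0.5 * w.grad
        w.grad.zero_()  # backward() accumulates, so reset between steps
print("final train loss {}".format(loss.item()))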

Autograd on a DNN

Below we show how to define more complex deep neural networks and how to access their parameters. We can then call backward() on a scalar loss and extract the gradients of all the parameters. We base our presentation on http://d2l.ai/chapter_deep-learning-computation/parameters.html.

Sequential models

First we create a shallow MLP.

torch.manual_seed(0)

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))  # batch x Din, batch=2, Din=4
out = net(X)  # batch x Dout, Dout=1
print(out)
tensor([[-0.2531],
        [-0.3098]], grad_fn=<AddmmBackward>)

Let's visualize the model and all the parameters in each layer.

print(net)
Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)
for i in range(3):
    print(f"layer {i}")
    print(net[i].state_dict())
layer 0
OrderedDict([('weight', tensor([[-0.0037,  0.2682, -0.4115, -0.3680],
        [-0.1926,  0.1341, -0.0099,  0.3964],
        [-0.0444,  0.1323, -0.1511, -0.0983],
        [-0.4777, -0.3311, -0.2061,  0.0185],
        [ 0.1977,  0.3000, -0.3390, -0.2177],
        [ 0.1816,  0.4152, -0.1029,  0.3742],
        [-0.0806,  0.0529,  0.4527, -0.4638],
        [-0.3148, -0.1266, -0.1949,  0.4320]])), ('bias', tensor([-0.3241, -0.2302, -0.3493, -0.4683, -0.2919,  0.4298,  0.2231,  0.2423]))])
layer 1
OrderedDict()
layer 2
OrderedDict([('weight', tensor([[ 0.0186, -0.1813,  0.0598, -0.3301, -0.2555, -0.1823,  0.2231,  0.2073]])), ('bias', tensor([-0.1568]))])
print(*[(name, param.shape) for name, param in net.named_parameters()])
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))

Access a specific parameter.

print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)
print(net.state_dict()["2.bias"].data)
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.1568], requires_grad=True)
tensor([-0.1568])
tensor([-0.1568])

The gradient is not defined until we call backward.

net[2].weight.grad is None
True

Nested models

def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU())


def block2():
    net = nn.Sequential()
    for i in range(4):
        # Nested here
        net.add_module(f"block {i}", block1())
    return net


rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
print(rgnet(X))
print(rgnet)
tensor([[0.2138],
        [0.2138]], grad_fn=<AddmmBackward>)
Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

Let us access element 0 of the top-level sequence, which is the module containing blocks 0-3. Then we access element 1 of that, which is block 1. Finally we access element 0 of that, which is block 1's first linear layer.

rgnet[0][1][0].bias.data
tensor([ 0.1753, -0.4905, -0.4271, 0.2333, -0.2832, 0.2405, -0.3530, -0.2477])

Backprop

# set loss function to the mean of the squared outputs
out = rgnet(X)
loss = torch.mean(out**2, dim=0)

# Backprop
loss.backward()
print(rgnet[0][1][0].bias.grad)
tensor([-6.0363e-05, 0.0000e+00, 0.0000e+00, 7.7047e-05, 0.0000e+00, 5.7246e-05, 0.0000e+00, 0.0000e+00])

Tied parameters

Sometimes parameters are shared across multiple layers, as we show below. In this case, the gradient contributions from each use of the parameters are added together during backprop.

# We need to give the shared layer a name so that we can refer to its
# parameters
torch.manual_seed(0)
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1))
net(X)

# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure that they are actually the same object rather than just having the
# same value
print(net[2].weight.data[0] == net[4].weight.data[0])
tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
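A quick sketch (not in the original notebook) confirming that the shared layer has a single gradient tensor, which accumulates the contributions from both places it is used:

net.zero_grad()
net(X).sum().backward()
# net[2] and net[4] are the same module, so they share one .grad tensor that
# sums the contributions from both uses of the layer.
print(net[2].weight.grad is net[4].weight.grad)  # True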

To compute the gradient of a function that does not return a scalar, we must pass backward() a vector $v$; this computes the vector-Jacobian product $v^\top J$. For an elementwise function, passing a vector of ones recovers the derivative of each output with respect to its corresponding input, as we show below.

x = torch.tensor([-2, -1, 0, 1, 2], dtype=float, requires_grad=True)
print(x)
y = torch.pow(x, 2)
print(y)
y.backward(torch.ones_like(x))
print(x.grad)
tensor([-2., -1., 0., 1., 2.], dtype=torch.float64, requires_grad=True)
tensor([4., 1., 0., 1., 4.], dtype=torch.float64, grad_fn=<PowBackward0>)
tensor([-4., -2., 0., 2., 4.], dtype=torch.float64)
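If you need the full Jacobian rather than a vector-Jacobian product, PyTorch (1.5 and later) also provides torch.autograd.functional.jacobian; a minimal sketch:

from torch.autograd.functional import jacobian

# For y = x^2 applied elementwise, the Jacobian is a diagonal matrix with
# 2*x on the diagonal.
J = jacobian(lambda t: torch.pow(t, 2), torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]))
print(J)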