GitHub Repository: POSTECH-CVLab/PyTorch-StudioGAN
Path: blob/master/src/sync_batchnorm/batchnorm.py
"""
-*- coding: utf-8 -*-
File : batchnorm.py
Author : Jiayuan Mao
Email : [email protected]
Date : 27/01/2018

This file is part of Synchronized-BatchNorm-PyTorch.
https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
Distributed under MIT License.

MIT License

Copyright (c) 2018 Jiayuan MAO

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import collections
import contextlib

import torch
import torch.nn.functional as F

from torch.nn.modules.batchnorm import _BatchNorm

try:
    from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
except ImportError:
    ReduceAddCoalesced = Broadcast = None

try:
    from jactorch.parallel.comm import SyncMaster
    from jactorch.parallel.data_parallel import JacDataParallel as DataParallelWithCallback
except ImportError:
    from .comm import SyncMaster
    from .replicate import DataParallelWithCallback

__all__ = [
    'SynchronizedBatchNorm1d', 'SynchronizedBatchNorm2d', 'SynchronizedBatchNorm3d', 'patch_sync_batchnorm',
    'convert_model'
]


def _sum_ft(tensor):
    """sum over the first and last dimension"""
    return tensor.sum(dim=0).sum(dim=-1)


def _unsqueeze_ft(tensor):
    """add new dimensions at the front and the tail"""
    return tensor.unsqueeze(0).unsqueeze(-1)
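
# Note added for clarity (not in the original file): for an input reshaped to
# (B, C, L), _sum_ft sums over the batch and flattened spatial dimensions and
# returns a per-channel tensor of shape (C,); _unsqueeze_ft maps a (C,) tensor
# back to shape (1, C, 1) so the per-channel statistics broadcast against the
# (B, C, L) input.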


_ChildMessage = collections.namedtuple('_ChildMessage', ['sum', 'ssum', 'sum_size'])
_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])
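
# Note added for clarity (not in the original file): _ChildMessage is the
# per-replica message sent to the master (partial sum, sum of squares, and
# element count); _MasterMessage is what the master broadcasts back to every
# replica (the global mean and inverse standard deviation).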


class _SynchronizedBatchNorm(_BatchNorm):
    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True):
        assert ReduceAddCoalesced is not None, 'Can not use Synchronized Batch Normalization without CUDA support.'

        super(_SynchronizedBatchNorm, self).__init__(num_features,
                                                     eps=eps,
                                                     momentum=momentum,
                                                     affine=affine,
                                                     track_running_stats=track_running_stats)

        if not self.track_running_stats:
            import warnings
            warnings.warn('track_running_stats=False is not supported by the SynchronizedBatchNorm.')

        self._sync_master = SyncMaster(self._data_parallel_master)

        self._is_parallel = False
        self._parallel_id = None
        self._slave_pipe = None

    def forward(self, input):
        # If this is not a parallel computation or we are in evaluation mode, use PyTorch's implementation.
        if not (self._is_parallel and self.training):
            return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, self.training,
                                self.momentum, self.eps)

        # Resize the input to (B, C, -1).
        input_shape = input.size()
        input = input.view(input.size(0), self.num_features, -1)

        # Compute the sum and square-sum.
        sum_size = input.size(0) * input.size(2)
        input_sum = _sum_ft(input)
        input_ssum = _sum_ft(input**2)

        # Reduce-and-broadcast the statistics.
        if self._parallel_id == 0:
            mean, inv_std = self._sync_master.run_master(_ChildMessage(input_sum, input_ssum, sum_size))
        else:
            mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(input_sum, input_ssum, sum_size))

        # Compute the output.
        if self.affine:
            # MJY:: Fuse the multiplication for speed.
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std * self.weight) + _unsqueeze_ft(self.bias)
        else:
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std)

        # Reshape it.
        return output.view(input_shape)

    def __data_parallel_replicate__(self, ctx, copy_id):
        self._is_parallel = True
        self._parallel_id = copy_id

        # parallel_id == 0 means master device.
        if self._parallel_id == 0:
            ctx.sync_master = self._sync_master
        else:
            self._slave_pipe = ctx.sync_master.register_slave(copy_id)

    def _data_parallel_master(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast it."""

        # Always using the same "device order" makes the ReduceAdd operation faster.
        # Thanks to:: Tete Xiao (http://tetexiao.com/)
        intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]  # flatten
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

        return outputs

    def _compute_mean_std(self, sum_, ssum, size):
        """Compute the mean and standard-deviation with sum and square-sum. This method
        also maintains the moving average on the master device."""
        assert size > 1, 'BatchNorm computes unbiased standard-deviation, which requires size > 1.'
        mean = sum_ / size
        sumvar = ssum - sum_ * mean
        unbias_var = sumvar / (size - 1)
        bias_var = sumvar / size
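        # Note added for clarity (not in the original file): with s1 = sum(x)
        # and s2 = sum(x ** 2) over `size` elements,
        #   mean       = s1 / size
        #   sumvar     = s2 - s1 * mean       (equals sum((x - mean) ** 2))
        #   bias_var   = sumvar / size        (used to normalize this batch)
        #   unbias_var = sumvar / (size - 1)  (used for the running estimate)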

        if hasattr(torch, 'no_grad'):
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data
        else:
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data

        return mean, bias_var.clamp(self.eps)**-0.5


class SynchronizedBatchNorm1d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 2d or 3d input that is seen as a
    mini-batch.

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm1d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalizes the tensor on each device using
    the statistics only on that device, which accelerates the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for the one-GPU or CPU-only case, this module behaves exactly the
    same as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal BatchNorm.

    Args:
        num_features: num_features from an expected input of size
            `batch_size x num_features [x width]`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape::
        - Input: :math:`(N, C)` or :math:`(N, C, L)`
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm1d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100))
        >>> output = m(input)
    """
    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError('expected 2D or 3D input (got {}D input)'.format(input.dim()))


class SynchronizedBatchNorm2d(_SynchronizedBatchNorm):
    r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch
    of 3d inputs.

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm2d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalizes the tensor on each device using
    the statistics only on that device, which accelerates the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for the one-GPU or CPU-only case, this module behaves exactly the
    same as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial BatchNorm.

    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x height x width
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape::
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm2d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45))
        >>> output = m(input)
    """
    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError('expected 4D input (got {}D input)'.format(input.dim()))


class SynchronizedBatchNorm3d(_SynchronizedBatchNorm):
    r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch
    of 4d inputs.

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm3d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalizes the tensor on each device using
    the statistics only on that device, which accelerates the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for the one-GPU or CPU-only case, this module behaves exactly the
    same as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric BatchNorm
    or Spatio-temporal BatchNorm.

    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x depth x height x width
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape::
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm3d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45, 10))
        >>> output = m(input)
    """
    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError('expected 5D input (got {}D input)'.format(input.dim()))


@contextlib.contextmanager
def patch_sync_batchnorm():
    import torch.nn as nn

    backup = nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d

    nn.BatchNorm1d = SynchronizedBatchNorm1d
    nn.BatchNorm2d = SynchronizedBatchNorm2d
    nn.BatchNorm3d = SynchronizedBatchNorm3d

    yield

    nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d = backup
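
# Illustrative usage sketch (not part of the original file); `build_model` is a
# hypothetical factory that constructs a network via the torch.nn.BatchNorm*d
# names, which this context manager temporarily rebinds:
#
#     with patch_sync_batchnorm():
#         model = build_model()  # picks up SynchronizedBatchNorm1d/2d/3d
#     model = DataParallelWithCallback(model, device_ids=[0, 1])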


def convert_model(module):
    """Traverse the input module and its children recursively
    and replace all instances of torch.nn.modules.batchnorm.BatchNorm*N*d
    with SynchronizedBatchNorm*N*d.

    Args:
        module: the input module that needs to be converted to a SyncBN model

    Examples:
        >>> import torch.nn as nn
        >>> import torchvision
        >>> # m is a standard pytorch model
        >>> m = torchvision.models.resnet18(True)
        >>> m = nn.DataParallel(m)
        >>> # after convert, m is using SyncBN
        >>> m = convert_model(m)
    """
    if isinstance(module, torch.nn.DataParallel):
        mod = module.module
        mod = convert_model(mod)
        mod = DataParallelWithCallback(mod, device_ids=module.device_ids)
        return mod

    mod = module
    for pth_module, sync_module in zip([
            torch.nn.modules.batchnorm.BatchNorm1d, torch.nn.modules.batchnorm.BatchNorm2d,
            torch.nn.modules.batchnorm.BatchNorm3d
    ], [SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d]):
        if isinstance(module, pth_module):
            mod = sync_module(module.num_features, module.eps, module.momentum, module.affine)
            mod.running_mean = module.running_mean
            mod.running_var = module.running_var
            if module.affine:
                mod.weight.data = module.weight.data.clone().detach()
                mod.bias.data = module.bias.data.clone().detach()

    for name, child in module.named_children():
        mod.add_module(name, convert_model(child))

    return mod
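
# Illustrative sketch (not part of the original file): convert_model can also be
# applied to a module that is not wrapped in nn.DataParallel; BatchNorm layers
# are then replaced recursively and the converted module is returned.
#
#     import torch.nn as nn
#     net = nn.Sequential(nn.Conv2d(3, 16, 3), nn.BatchNorm2d(16), nn.ReLU())
#     net = convert_model(net)  # nn.BatchNorm2d -> SynchronizedBatchNorm2d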