Path: blob/master/src/sync_batchnorm/batchnorm.py
"""1-*- coding: utf-8 -*-2File : batchnorm.py3Author : Jiayuan Mao4Email : [email protected]5Date : 27/01/201867This file is part of Synchronized-BatchNorm-PyTorch.8https://github.com/vacancy/Synchronized-BatchNorm-PyTorch9Distributed under MIT License.1011MIT License1213Copyright (c) 2018 Jiayuan MAO1415Permission is hereby granted, free of charge, to any person obtaining a copy16of this software and associated documentation files (the "Software"), to deal17in the Software without restriction, including without limitation the rights18to use, copy, modify, merge, publish, distribute, sublicense, and/or sell19copies of the Software, and to permit persons to whom the Software is20furnished to do so, subject to the following conditions:2122The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.2324THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE30SOFTWARE.31"""3233import collections34import contextlib3536import torch37import torch.nn.functional as F3839from torch.nn.modules.batchnorm import _BatchNorm4041try:42from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast43except ImportError:44ReduceAddCoalesced = Broadcast = None4546try:47from jactorch.parallel.comm import SyncMaster48from jactorch.parallel.data_parallel import JacDataParallel as DataParallelWithCallback49except ImportError:50from .comm import SyncMaster51from .replicate import DataParallelWithCallback5253__all__ = [54'SynchronizedBatchNorm1d', 'SynchronizedBatchNorm2d', 'SynchronizedBatchNorm3d', 'patch_sync_batchnorm',55'convert_model'56]575859def _sum_ft(tensor):60"""sum over the first and last dimention"""61return tensor.sum(dim=0).sum(dim=-1)626364def _unsqueeze_ft(tensor):65"""add new dimensions at the front and the tail"""66return tensor.unsqueeze(0).unsqueeze(-1)676869_ChildMessage = collections.namedtuple('_ChildMessage', ['sum', 'ssum', 'sum_size'])70_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])717273class _SynchronizedBatchNorm(_BatchNorm):74def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True):75assert ReduceAddCoalesced is not None, 'Can not use Synchronized Batch Normalization without CUDA support.'7677super(_SynchronizedBatchNorm, self).__init__(num_features,78eps=eps,79momentum=momentum,80affine=affine,81track_running_stats=track_running_stats)8283if not self.track_running_stats:84import warnings85warnings.warn('track_running_stats=False is not supported by the SynchronizedBatchNorm.')8687self._sync_master = SyncMaster(self._data_parallel_master)8889self._is_parallel = False90self._parallel_id = None91self._slave_pipe = None9293def forward(self, input):94# If it is not parallel computation or is in evaluation mode, use PyTorch's implementation.95if not (self._is_parallel and self.training):96return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, self.training,97self.momentum, self.eps)9899# Resize the input to (B, C, -1).100input_shape = input.size()101input = input.view(input.size(0), self.num_features, -1)102103# Compute the 
        sum_size = input.size(0) * input.size(2)
        input_sum = _sum_ft(input)
        input_ssum = _sum_ft(input ** 2)

        # Reduce-and-broadcast the statistics.
        if self._parallel_id == 0:
            mean, inv_std = self._sync_master.run_master(_ChildMessage(input_sum, input_ssum, sum_size))
        else:
            mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(input_sum, input_ssum, sum_size))

        # Compute the output.
        if self.affine:
            # MJY:: Fuse the multiplication for speed.
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std * self.weight) + _unsqueeze_ft(self.bias)
        else:
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std)

        # Reshape it.
        return output.view(input_shape)

    def __data_parallel_replicate__(self, ctx, copy_id):
        self._is_parallel = True
        self._parallel_id = copy_id

        # parallel_id == 0 means master device.
        if self._parallel_id == 0:
            ctx.sync_master = self._sync_master
        else:
            self._slave_pipe = ctx.sync_master.register_slave(copy_id)

    def _data_parallel_master(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast it."""

        # Always using same "device order" makes the ReduceAdd operation faster.
        # Thanks to:: Tete Xiao (http://tetexiao.com/)
        intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]  # flatten
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

        return outputs

    def _compute_mean_std(self, sum_, ssum, size):
        """Compute the mean and standard-deviation with sum and square-sum. This method
        also maintains the moving average on the master device."""
        assert size > 1, 'BatchNorm computes unbiased standard-deviation, which requires size > 1.'
        mean = sum_ / size
        sumvar = ssum - sum_ * mean
        unbias_var = sumvar / (size - 1)
        bias_var = sumvar / size

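        # Since Var[x] = E[x^2] - E[x]^2, `sumvar` above equals the sum of squared
        # deviations from the mean over the combined batch. As with torch.nn.BatchNorm*,
        # the unbiased estimate (divided by size - 1) updates the running statistics
        # below, while the biased estimate (divided by size) is used to normalize the
        # current batch via the returned inverse standard deviation.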
        if hasattr(torch, 'no_grad'):
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data
        else:
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data

        return mean, bias_var.clamp(self.eps) ** -0.5


class SynchronizedBatchNorm1d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 2d or 3d input that is seen as a
    mini-batch.

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm1d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalizes the tensor on each device using
    the statistics only on that device, which accelerates the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for the one-GPU or CPU-only case, this module behaves exactly the same
    as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal BatchNorm.

    Args:
        num_features: num_features from an expected input of size
            `batch_size x num_features [x width]`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm1d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100))
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError('expected 2D or 3D input (got {}D input)'.format(input.dim()))


class SynchronizedBatchNorm2d(_SynchronizedBatchNorm):
    r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch
    of 3d inputs.

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm2d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalizes the tensor on each device using
    the statistics only on that device, which accelerates the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for the one-GPU or CPU-only case, this module behaves exactly the same
    as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial BatchNorm.

    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x height x width
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm2d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45))
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError('expected 4D input (got {}D input)'.format(input.dim()))


class SynchronizedBatchNorm3d(_SynchronizedBatchNorm):
    r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch
    of 4d inputs.

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm3d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalizes the tensor on each device using
    the statistics only on that device, which accelerates the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for the one-GPU or CPU-only case, this module behaves exactly the same
    as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric BatchNorm
    or Spatio-temporal BatchNorm.

    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x depth x height x width
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm3d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45, 10))
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError('expected 5D input (got {}D input)'.format(input.dim()))


@contextlib.contextmanager
def patch_sync_batchnorm():
    """Context manager that temporarily replaces torch.nn.BatchNorm1d/2d/3d with
    their synchronized counterparts, so that models built inside the ``with`` block
    use SynchronizedBatchNorm without any further code changes."""
    import torch.nn as nn

    backup = nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d

    nn.BatchNorm1d = SynchronizedBatchNorm1d
    nn.BatchNorm2d = SynchronizedBatchNorm2d
    nn.BatchNorm3d = SynchronizedBatchNorm3d

    yield

    nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d = backup


def convert_model(module):
    """Traverse the input module and its children recursively
    and replace all instances of torch.nn.modules.batchnorm.BatchNorm*N*d
    with SynchronizedBatchNorm*N*d.

    Args:
        module: the input module to be converted to a SyncBN model

    Examples:
        >>> import torch.nn as nn
        >>> import torchvision
        >>> # m is a standard pytorch model
        >>> m = torchvision.models.resnet18(True)
        >>> m = nn.DataParallel(m)
        >>> # after convert, m is using SyncBN
        >>> m = convert_model(m)
    """
    if isinstance(module, torch.nn.DataParallel):
        mod = module.module
        mod = convert_model(mod)
        mod = DataParallelWithCallback(mod, device_ids=module.device_ids)
        return mod

    mod = module
    for pth_module, sync_module in zip([
            torch.nn.modules.batchnorm.BatchNorm1d, torch.nn.modules.batchnorm.BatchNorm2d,
            torch.nn.modules.batchnorm.BatchNorm3d
    ], [SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d]):
        if isinstance(module, pth_module):
            mod = sync_module(module.num_features, module.eps, module.momentum, module.affine)
            mod.running_mean = module.running_mean
            mod.running_var = module.running_var
            if module.affine:
                mod.weight.data = module.weight.data.clone().detach()
                mod.bias.data = module.bias.data.clone().detach()

    for name, child in module.named_children():
        mod.add_module(name, convert_model(child))

    return mod
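

# ---------------------------------------------------------------------------
# Illustrative usage sketch (an editorial addition, not part of the original
# module). It exercises the single-device fallback path of
# SynchronizedBatchNorm2d (which simply calls F.batch_norm) and shows how
# convert_model swaps the BatchNorm layers of an existing network. Because of
# the relative imports above, run it from the package root, e.g.
# `python -m sync_batchnorm.batchnorm`.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import torch.nn as nn

    # Single-device forward pass: behaves exactly like nn.BatchNorm2d.
    bn = SynchronizedBatchNorm2d(8)
    x = torch.randn(4, 8, 16, 16)
    y = bn(x)
    print('output shape:', tuple(y.shape))

    # Replace the BatchNorm layers of a small model with synchronized ones.
    model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU())
    model = convert_model(model)
    print('converted model:', model)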