CoCalc -- gfpganv1

GitHub Repository: TencentARC/GFPGAN
Path: blob/master/gfpgan/archs/gfpganv1_arch.py
⁹⁵⁴ views
1
import math
import random
import torch
from basicsr.archs.stylegan2_arch import (ConvLayer, EqualConv2d, EqualLinear, ResBlock, ScaledLeakyReLU,
                                          StyleGAN2Generator)
from basicsr.ops.fused_act import FusedLeakyReLU
from basicsr.utils.registry import ARCH_REGISTRY
from torch import nn
from torch.nn import functional as F
10

11

12
class StyleGAN2GeneratorSFT(StyleGAN2Generator):
    """StyleGAN2 Generator with SFT modulation (Spatial Feature Transform).

    The constructor forwards everything except ``sft_half`` to the parent
    ``StyleGAN2Generator``; ``forward`` additionally consumes a list of SFT
    conditions that scale and shift the intermediate feature maps.

    Args:
        out_size (int): The spatial size of outputs.
        num_style_feat (int): Channel number of style features. Default: 512.
        num_mlp (int): Layer number of MLP style layers. Default: 8.
        channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2.
        resample_kernel (list[int]): A list indicating the 1D resample kernel magnitude. A cross product will be
            applied to extend the 1D resample kernel to a 2D resample kernel. Default: (1, 3, 3, 1).
        lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
        narrow (float): The narrow ratio for channels. Default: 1.
        sft_half (bool): Whether to apply SFT on half of the input channels. Default: False.
    """

    def __init__(self,
                 out_size,
                 num_style_feat=512,
                 num_mlp=8,
                 channel_multiplier=2,
                 resample_kernel=(1, 3, 3, 1),
                 lr_mlp=0.01,
                 narrow=1,
                 sft_half=False):
        super(StyleGAN2GeneratorSFT, self).__init__(
            out_size,
            num_style_feat=num_style_feat,
            num_mlp=num_mlp,
            channel_multiplier=channel_multiplier,
            resample_kernel=resample_kernel,
            lr_mlp=lr_mlp,
            narrow=narrow)
        self.sft_half = sft_half

    def forward(self,
                styles,
                conditions,
                input_is_latent=False,
                noise=None,
                randomize_noise=True,
                truncation=1,
                truncation_latent=None,
                inject_index=None,
                return_latents=False):
        """Forward function for StyleGAN2GeneratorSFT.

        Args:
            styles (list[Tensor]): Sample codes of styles. Must contain one or two tensors.
            conditions (list[Tensor]): SFT conditions to generators, laid out as
                ``[scale_0, shift_0, scale_1, shift_1, ...]``.
            input_is_latent (bool): Whether input is latent style. Default: False.
            noise (Tensor | None): Input noise or None. Default: None.
            randomize_noise (bool): Randomize noise, used when 'noise' is None. Default: True.
            truncation (float): The truncation ratio. Default: 1.
            truncation_latent (Tensor | None): The truncation latent tensor. Required when
                ``truncation < 1``. Default: None.
            inject_index (int | None): The injection index for mixing noise. Default: None.
            return_latents (bool): Whether to return style latents. Default: False.

        Returns:
            tuple: ``(image, latent)`` when ``return_latents`` is True, otherwise ``(image, None)``.

        Raises:
            ValueError: If ``truncation < 1`` without a ``truncation_latent``, or if ``styles``
                does not contain exactly one or two entries.
        """
        # style codes -> latents with Style MLP layer
        if not input_is_latent:
            styles = [self.style_mlp(s) for s in styles]

        # noises
        if noise is None:
            if randomize_noise:
                noise = [None] * self.num_layers  # for each style conv layer
            else:  # use the stored noise
                noise = [getattr(self.noises, f'noise{i}') for i in range(self.num_layers)]

        # style truncation
        if truncation < 1:
            if truncation_latent is None:
                # fail early with a readable message instead of a Tensor/None arithmetic TypeError
                raise ValueError('truncation_latent must be provided when truncation < 1.')
            styles = [truncation_latent + truncation * (style - truncation_latent) for style in styles]

        # get style latents with injection
        if len(styles) == 1:
            inject_index = self.num_latent

            if styles[0].ndim < 3:
                # repeat latent code for all the layers
                latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
            else:  # used for encoder with different latent code for each layer
                latent = styles[0]
        elif len(styles) == 2:  # mixing noises
            if inject_index is None:
                inject_index = random.randint(1, self.num_latent - 1)
            latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
            latent2 = styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1)
            latent = torch.cat([latent1, latent2], 1)
        else:
            # previously this fell through and crashed later on an unbound `latent`
            raise ValueError(f'styles must contain 1 or 2 tensors, but got {len(styles)}.')

        # main generation
        out = self.constant_input(latent.shape[0])
        out = self.style_conv1(out, latent[:, 0], noise=noise[0])
        skip = self.to_rgb1(out, latent[:, 1])

        # `i` indexes both the latent slices and the (scale, shift) pair in `conditions`
        i = 1
        for conv1, conv2, noise1, noise2, to_rgb in zip(self.style_convs[::2], self.style_convs[1::2], noise[1::2],
                                                        noise[2::2], self.to_rgbs):
            out = conv1(out, latent[:, i], noise=noise1)

            # the conditions may have fewer levels
            if i < len(conditions):
                # SFT part to combine the conditions: conditions[i - 1] is the scale, conditions[i] the shift
                if self.sft_half:  # only apply SFT to half of the channels
                    out_same, out_sft = torch.split(out, out.size(1) // 2, dim=1)
                    out_sft = out_sft * conditions[i - 1] + conditions[i]
                    out = torch.cat([out_same, out_sft], dim=1)
                else:  # apply SFT to all the channels
                    out = out * conditions[i - 1] + conditions[i]

            out = conv2(out, latent[:, i + 1], noise=noise2)
            skip = to_rgb(out, latent[:, i + 2], skip)  # feature back to the rgb space
            i += 2

        image = skip

        if return_latents:
            return image, latent
        return image, None
130

131

132
class ConvUpLayer(nn.Module):
    """Convolutional upsampling layer. It uses bilinear upsampler + Conv.

    The input is first upsampled by a factor of 2 (bilinear), then convolved
    with a weight that is rescaled by ``1 / sqrt(in_channels * kernel_size**2)``
    at every forward pass.

    Args:
        in_channels (int): Channel number of the input.
        out_channels (int): Channel number of the output.
        kernel_size (int): Size of the convolving kernel.
        stride (int): Stride of the convolution. Default: 1
        padding (int): Zero-padding added to both sides of the input. Default: 0.
        bias (bool): If ``True``, adds a learnable bias to the output. Default: ``True``.
        bias_init_val (float): Bias initialized value. Only takes effect when ``activate`` is False, because
            the convolution bias parameter is only created in that case. Default: 0.
        activate (bool): Whether use activation. Default: True.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 bias=True,
                 bias_init_val=0,
                 activate=True):
        super(ConvUpLayer, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        # self.scale is used to scale the convolution weights, which is related to the common initializations.
        self.scale = 1 / math.sqrt(in_channels * kernel_size**2)

        self.weight = nn.Parameter(torch.randn(out_channels, in_channels, kernel_size, kernel_size))

        # The conv itself only owns a bias when no activation follows; with an activation and
        # bias=True the FusedLeakyReLU below is built with out_channels instead
        # (presumably it carries its own bias -- confirm against basicsr).
        if bias and not activate:
            self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
        else:
            self.register_parameter('bias', None)

        # activation
        if not activate:
            self.activation = None
        elif bias:
            self.activation = FusedLeakyReLU(out_channels)
        else:
            self.activation = ScaledLeakyReLU(0.2)

    def forward(self, x):
        """Upsample ``x`` by 2x, convolve it, and apply the optional activation.

        Args:
            x (Tensor): Input feature map.

        Returns:
            Tensor: The upsampled and convolved feature map.
        """
        # bilinear upsample
        out = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=False)
        # conv
        out = F.conv2d(
            out,
            self.weight * self.scale,
            bias=self.bias,
            stride=self.stride,
            padding=self.padding,
        )
        # activation
        if self.activation is not None:
            out = self.activation(out)
        return out
195

196

197
class ResUpBlock(nn.Module):
    """Residual block with upsampling.

    The main branch is a 3x3 ``ConvLayer`` followed by a 3x3 ``ConvUpLayer``;
    the skip branch is a 1x1 ``ConvUpLayer`` without bias or activation. Both
    branches are summed and divided by ``sqrt(2)``.

    Args:
        in_channels (int): Channel number of the input.
        out_channels (int): Channel number of the output.
    """

    def __init__(self, in_channels, out_channels):
        super(ResUpBlock, self).__init__()

        self.conv1 = ConvLayer(in_channels, in_channels, 3, bias=True, activate=True)
        self.conv2 = ConvUpLayer(in_channels, out_channels, 3, stride=1, padding=1, bias=True, activate=True)
        self.skip = ConvUpLayer(in_channels, out_channels, 1, bias=False, activate=False)

    def forward(self, x):
        """Run both branches on ``x`` and merge them.

        Args:
            x (Tensor): Input feature map.

        Returns:
            Tensor: ``(main_branch(x) + skip_branch(x)) / sqrt(2)``.
        """
        out = self.conv1(x)
        out = self.conv2(out)
        skip = self.skip(x)
        # dividing by sqrt(2) rescales the sum of the two branches
        out = (out + skip) / math.sqrt(2)
        return out
218

219

220
@ARCH_REGISTRY.register()
class GFPGANv1(nn.Module):
    """The GFPGAN architecture: Unet + StyleGAN2 decoder with SFT.

    Ref: GFP-GAN: Towards Real-World Blind Face Restoration with Generative Facial Prior.

    Args:
        out_size (int): The spatial size of outputs.
        num_style_feat (int): Channel number of style features. Default: 512.
        channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 1.
        resample_kernel (list[int]): A list indicating the 1D resample kernel magnitude. A cross product will be
            applied to extend the 1D resample kernel to a 2D resample kernel. Default: (1, 3, 3, 1).
        decoder_load_path (str): The path to the pre-trained decoder model (usually, the StyleGAN2). Default: None.
        fix_decoder (bool): Whether to fix the decoder. Default: True.

        num_mlp (int): Layer number of MLP style layers. Default: 8.
        lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
        input_is_latent (bool): Whether input is latent style. Default: False.
        different_w (bool): Whether to use different latent w for different layers. Default: False.
        narrow (float): The narrow ratio for channels. Default: 1.
        sft_half (bool): Whether to apply SFT on half of the input channels. Default: False.
    """

    def __init__(
            self,
            out_size,
            num_style_feat=512,
            channel_multiplier=1,
            resample_kernel=(1, 3, 3, 1),
            decoder_load_path=None,
            fix_decoder=True,
            # for stylegan decoder
            num_mlp=8,
            lr_mlp=0.01,
            input_is_latent=False,
            different_w=False,
            narrow=1,
            sft_half=False):

        super(GFPGANv1, self).__init__()
        self.input_is_latent = input_is_latent
        self.different_w = different_w
        self.num_style_feat = num_style_feat

        unet_narrow = narrow * 0.5  # by default, use a half of input channels
        channels = {
            '4': int(512 * unet_narrow),
            '8': int(512 * unet_narrow),
            '16': int(512 * unet_narrow),
            '32': int(512 * unet_narrow),
            '64': int(256 * channel_multiplier * unet_narrow),
            '128': int(128 * channel_multiplier * unet_narrow),
            '256': int(64 * channel_multiplier * unet_narrow),
            '512': int(32 * channel_multiplier * unet_narrow),
            '1024': int(16 * channel_multiplier * unet_narrow)
        }

        # math.log2 avoids the rounding error math.log(x, 2) can introduce for exact powers of two;
        # the value is computed once and reused below.
        self.log_size = int(math.log2(out_size))
        first_out_size = 2**self.log_size

        self.conv_body_first = ConvLayer(3, channels[f'{first_out_size}'], 1, bias=True, activate=True)

        # downsample
        in_channels = channels[f'{first_out_size}']
        self.conv_body_down = nn.ModuleList()
        for i in range(self.log_size, 2, -1):
            out_channels = channels[f'{2**(i - 1)}']
            self.conv_body_down.append(ResBlock(in_channels, out_channels, resample_kernel))
            in_channels = out_channels

        self.final_conv = ConvLayer(in_channels, channels['4'], 3, bias=True, activate=True)

        # upsample
        in_channels = channels['4']
        self.conv_body_up = nn.ModuleList()
        for i in range(3, self.log_size + 1):
            out_channels = channels[f'{2**i}']
            self.conv_body_up.append(ResUpBlock(in_channels, out_channels))
            in_channels = out_channels

        # to RGB
        self.toRGB = nn.ModuleList()
        for i in range(3, self.log_size + 1):
            self.toRGB.append(EqualConv2d(channels[f'{2**i}'], 3, 1, stride=1, padding=0, bias=True, bias_init_val=0))

        if different_w:
            # one style vector per decoder latent slot
            linear_out_channel = (self.log_size * 2 - 2) * num_style_feat
        else:
            linear_out_channel = num_style_feat

        self.final_linear = EqualLinear(
            channels['4'] * 4 * 4, linear_out_channel, bias=True, bias_init_val=0, lr_mul=1, activation=None)

        # the decoder: stylegan2 generator with SFT modulations
        self.stylegan_decoder = StyleGAN2GeneratorSFT(
            out_size=out_size,
            num_style_feat=num_style_feat,
            num_mlp=num_mlp,
            channel_multiplier=channel_multiplier,
            resample_kernel=resample_kernel,
            lr_mlp=lr_mlp,
            narrow=narrow,
            sft_half=sft_half)

        # load pre-trained stylegan2 model if necessary
        if decoder_load_path:
            # NOTE(security): torch.load unpickles the checkpoint; only load files from trusted sources.
            self.stylegan_decoder.load_state_dict(
                torch.load(decoder_load_path, map_location=lambda storage, loc: storage)['params_ema'])
        # fix decoder without updating params
        if fix_decoder:
            for param in self.stylegan_decoder.parameters():
                param.requires_grad = False

        # for SFT modulations (scale and shift)
        self.condition_scale = nn.ModuleList()
        self.condition_shift = nn.ModuleList()
        for i in range(3, self.log_size + 1):
            out_channels = channels[f'{2**i}']
            # with sft_half only half of the decoder channels are modulated, so half as many
            # condition channels are produced
            if sft_half:
                sft_out_channels = out_channels
            else:
                sft_out_channels = out_channels * 2
            # scale branch: last conv bias starts at 1
            self.condition_scale.append(
                nn.Sequential(
                    EqualConv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=True, bias_init_val=0),
                    ScaledLeakyReLU(0.2),
                    EqualConv2d(out_channels, sft_out_channels, 3, stride=1, padding=1, bias=True, bias_init_val=1)))
            # shift branch: last conv bias starts at 0
            self.condition_shift.append(
                nn.Sequential(
                    EqualConv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=True, bias_init_val=0),
                    ScaledLeakyReLU(0.2),
                    EqualConv2d(out_channels, sft_out_channels, 3, stride=1, padding=1, bias=True, bias_init_val=0)))

    def forward(self, x, return_latents=False, return_rgb=True, randomize_noise=True, **kwargs):
        """Forward function for GFPGANv1.

        Args:
            x (Tensor): Input images.
            return_latents (bool): Forwarded to the StyleGAN2 decoder. The latents the decoder returns are
                discarded here, so this flag does not change what this method returns. Default: False.
            return_rgb (bool): Whether return intermediate rgb images. Default: True.
            randomize_noise (bool): Randomize noise, used when 'noise' is None. Default: True.
            **kwargs: Accepted and ignored.

        Returns:
            tuple: ``(image, out_rgbs)`` where ``out_rgbs`` is the list of intermediate rgb images
            (empty when ``return_rgb`` is False).
        """
        conditions = []
        unet_skips = []
        out_rgbs = []

        # encoder
        feat = self.conv_body_first(x)
        for i in range(self.log_size - 2):
            feat = self.conv_body_down[i](feat)
            # prepend so that unet_skips[0] is the deepest (last produced) feature
            unet_skips.insert(0, feat)

        feat = self.final_conv(feat)

        # style code
        style_code = self.final_linear(feat.view(feat.size(0), -1))
        if self.different_w:
            style_code = style_code.view(style_code.size(0), -1, self.num_style_feat)

        # decode
        for i in range(self.log_size - 2):
            # add unet skip
            feat = feat + unet_skips[i]
            # ResUpLayer
            feat = self.conv_body_up[i](feat)
            # generate scale and shift for SFT layers (appended as scale, shift pairs)
            scale = self.condition_scale[i](feat)
            conditions.append(scale.clone())
            shift = self.condition_shift[i](feat)
            conditions.append(shift.clone())
            # generate rgb images
            if return_rgb:
                out_rgbs.append(self.toRGB[i](feat))

        # decoder
        image, _ = self.stylegan_decoder([style_code],
                                         conditions,
                                         return_latents=return_latents,
                                         input_is_latent=self.input_is_latent,
                                         randomize_noise=randomize_noise)

        return image, out_rgbs
402

403

404
@ARCH_REGISTRY.register()
class FacialComponentDiscriminator(nn.Module):
    """Facial component (eyes, mouth, nose) discriminator used in GFPGAN.

    A fixed-size stack of five 3x3 ``ConvLayer`` modules (two of them
    downsampling) followed by a final 3x3 ``ConvLayer`` with a single output
    channel and no activation.
    """

    def __init__(self):
        super(FacialComponentDiscriminator, self).__init__()
        # It now uses a VGG-style architecture with fixed model size
        self.conv1 = ConvLayer(3, 64, 3, downsample=False, resample_kernel=(1, 3, 3, 1), bias=True, activate=True)
        self.conv2 = ConvLayer(64, 128, 3, downsample=True, resample_kernel=(1, 3, 3, 1), bias=True, activate=True)
        self.conv3 = ConvLayer(128, 128, 3, downsample=False, resample_kernel=(1, 3, 3, 1), bias=True, activate=True)
        self.conv4 = ConvLayer(128, 256, 3, downsample=True, resample_kernel=(1, 3, 3, 1), bias=True, activate=True)
        self.conv5 = ConvLayer(256, 256, 3, downsample=False, resample_kernel=(1, 3, 3, 1), bias=True, activate=True)
        self.final_conv = ConvLayer(256, 1, 3, bias=True, activate=False)

    def forward(self, x, return_feats=False, **kwargs):
        """Forward function for FacialComponentDiscriminator.

        Args:
            x (Tensor): Input images.
            return_feats (bool): Whether to return intermediate features. Default: False.
            **kwargs: Accepted and ignored.

        Returns:
            tuple: ``(out, feats)`` where ``feats`` is a list with copies of the features after ``conv3``
            and after ``conv5`` when ``return_feats`` is True, otherwise ``None``.
        """
        feat = self.conv1(x)
        feat = self.conv3(self.conv2(feat))
        rlt_feats = []
        if return_feats:
            rlt_feats.append(feat.clone())
        feat = self.conv5(self.conv4(feat))
        if return_feats:
            rlt_feats.append(feat.clone())
        out = self.final_conv(feat)

        if return_feats:
            return out, rlt_feats
        return out, None
440

441
Product

Resources

Company