GitHub Repository: automatic1111/stable-diffusion-webui
Path: blob/master/modules/lowvram.py
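
This module implements the webui's --lowvram and --medvram memory optimizations: the large components of a Stable Diffusion checkpoint are parked on the CPU, and forward pre-hooks shuttle each component to the GPU only while it is actually running, so at most one of them occupies VRAM at a time.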
from collections import namedtuple

import torch
from modules import devices, shared

module_in_gpu = None
cpu = torch.device("cpu")

ModuleWithParent = namedtuple('ModuleWithParent', ['module', 'parent'], defaults=[None])
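# medvram_modules() may yield either plain modules or ModuleWithParent entries;
# `parent` names the larger module that should be moved between devices whenever
# the hooked submodule runs (see the `parents` mapping in setup_for_low_vram)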


def send_everything_to_cpu():
    global module_in_gpu

    if module_in_gpu is not None:
        module_in_gpu.to(cpu)

    module_in_gpu = None


def is_needed(sd_model):
    # --medvram-sdxl only applies when the loaded checkpoint is SDXL (i.e. has a conditioner)
    return shared.cmd_opts.lowvram or shared.cmd_opts.medvram or (shared.cmd_opts.medvram_sdxl and hasattr(sd_model, 'conditioner'))


def apply(sd_model):
    enable = is_needed(sd_model)
    shared.parallel_processing_allowed = not enable

    if enable:
        setup_for_low_vram(sd_model, not shared.cmd_opts.lowvram)
    else:
        sd_model.lowvram = False


def setup_for_low_vram(sd_model, use_medvram):
    if getattr(sd_model, 'lowvram', False):
        return

    sd_model.lowvram = True

    parents = {}

    def send_me_to_gpu(module, _):
        """send this module to GPU; send whatever tracked module was previously on the GPU to CPU;
        we add this as a forward_pre_hook to a lot of modules, and this way all but one of them will
        be on the CPU
        """
        global module_in_gpu

        module = parents.get(module, module)

        if module_in_gpu == module:
            return

        if module_in_gpu is not None:
            module_in_gpu.to(cpu)

        module.to(devices.device)
        module_in_gpu = module

    # see below for register_forward_pre_hook;
    # first_stage_model does not use forward(), it uses encode/decode, so register_forward_pre_hook is
    # useless here, and we just replace those methods

    first_stage_model = sd_model.first_stage_model
    first_stage_model_encode = sd_model.first_stage_model.encode
    first_stage_model_decode = sd_model.first_stage_model.decode

    def first_stage_model_encode_wrap(x):
        send_me_to_gpu(first_stage_model, None)
        return first_stage_model_encode(x)

    def first_stage_model_decode_wrap(z):
        send_me_to_gpu(first_stage_model, None)
        return first_stage_model_decode(z)

    to_remain_in_cpu = [
        (sd_model, 'first_stage_model'),
        (sd_model, 'depth_model'),
        (sd_model, 'embedder'),
        (sd_model, 'model'),
    ]

    is_sdxl = hasattr(sd_model, 'conditioner')
    is_sd2 = not is_sdxl and hasattr(sd_model.cond_stage_model, 'model')

    if hasattr(sd_model, 'medvram_fields'):
        to_remain_in_cpu = sd_model.medvram_fields()
    elif is_sdxl:
        to_remain_in_cpu.append((sd_model, 'conditioner'))
    elif is_sd2:
        to_remain_in_cpu.append((sd_model.cond_stage_model, 'model'))
    else:
        to_remain_in_cpu.append((sd_model.cond_stage_model, 'transformer'))

    # remove several big modules: cond, first_stage, depth/embedder (if applicable), and unet from the model
    stored = []
    for obj, field in to_remain_in_cpu:
        module = getattr(obj, field, None)
        stored.append(module)
        setattr(obj, field, None)

    # send the model to GPU.
    sd_model.to(devices.device)

    # put modules back. the modules will be on the CPU.
    for (obj, field), module in zip(to_remain_in_cpu, stored):
        setattr(obj, field, module)
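    # because the big modules were detached while sd_model.to() ran above, only
    # the small remainder of the model actually ended up on the GPU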

    # register hooks for the first three models
    if hasattr(sd_model, "cond_stage_model") and hasattr(sd_model.cond_stage_model, "medvram_modules"):
        for module in sd_model.cond_stage_model.medvram_modules():
            if isinstance(module, ModuleWithParent):
                parent = module.parent
                module = module.module
            else:
                parent = None

            if module:
                module.register_forward_pre_hook(send_me_to_gpu)

                if parent:
                    parents[module] = parent

    elif is_sdxl:
        sd_model.conditioner.register_forward_pre_hook(send_me_to_gpu)
    elif is_sd2:
        sd_model.cond_stage_model.model.register_forward_pre_hook(send_me_to_gpu)
        sd_model.cond_stage_model.model.token_embedding.register_forward_pre_hook(send_me_to_gpu)
        parents[sd_model.cond_stage_model.model] = sd_model.cond_stage_model
        parents[sd_model.cond_stage_model.model.token_embedding] = sd_model.cond_stage_model
    else:
        sd_model.cond_stage_model.transformer.register_forward_pre_hook(send_me_to_gpu)
        parents[sd_model.cond_stage_model.transformer] = sd_model.cond_stage_model
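
    # entries in `parents` make the hook move the entire cond_stage_model between
    # devices rather than just the hooked submodule (see parents.get() in send_me_to_gpu)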

    sd_model.first_stage_model.register_forward_pre_hook(send_me_to_gpu)
    sd_model.first_stage_model.encode = first_stage_model_encode_wrap
    sd_model.first_stage_model.decode = first_stage_model_decode_wrap
    if getattr(sd_model, 'depth_model', None) is not None:
        sd_model.depth_model.register_forward_pre_hook(send_me_to_gpu)
    if getattr(sd_model, 'embedder', None) is not None:
        sd_model.embedder.register_forward_pre_hook(send_me_to_gpu)

    if use_medvram:
        sd_model.model.register_forward_pre_hook(send_me_to_gpu)
    else:
        diff_model = sd_model.model.diffusion_model

        # the third remaining model is still too big for 4 GB, so we also do the same for its submodules
        # so that only one of them is on the GPU at a time
        stored = diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed
        diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = None, None, None, None
        sd_model.model.to(devices.device)
        diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = stored

        # install hooks for bits of the third model
        diff_model.time_embed.register_forward_pre_hook(send_me_to_gpu)
        for block in diff_model.input_blocks:
            block.register_forward_pre_hook(send_me_to_gpu)
        diff_model.middle_block.register_forward_pre_hook(send_me_to_gpu)
        for block in diff_model.output_blocks:
            block.register_forward_pre_hook(send_me_to_gpu)


def is_enabled(sd_model):
    return sd_model.lowvram
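
The pattern everything above relies on (a forward pre-hook that pulls a module onto the GPU and evicts whichever tracked module held it before) can be tried in isolation. Below is a minimal, self-contained sketch on a toy model; install_offload_hooks and the choice of hooked parts are illustrative names, not webui API.

import torch
import torch.nn as nn


def install_offload_hooks(parts, compute, storage):
    """Park every module in `parts` on `storage`; a forward pre-hook moves each
    one to `compute` just before it runs and evicts the previous occupant."""
    state = {"on_compute": None}

    def swap_in(module, _inputs):
        if state["on_compute"] is module:
            return
        if state["on_compute"] is not None:
            state["on_compute"].to(storage)
        module.to(compute)
        state["on_compute"] = module

    for part in parts:
        part.to(storage)
        part.register_forward_pre_hook(swap_in)


model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 1))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
install_offload_hooks([model[0], model[2]], compute=device, storage=torch.device("cpu"))
y = model(torch.ones(1, 8, device=device))  # each Linear sits on `device` only during its own forward

At most one hooked part occupies the compute device at any time, which is the same invariant send_me_to_gpu maintains for the full checkpoint.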