# Source: GitHub repository jantic/deoldify, path blob/master/fastai/basic_train.py
# (captured from a CoCalc file view; 840 views at capture time).
1
"Provides basic training and validation with `Learner`"
2
from .torch_core import *
3
from .basic_data import *
4
from .callback import *
5
from .data_block import *
6
from .utils.ipython import gpu_mem_restore
7
import inspect
8
from fastprogress.fastprogress import format_time, IN_NOTEBOOK
9
from time import time
10
from fastai.sixel import plot_sixel
11

12
# Public names exported by `from <this module> import *`.
__all__ = ['Learner', 'LearnerCallback', 'Recorder', 'RecordOnCPU', 'fit', 'loss_batch', 'train_epoch', 'validate',
           'get_preds', 'load_learner']

# Library-wide defaults read by `Learner` below.
defaults.lr = slice(3e-3)           # default learning rate of `Learner.fit`, also used by `freeze_to` and `load`
defaults.wd = 1e-2                  # default value of the `Learner.wd` field
defaults.extra_callbacks    = None  # callback instances appended to every `Learner.fit` call
defaults.extra_callback_fns = None  # callback factories called with the learner in every `Learner.fit` call
19

20
def loss_batch(model:nn.Module, xb:Tensor, yb:Tensor, loss_func:OptLossFunc=None, opt:OptOptimizer=None,
               cb_handler:Optional[CallbackHandler]=None, count:Optional[List[int]]=None,
               batch_multiplier:int=1)->Tuple[Union[Tensor,int,float,str]]:
    """Calculate loss and metrics for a batch, call out to callbacks as necessary.

    Without `loss_func` the detached model output and first target are returned.
    Otherwise the loss is divided by `batch_multiplier`; when `opt` is given it is
    back-propagated, and the optimizer steps once `count[0]` reaches zero.
    `count` is a one-element list used as a mutable countdown shared with the caller;
    it is reset to `batch_multiplier` after every optimizer step.
    Returns the detached loss on the CPU.
    """
    cb_handler = ifnone(cb_handler, CallbackHandler())
    # The default used to be a shared `[1]` literal that this function mutates, so calls
    # without `opt` drove it negative and later default-count calls never stepped.
    if count is None: count = [1]
    if not is_listy(xb): xb = [xb]
    if not is_listy(yb): yb = [yb]
    out = model(*xb)

    if not loss_func: return to_detach(out), yb[0].detach()
    out = cb_handler.on_loss_begin(out)
    loss = loss_func(out, *yb)/batch_multiplier
    count[0] -= 1

    if opt is not None:
        loss,skip_bwd = cb_handler.on_backward_begin(loss)
        if not skip_bwd: loss.backward()
        # Only step once the countdown is exhausted (gradient accumulation).
        if count[0] == 0:
            if not cb_handler.on_backward_end(): opt.step()
            if not cb_handler.on_step_end():     opt.zero_grad()
            count[0] = batch_multiplier

    return loss.detach().cpu()
42

43
def get_preds(model:nn.Module, dl:DataLoader, pbar:Optional[PBar]=None, cb_handler:Optional[CallbackHandler]=None,
              activ:nn.Module=None, loss_func:OptLossFunc=None, n_batch:Optional[int]=None) -> List[Tensor]:
    """Tuple of predictions and targets, and optional losses (if `loss_func`) using `dl`, max batches `n_batch`.

    The per-batch results of `validate` are concatenated and moved to the CPU. `activ`,
    when given, is applied to the predictions after the optional loss computation.
    """
    batches = validate(model, dl, cb_handler=cb_handler, pbar=pbar, average=False, n_batch=n_batch)
    res = [torch.cat(o).cpu() for o in zip(*batches)]
    if loss_func is not None:
        with NoneReduceOnCPU(loss_func) as lf: res.append(lf(res[0], res[1]))
    if activ is not None: res[0] = activ(res[0])
    return res
52

53
def validate(model:nn.Module, dl:DataLoader, loss_func:OptLossFunc=None, cb_handler:Optional[CallbackHandler]=None,
             pbar:Optional[PBar]=None, average=True, n_batch:Optional[int]=None)->Iterator[Tuple[Union[Tensor,int],...]]:
    """Calculate `loss_func` of `model` on `dl` in evaluation mode.

    With `average` the per-batch losses are averaged, weighted by batch size;
    otherwise the list of per-batch results of `loss_batch` is returned.
    Iteration stops early after `n_batch` batches or when `cb_handler.on_batch_end` asks to.
    """
    model.eval()
    with torch.no_grad():
        val_losses,nums = [],[]
        if cb_handler: cb_handler.set_dl(dl)
        for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None)):
            if cb_handler: xb, yb = cb_handler.on_batch_begin(xb, yb, train=False)
            val_loss = loss_batch(model, xb, yb, loss_func, cb_handler=cb_handler)
            val_losses.append(val_loss)
            if not is_listy(yb): yb = [yb]
            # Batch size, taken from the first target, is the weight of this batch in the average.
            nums.append(first_el(yb).shape[0])
            if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break
            if n_batch and (len(nums)>=n_batch): break
        nums = np.array(nums, dtype=np.float32)
        if average: return (to_np(torch.stack(val_losses)) * nums).sum() / nums.sum()
        else:       return val_losses
71

72
def train_epoch(model:nn.Module, dl:DataLoader, opt:optim.Optimizer, loss_func:LossFunction)->None:
    "Simple training of `model` for 1 epoch of `dl` using optim `opt` and loss function `loss_func`."
    model.train()
    for xb,yb in dl:
        loss = loss_func(model(xb), yb)
        loss.backward()
        opt.step()
        # Clear the gradients so the next batch does not accumulate onto this one.
        opt.zero_grad()
80

81
@dataclass
class BasicLearner():
    "Minimal bundle of what the module-level `fit` reads: `model`, `loss_func`, `opt` and `data`."
    model:nn.Module
    loss_func:LossFunction
    opt:optim.Optimizer
    data:DataBunch
87

88
def fit(epochs:int, learn:BasicLearner, callbacks:Optional[CallbackList]=None, metrics:OptMetrics=None, batch_multiplier:int=1)->None:
    """Fit the `model` on `data` and learn using `loss_func` and `opt`.

    Runs `epochs` passes over `learn.data.train_dl`, validating after each one unless the
    callback handler sets `skip_validate` or the data has no validation set. The optimizer
    steps once every `batch_multiplier` batches. `on_train_end` is always called, receiving
    the raised exception (or `False`); exceptions are re-raised.
    """
    assert len(learn.data.train_dl) != 0, f"""Your training dataloader is empty, can't train a model.
        Use a smaller batch size (batch size={learn.data.train_dl.batch_size} for {len(learn.data.train_dl.dataset)} elements)."""
    cb_handler = CallbackHandler(callbacks, metrics)
    pbar = master_bar(range(epochs))
    cb_handler.on_train_begin(epochs, pbar=pbar, metrics=metrics)

    exception=False
    try:
        for epoch in pbar:
            learn.model.train()
            cb_handler.set_dl(learn.data.train_dl)
            cb_handler.on_epoch_begin()
            # Countdown shared with `loss_batch`; it is restarted at every epoch.
            count = [batch_multiplier]
            for xb,yb in progress_bar(learn.data.train_dl, parent=pbar):
                xb, yb = cb_handler.on_batch_begin(xb, yb)
                loss = loss_batch(learn.model, xb, yb, learn.loss_func, learn.opt, cb_handler,
                                  count=count, batch_multiplier=batch_multiplier)
                if cb_handler.on_batch_end(loss): break

            if not cb_handler.skip_validate and not learn.data.empty_val:
                val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
                                    cb_handler=cb_handler, pbar=pbar)
            else: val_loss=None
            if cb_handler.on_epoch_end(val_loss): break
    except Exception as e:
        exception = e
        raise
    finally: cb_handler.on_train_end(exception)
117

118
loss_func_name2activ = {'cross_entropy_loss': F.softmax, 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,
119
    'kl_div_loss': torch.exp, 'bce_with_logits_loss': torch.sigmoid, 'cross_entropy': F.softmax,
120
    'kl_div': torch.exp, 'binary_cross_entropy_with_logits': torch.sigmoid,
121
}
122

123
def _loss_func_name2activ(name:str, axis:int=-1):
124
    res = loss_func_name2activ[name]
125
    if res == F.softmax: res = partial(F.softmax, dim=axis)
126
    return res
127

128
def _loss_func2activ(loss_func):
129
    if getattr(loss_func,'keywords',None):
130
        if not loss_func.keywords.get('log_input', True): return
131
    axis = getattr(loss_func, 'axis', -1)
132
    # flattened loss
133
    loss_func = getattr(loss_func, 'func', loss_func)
134
    # could have a partial inside flattened loss! Duplicate on purpose.
135
    loss_func = getattr(loss_func, 'func', loss_func)
136
    cls_name = camel2snake(loss_func.__class__.__name__)
137
    if cls_name == 'mix_up_loss':
138
        loss_func = loss_func.crit
139
        cls_name = camel2snake(loss_func.__class__.__name__)
140
    if cls_name in loss_func_name2activ:
141
        if cls_name == 'poisson_nll_loss' and (not getattr(loss_func, 'log_input', True)): return
142
        return _loss_func_name2activ(cls_name, axis)
143
    if getattr(loss_func,'__name__','') in loss_func_name2activ:
144
        return _loss_func_name2activ(loss_func.__name__, axis)
145
    return noop
146

147
@dataclass
class Learner():
    "Trainer for `model` using `data` to minimize `loss_func` with optimizer `opt_func`."
    data:DataBunch
    model:nn.Module
    opt_func:Callable=AdamW                 # optimizer factory handed to `OptimWrapper.create`
    loss_func:Callable=None                 # falls back to `data.loss_func` in `__post_init__`
    metrics:Collection[Callable]=None
    true_wd:bool=True                       # forwarded to `OptimWrapper.create`
    bn_wd:bool=True                         # forwarded to `OptimWrapper.create`
    wd:Floats=defaults.wd                   # weight decay used by `fit` when none is passed
    train_bn:bool=True                      # when set, `freeze_to` leaves `bn_types` layers trainable
    path:str = None                         # falls back to `data.path` in `__post_init__`
    model_dir:PathOrStr = 'models'          # sub-directory of `path` used by `save`/`load`/`purge`
    callback_fns:Collection[Callable]=None  # factories called with the learner at every `fit`
    callbacks:Collection[Callback]=field(default_factory=list)
    layer_groups:Collection[nn.Module]=None # defaults to one group holding the flattened model
    add_time:bool=True                      # forwarded to `Recorder`
    silent:bool=None                        # forwarded to `Recorder`; `None` means `defaults.silent`
166
    def __post_init__(self)->None:
        "Setup path, loss function, metrics, layer groups and callbacks, and move the model to the data's device."
        self.path = Path(ifnone(self.path, self.data.path))
        self.model = self.model.to(self.data.device)
        self.loss_func = self.loss_func or self.data.loss_func
        self.metrics=listify(self.metrics)
        if not self.layer_groups: self.layer_groups = [nn.Sequential(*flatten_model(self.model))]
        self.callbacks = listify(self.callbacks)
        if self.silent is None: self.silent = defaults.silent
        # `Recorder` always comes first among the callback factories.
        self.callback_fns = [partial(Recorder, add_time=self.add_time, silent=self.silent)] + listify(self.callback_fns)
176

177
    def init(self, init):
        "Apply the initializer `init` to the model via `apply_init`."
        apply_init(self.model, init)
178

179
    def _test_writeable_path(self):
        "Create `self.path/self.model_dir` if needed and check a temporary file can be created there."
        path = self.path/self.model_dir
        try:
            path.mkdir(parents=True, exist_ok=True)
            tmp_file = get_tmp_file(path)
        except OSError as e:
            # Re-raised with a hint on how to fix it; the original traceback is suppressed.
            raise Exception(f"{e}\nCan't write to '{path}', set `learn.model_dir` attribute in Learner to a full libpath path that is writable") from None
        os.remove(tmp_file)
187

188
    def lr_range(self, lr:Union[float,slice])->np.ndarray:
        """Build differential learning rates from `lr`.

        A non-slice `lr` is returned unchanged. `slice(start, stop)` is spread over the layer
        groups with `even_mults`; `slice(stop)` gives `stop/10` to every group but the last,
        which gets `stop`.
        """
        if not isinstance(lr,slice): return lr
        n_groups = len(self.layer_groups)
        if lr.start: res = even_mults(lr.start, lr.stop, n_groups)
        else: res = [lr.stop/10]*(n_groups-1) + [lr.stop]
        return np.array(res)
194

195
    def fit(self, epochs:int, lr:Union[Floats,slice]=defaults.lr,
            wd:Floats=None, callbacks:Collection[Callback]=None, batch_multiplier:int=1)->None:
        "Fit the model on this learner with `lr` learning rate, `wd` weight decay for `epochs` with `callbacks`."
        lr = self.lr_range(lr)
        if wd is None: wd = self.wd
        # Build the optimizer on first use; afterwards only refresh its hyper-parameters.
        if not getattr(self, 'opt', False): self.create_opt(lr, wd)
        else: self.opt.lr,self.opt.wd = lr,wd
        # Callback factories are instantiated afresh for every call.
        cb_fns = self.callback_fns + listify(defaults.extra_callback_fns)
        callbacks = [cb(self) for cb in cb_fns] + listify(callbacks)
        if defaults.extra_callbacks is not None: callbacks += defaults.extra_callbacks
        fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks, batch_multiplier=batch_multiplier)
205

206
    def create_opt(self, lr:Floats, wd:Floats=0.)->None:
        "Create optimizer with `lr` learning rate and `wd` weight decay, stored as `self.opt`."
        self.opt = OptimWrapper.create(self.opt_func, lr, self.layer_groups, wd=wd,
                                       true_wd=self.true_wd, bn_wd=self.bn_wd)
209

210
    def split(self, split_on:SplitFuncOrIdxList)->'Learner':
        "Split the model at `split_on` (called on the model first when it is a callable) and return `self`."
        if isinstance(split_on,Callable): split_on = split_on(self.model)
        self.layer_groups = split_model(self.model, split_on)
        return self
215

216
    def freeze_to(self, n:int)->None:
        "Freeze layers up to layer group `n`, unfreeze the rest and recreate the optimizer."
        for g in self.layer_groups[:n]:
            for l in g:
                # With `train_bn`, layers of `bn_types` are left untouched in frozen groups.
                if not self.train_bn or not isinstance(l, bn_types): requires_grad(l, False)
        for g in self.layer_groups[n:]: requires_grad(g, True)
        self.create_opt(defaults.lr)
223

224
    def freeze(self)->None:
        "Freeze up to last layer group. Requires more than one layer group."
        assert(len(self.layer_groups)>1)
        self.freeze_to(-1)
228

229
    def unfreeze(self):
        "Unfreeze entire model."
        self.freeze_to(0)
232

233
    def export(self, file:PathLikeOrBinaryStream='export.pkl', destroy=False):
        "Export the state of the `Learner` in `self.path/file`. `file` can be file-like (file or buffer)"
        if rank_distrib(): return # don't save if slave proc
        args = ['opt_func', 'loss_func', 'metrics', 'true_wd', 'bn_wd', 'wd', 'train_bn', 'model_dir', 'callback_fns']
        state = {a:getattr(self,a) for a in args}
        state['cb_state'] = {cb.__class__:cb.get_state() for cb in self.callbacks}
        #layer_groups -> need to find a way
        #TO SEE: do we save model structure and weights separately?
        with ModelOnCPU(self.model) as m:
            state['model'] = m
            # Carry the normalization settings along when the data has any.
            xtra = dict(normalize=self.data.norm.keywords) if getattr(self.data, 'norm', False) else {}
            state['data'] = self.data.valid_ds.get_state(**xtra)
            state['cls'] = self.__class__
            try_save(state, self.path, file)
        if destroy: self.destroy()
248

249
    def save(self, file:PathLikeOrBinaryStream=None, return_path:bool=False, with_opt:bool=True):
        """Save model and optimizer state (if `with_opt`) with `file` to `self.model_dir`. `file` can be file-like (file or buffer)

        A path-like `file` is written to `self.path/self.model_dir/{file}.pth`. Without an
        optimizer, only the model state dict is saved. Returns the target if `return_path`.
        """
        if is_pathlike(file): self._test_writeable_path()
        if rank_distrib(): return # don't save if slave proc
        target = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file
        if not hasattr(self, 'opt'): with_opt=False
        model_state = get_model(self.model).state_dict()
        state = {'model': model_state, 'opt':self.opt.state_dict()} if with_opt else model_state
        torch.save(state, target)
        if return_path: return target
259

260
    def dl(self, ds_type:DatasetType=DatasetType.Valid):
        "Return DataLoader for DatasetType `ds_type`."
        return self.data.dl(ds_type)
263

264
    def load(self, file:PathLikeOrBinaryStream=None, device:torch.device=None, strict:bool=True,
             with_opt:bool=None, purge:bool=True, remove_module:bool=False):
        """Load model and optimizer state (if `with_opt`) `file` from `self.model_dir` using `device`. `file` can be file-like (file or buffer)

        `with_opt=None` loads the optimizer state when the file has one. A failure to restore
        the optimizer state is reported as a warning, not raised. Returns `self`.
        """
        if purge: self.purge(clear_opt=ifnone(with_opt, False))
        if device is None: device = self.data.device
        elif isinstance(device, int): device = torch.device('cuda', device)
        source = self.path/self.model_dir/f'{file}.pth' if is_pathlike(file) else file
        # NOTE: `torch.load` unpickles its input; only load files from a trusted source.
        state = torch.load(source, map_location=device)
        if set(state.keys()) == {'model', 'opt'}:
            model_state = state['model']
            if remove_module: model_state = remove_module_load(model_state)
            get_model(self.model).load_state_dict(model_state, strict=strict)
            if ifnone(with_opt,True):
                if not hasattr(self, 'opt'): self.create_opt(defaults.lr, self.wd)
                # Best effort: the model weights are already loaded, so warn rather than fail.
                try: self.opt.load_state_dict(state['opt'])
                except Exception as e: warn(f"Wasn't able to properly load the optimizer state: {e}")
        else:
            if with_opt: warn("Saved file doesn't contain an optimizer state.")
            if remove_module: state = remove_module_load(state)
            get_model(self.model).load_state_dict(state, strict=strict)
        del state
        gc.collect()
        return self
287

288
    def destroy(self):
        "Free the Learner internals, leaving just an empty shell that consumes no memory"

        class ZombieLearner(Learner):
            msg = "this object has been destroyed"
            # Any attribute lookup that would otherwise fail prints the message and yields None.
            def __getattr__(self, item):    print(ZombieLearner.msg); return None
            def destroyed(*args, **kwargs): print(ZombieLearner.msg)

        attrs = [k for k in self.__dict__.keys() if not k.startswith("__")]
        for a in attrs: delattr(self, a)
        # the instance methods can still be called, but will just give a message
        methods = [k for k in dir(self) if not k.startswith("__") and inspect.isroutine(getattr(self, k))]
        for m in methods: setattr(self, m, ZombieLearner.destroyed)
        self.__class__ = ZombieLearner
        gc.collect()
        print("this Learner object self-destroyed - it still exists, but no longer usable")
304

305
    def purge(self, clear_opt:bool=True):
        """Purge the `Learner` of all cached attributes to release some GPU memory.

        The persistent state is written to a temporary file, every attribute except `data`
        and `recorder` is deleted, and the state is loaded back. The optimizer is restored
        only when `clear_opt` is false. Returns `self`.
        """
        self._test_writeable_path()
        attrs_all = [k for k in self.__dict__.keys() if not k.startswith("__")]
        attrs_pkl = ['bn_wd', 'callback_fns', 'layer_groups', 'loss_func', 'metrics', 'model',
                     'model_dir', 'opt_func', 'path', 'train_bn', 'true_wd', 'wd']
        # +callbacks: get pickled too, but not directly
        attrs_keep = ['data', 'recorder']
        attrs_del = list(set(attrs_all) - set(attrs_keep))
        state = {a:getattr(self, a) for a in attrs_pkl}
        state['cb_state'] = {cb.__class__:cb.get_state() for cb in self.callbacks}
        if hasattr(self, 'opt'): state['opt'] = self.opt.get_state()

        tmp_file = get_tmp_file(self.path/self.model_dir)
        # Close the handle before the file is read back and removed.
        with open(tmp_file, 'wb') as f: torch.save(state, f)
        for a in attrs_del: delattr(self, a)
        gc.collect()
        state = torch.load(tmp_file)
        os.remove(tmp_file)

        for a in attrs_pkl: setattr(self, a, state[a])
        cb_state = state.pop('cb_state')
        self.callbacks = [load_callback(c,s, self) for c,s in cb_state.items()]
        if not clear_opt and 'opt' in state:
            try: self.opt = OptimWrapper.load_with_state_and_layer_group(state['opt'], self.layer_groups)
            except Exception: warn("Wasn't able to properly load the optimizer state again.")
        del state
        gc.collect()
        return self
334

335
    def get_preds(self, ds_type:DatasetType=DatasetType.Valid, with_loss:bool=False, n_batch:Optional[int]=None,
                  pbar:Optional[PBar]=None) -> List[Tensor]:
        "Return predictions and targets on `ds_type` dataset, plus the losses if `with_loss`."
        lf = self.loss_func if with_loss else None
        # Inside the class body `get_preds` resolves to the module-level function.
        return get_preds(self.model, self.dl(ds_type), cb_handler=CallbackHandler(self.callbacks),
                         activ=_loss_func2activ(self.loss_func), loss_func=lf, n_batch=n_batch, pbar=pbar)
341

342
    def pred_batch(self, ds_type:DatasetType=DatasetType.Valid, batch:Tuple=None, reconstruct:bool=False, with_dropout:bool=False) -> List[Tensor]:
        """Return output of the model on one batch from `ds_type` dataset.

        Uses `batch` when given, otherwise one batch drawn from the data. With `with_dropout`,
        modules selected by `apply_dropout` are switched back to training mode. The model's
        training flag is restored afterwards, even on error. With `reconstruct`, the
        predictions are turned into items by the dataset's `reconstruct`.
        """
        with torch.no_grad():
            training = self.model.training
            try:
                self.model.train(False)
                if batch is not None: xb,yb = batch
                else: xb,yb = self.data.one_batch(ds_type, detach=False, denorm=False)
                cb_handler = CallbackHandler(self.callbacks)
                xb,yb = cb_handler.on_batch_begin(xb,yb, train=False)
                model = self.model.eval()
                if with_dropout: model = model.apply(self.apply_dropout)
                preds = loss_batch(model, xb, yb, cb_handler=cb_handler)
                # `_loss_func2activ` returns None for losses configured with `log_input=False`.
                activ = _loss_func2activ(self.loss_func)
                res = preds[0] if activ is None else activ(preds[0])
            finally:
                self.model.train(training)
            if not reconstruct: return res
            res = res.detach().cpu()
            ds = self.dl(ds_type).dataset
            norm = getattr(self.data, 'norm', False)
            if norm and norm.keywords.get('do_y',False):
                res = self.data.denorm(res, do_x=True)
            return [ds.reconstruct(o) for o in res]
364

365
    def backward(self, item):
        "Pass `item` through the model and computes the gradient. Useful if `backward_hooks` are attached."
        xb,yb = self.data.one_item(item)
        # `FakeOptimizer` makes `loss_batch` run the backward pass without updating any weight.
        loss = loss_batch(self.model.eval(), xb, yb, self.loss_func, opt=FakeOptimizer(),
                          cb_handler=CallbackHandler(self.callbacks))
        return loss
371

372
    def predict(self, item:ItemBase, return_x:bool=False, batch_first:bool=True, with_dropout:bool=False, **kwargs):
        """Return predicted class, label and probabilities for `item`.

        `kwargs` are forwarded to the target's `analyze_pred`. Returns `(y, pred, raw_pred)`,
        preceded by the reconstructed input when `return_x` is set.
        """
        batch = self.data.one_item(item)
        res = self.pred_batch(batch=batch, with_dropout=with_dropout)
        raw_pred,x = grab_idx(res,0,batch_first=batch_first),batch[0]
        norm = getattr(self.data,'norm',False)
        if norm:
            x = self.data.denorm(x)
            if norm.keywords.get('do_y',False): raw_pred = self.data.denorm(raw_pred)
        ds = self.data.single_ds
        pred = ds.y.analyze_pred(raw_pred, **kwargs)
        x = ds.x.reconstruct(grab_idx(x, 0))
        # Some `reconstruct` implementations take the input `x` as extra context.
        y = ds.y.reconstruct(pred, x) if has_arg(ds.y.reconstruct, 'x') else ds.y.reconstruct(pred)
        return (x, y, pred, raw_pred) if return_x else (y, pred, raw_pred)
386

387
    def validate(self, dl=None, callbacks=None, metrics=None):
        "Validate on `dl` (default: validation set) with potential `callbacks` and `metrics`; return the last metrics."
        dl = ifnone(dl, self.data.valid_dl)
        metrics = ifnone(metrics, self.metrics)
        cb_handler = CallbackHandler(self.callbacks + ifnone(callbacks, []), metrics)
        cb_handler.on_epoch_begin()
        # Inside the class body `validate` resolves to the module-level function.
        val_metrics = validate(self.model, dl, self.loss_func, cb_handler)
        cb_handler.on_epoch_end(val_metrics)
        return cb_handler.state_dict['last_metrics']
396

397
    def show_results(self, ds_type=DatasetType.Valid, rows:int=5, **kwargs):
        "Show `rows` result of predictions on `ds_type` dataset."
        #TODO: get rid of has_arg x and split_kwargs_by_func if possible
        #TODO: simplify this and refactor with pred_batch(...reconstruct=True)
        n_items = rows ** 2 if self.data.train_ds.x._square_show_res else rows
        if self.dl(ds_type).batch_size < n_items: n_items = self.dl(ds_type).batch_size
        ds = self.dl(ds_type).dataset
        # Temporarily record the batch seen by the model; always remove the recorder again.
        self.callbacks.append(RecordOnCPU())
        try: preds = self.pred_batch(ds_type)
        finally: *self.callbacks,rec_cpu = self.callbacks
        x,y = rec_cpu.input,rec_cpu.target
        norm = getattr(self.data,'norm',False)
        if norm:
            x = self.data.denorm(x)
            if norm.keywords.get('do_y',False):
                y     = self.data.denorm(y, do_x=True)
                preds = self.data.denorm(preds, do_x=True)
        analyze_kwargs,kwargs = split_kwargs_by_func(kwargs, ds.y.analyze_pred)
        preds = [ds.y.analyze_pred(grab_idx(preds, i), **analyze_kwargs) for i in range(n_items)]
        xs = [ds.x.reconstruct(grab_idx(x, i)) for i in range(n_items)]
        if has_arg(ds.y.reconstruct, 'x'):
            ys = [ds.y.reconstruct(grab_idx(y, i), x=x) for i,x in enumerate(xs)]
            zs = [ds.y.reconstruct(z, x=x) for z,x in zip(preds,xs)]
        else :
            ys = [ds.y.reconstruct(grab_idx(y, i)) for i in range(n_items)]
            zs = [ds.y.reconstruct(z) for z in preds]
        ds.x.show_xyzs(xs, ys, zs, **kwargs)
424

425
    def apply_dropout(self, m):
        "If the class name of module `m` contains 'dropout' (any case), switch it to .train() mode."
        if 'dropout' in m.__class__.__name__.lower(): m.train()
428

429
    def predict_with_mc_dropout(self, item:ItemBase, with_dropout:bool=True, n_times=10, **kwargs):
        "Make predictions with dropout turned on for n_times (default 10); `kwargs` are forwarded to `predict`."
        # `kwargs` used to be accepted but silently dropped.
        return [self.predict(item, with_dropout=with_dropout, **kwargs) for _ in range(n_times)]
432

433
class RecordOnCPU(Callback):
    "Store the `input` and `target` going through the model on the CPU."
    def on_batch_begin(self, last_input,last_target,**kwargs):
        "Keep CPU copies of the current batch as `self.input` and `self.target`."
        self.input,self.target = to_cpu(last_input),to_cpu(last_target)
437

438
class LearnerCallback(Callback):
    "Base class for creating callbacks for a `Learner`."
    def __init__(self, learn):
        # Only a weak reference is kept, so the callback does not keep the learner alive.
        self._learn = weakref.ref(learn)
        self.exclude,self.not_min = ['_learn'],[]
        # Register the callback on the learner under its snake-cased class name.
        setattr(self.learn, self.cb_name, self)

    # Attributes missing on the callback are looked up on the learner.
    def __getattr__(self,k): return getattr(self.learn, k)
    def __setstate__(self,data:Any): self.__dict__.update(data)

    @property
    def learn(self) -> Learner: return self._learn()
    @learn.setter
    def learn(self, learn: Learner) -> None: self._learn = weakref.ref(learn)

    @property
    def cb_name(self): return camel2snake(self.__class__.__name__)
455

456
class Recorder(LearnerCallback):
    "A `LearnerCallback` that records epoch, loss, opt and metric data during training."
    # NOTE(review): presumably the sort key used to order callbacks; the consumer is not in this file.
    _order=-10
459
    def __init__(self, learn:Learner, add_time:bool=True, silent:bool=False):
        "Bind to `learn`, keeping its optimizer and training dataloader; `add_time` adds a time column, `silent` mutes output."
        super().__init__(learn)
        self.opt = self.learn.opt
        self.train_dl = self.learn.data.train_dl
        self.no_val,self.silent,self.add_time = False,silent,add_time
464

465
    def on_train_begin(self, pbar:PBar, metrics_names:Collection[str], **kwargs:Any)->None:
        "Initialize recording status at beginning of training."
        self.pbar = pbar
        self.names = ['epoch', 'train_loss'] if self.no_val else ['epoch', 'train_loss', 'valid_loss']
        self.metrics_names = metrics_names
        # NOTE: `+=` extends the list object passed in as `metrics_names` in place.
        if hasattr(self, '_added_met_names'): self.metrics_names += self._added_met_names
        self.names += self.metrics_names
        if self.add_time: self.names.append('time')
        # Write the header row of the stats table.
        if not self.silent: self.pbar.write(self.names, table=True)
        self.losses,self.val_losses,self.lrs,self.moms,self.metrics,self.nb_batches = [],[],[],[],[],[]
475

476
    def on_epoch_begin(self, **kwargs:Any)->None:
        "Remember when the epoch started (only if `add_time`)."
        if self.add_time: self.start_epoch = time()
478

479
    def on_batch_begin(self, train, **kwargs:Any)->None:
        "Record learning rate and momentum at beginning of batch (training batches only)."
        if not train: return
        self.lrs.append(self.opt.lr)
        self.moms.append(self.opt.mom)
484

485
    def on_backward_begin(self, smooth_loss:Tensor, **kwargs:Any)->None:
        "Record the loss before any other callback has a chance to modify it."
        self.losses.append(smooth_loss)
        # Show the current loss next to the inner progress bar, when there is one.
        if self.pbar is not None and hasattr(self.pbar,'child'):
            self.pbar.child.comment = f'{smooth_loss:.4f}'
490

491
    def on_epoch_end(self, epoch:int, num_batch:int, smooth_loss:Tensor,
                     last_metrics:MetricsList=None, **kwargs:Any)->bool:
        """Save epoch info: num_batch, smooth_loss, metrics.

        `last_metrics[0]` is recorded as the validation loss and the remaining entries as
        metrics. Without `last_metrics` a `None` placeholder is printed for the validation
        loss, unless `no_val` is set.
        """
        # The default used to be the `MetricsList` type alias itself (`=` instead of `:`).
        self.nb_batches.append(num_batch)
        if last_metrics is not None: self.val_losses.append(last_metrics[0])
        else: last_metrics = [] if self.no_val else [None]
        if len(last_metrics) > 1: self.metrics.append(last_metrics[1:])
        self.format_stats([epoch, smooth_loss] + last_metrics)
499

500
    def format_stats(self, stats:TensorOrNumList)->None:
        "Format stats before printing: `None` as '#na#', ints as-is, anything else with 6 decimals."
        str_stats = []
        # Zipping with `self.names` caps the row at the number of columns.
        for _,stat in zip(self.names,stats):
            str_stats.append('#na#' if stat is None else str(stat) if isinstance(stat, int) else f'{stat:.6f}')
        if self.add_time: str_stats.append(format_time(time() - self.start_epoch))
        if not self.silent: self.pbar.write(str_stats, table=True)
507

508
    def add_metric_names(self, names):
        "Add `names` to the inner metric names."
        # Store a copy: the caller's list used to be kept and extended in place by later calls.
        if hasattr(self, '_added_met_names'): self._added_met_names += names
        else:                                 self._added_met_names  = list(names)
512

513
    def plot_lr(self, show_moms=False, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:
        """Plot learning rate, `show_moms` to include momentum.

        `skip_start`/`skip_end` trim iterations from either end. The figure is returned when
        `return_fig` (default `defaults.return_fig`) is truthy; otherwise it is passed to
        `plot_sixel` when not running in a notebook.
        """
        lrs = self._split_list(self.lrs, skip_start, skip_end)
        iterations = self._split_list(range_of(self.lrs), skip_start, skip_end)
        if show_moms:
            moms = self._split_list(self.moms, skip_start, skip_end)
            fig, axs = plt.subplots(1,2, figsize=(12,4))
            axs[0].plot(iterations, lrs)
            axs[0].set_xlabel('Iterations')
            axs[0].set_ylabel('Learning Rate')
            axs[1].plot(iterations, moms)
            axs[1].set_xlabel('Iterations')
            axs[1].set_ylabel('Momentum')
        else:
            fig, ax = plt.subplots()
            ax.plot(iterations, lrs)
            ax.set_xlabel('Iterations')
            ax.set_ylabel('Learning Rate')
        if ifnone(return_fig, defaults.return_fig): return fig
        if not IN_NOTEBOOK: plot_sixel(fig)
533

534
    @staticmethod
535
    def smoothen_by_spline(xs, ys, **kwargs):
536
        xs = np.arange(len(ys))
537
        spl = scipy.interpolate.UnivariateSpline(xs, ys, **kwargs)
538
        ys = spl(xs)
539
        return ys
540

541
    def plot(self, skip_start:int=10, skip_end:int=5, suggestion:bool=False, return_fig:bool=None,
             **kwargs)->Optional[plt.Figure]:
        """Plot learning rate and losses, trimmed between `skip_start` and `skip_end`. Optionally plot and return min gradient

        Losses are spline-smoothed when `k` is among `kwargs`. With `suggestion`, the learning
        rate at the steepest loss decrease is marked and stored as `self.min_grad_lr`.
        """
        lrs = self._split_list(self.lrs, skip_start, skip_end)
        losses = self._split_list(self.losses, skip_start, skip_end)
        losses = [x.item() for x in losses]
        if 'k' in kwargs: losses = self.smoothen_by_spline(lrs, losses, **kwargs)
        fig, ax = plt.subplots(1,1)
        ax.plot(lrs, losses)
        ax.set_ylabel("Loss")
        ax.set_xlabel("Learning Rate")
        ax.set_xscale('log')
        ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e'))
        if suggestion:
            try: mg = (np.gradient(np.array(losses))).argmin()
            except Exception:
                print("Failed to compute the gradients, there might not be enough points.")
                return
            print(f"Min numerical gradient: {lrs[mg]:.2E}")
            ax.plot(lrs[mg],losses[mg],markersize=10,marker='o',color='red')
            self.min_grad_lr = lrs[mg]
            ml = np.argmin(losses)
            print(f"Min loss divided by 10: {lrs[ml]/10:.2E}")
        if ifnone(return_fig, defaults.return_fig): return fig
        if not IN_NOTEBOOK: plot_sixel(fig)
566

567
    def plot_losses(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:
        "Plot training and validation losses."
        fig, ax = plt.subplots(1,1)
        losses = self._split_list(self.losses, skip_start, skip_end)
        iterations = self._split_list(range_of(self.losses), skip_start, skip_end)
        ax.plot(iterations, losses, label='Train')
        # Validation losses are placed at the cumulative batch count of each epoch.
        val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end)
        val_losses = self._split_list_val(self.val_losses, skip_start, skip_end)
        ax.plot(val_iter, val_losses, label='Validation')
        ax.set_ylabel('Loss')
        ax.set_xlabel('Batches processed')
        ax.legend()
        if ifnone(return_fig, defaults.return_fig): return fig
        if not IN_NOTEBOOK: plot_sixel(fig)
581

582
    def plot_metrics(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:
        "Plot metrics collected during training, one subplot per metric."
        assert len(self.metrics) != 0, "There are no metrics to plot."
        n_metrics = len(self.metrics[0])
        fig, axes = plt.subplots(n_metrics,1,figsize=(6, 4*n_metrics))
        val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end)
        # With a single metric the Axes object is used directly and needs wrapping.
        axes = axes.flatten() if n_metrics != 1 else [axes]
        for i, ax in enumerate(axes):
            values = [met[i] for met in self.metrics]
            values = self._split_list_val(values, skip_start, skip_end)
            ax.plot(val_iter, values)
            ax.set_ylabel(str(self.metrics_names[i]))
            ax.set_xlabel('Batches processed')
        if ifnone(return_fig, defaults.return_fig): return fig
        if not IN_NOTEBOOK: plot_sixel(fig)
596

597
    def _split_list(self, vals:Collection[float], skip_start:int, skip_end:int):
598
        return vals[skip_start:-skip_end] if skip_end > 0 else vals[skip_start:]
599

600
    def _split_list_val(self, vals:Collection[float], skip_start:int, skip_end:int):
601
        val_iter = np.cumsum(self.nb_batches)
602
        start_val = (val_iter - skip_start >= 0).nonzero()[0].min()
603
        end_val = (val_iter[-1] - val_iter - skip_end >= 0).nonzero()[0].max()+1
604
        return vals[start_val:end_val] if skip_end > 0 else vals[start_val:]
605

606
class FakeOptimizer():
    "Optimizer stand-in whose `step` and `zero_grad` do nothing."
    def step(self): pass
    def zero_grad(self): pass
609

610
def load_callback(class_func, state, learn:Learner):
    "Rebuild a callback of type `class_func` from `state`, binding it to `learn` if it is a `LearnerCallback`."
    # Entries matching the constructor's parameters are passed to it; the rest become attributes.
    init_kwargs, others = split_kwargs_by_func(state, class_func.__init__)
    if issubclass(class_func, LearnerCallback): res = class_func(learn, **init_kwargs)
    else: res = class_func(**init_kwargs)
    for k,v in others.items(): setattr(res, k, v)
    return res
615

616
def load_learner(path:PathOrStr, file:PathLikeOrBinaryStream='export.pkl', test:ItemList=None, **db_kwargs):
    """Load a `Learner` object saved with `Learner.export` in `path/file` with empty data, optionally add `test` and load on `cpu`. `file` can be file-like (file or buffer)

    `db_kwargs` are forwarded to `databunch`. Tensors are mapped to the CPU when
    `defaults.device` is the CPU.
    """
    source = Path(path)/file if is_pathlike(file) else file
    # NOTE: `torch.load` unpickles its input; only load exports from a trusted source.
    state = torch.load(source, map_location='cpu') if defaults.device == torch.device('cpu') else torch.load(source)
    model = state.pop('model')
    src = LabelLists.load_state(path, state.pop('data'))
    if test is not None: src.add_test(test)
    data = src.databunch(**db_kwargs)
    cb_state = state.pop('cb_state')
    clas_func = state.pop('cls')
    res = clas_func(data, model, **state)
    res.callback_fns = state['callback_fns'] #to avoid duplicates
    res.callbacks = [load_callback(c,s, res) for c,s in cb_state.items()]
    return res
630

631
# Product | Resources | Company  (site navigation captured with the page, not part of the module)