# GitHub Repository: hukaixuan19970627/yolov5_obb
# Path: blob/master/train.py
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Train a YOLOv5 model on a custom dataset

Usage:
    $ python path/to/train.py --data coco128.yaml --weights yolov5s.pt --img 640
"""
import argparse
import math
import os
import random
import sys
import time
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD, Adam, lr_scheduler
from tqdm import tqdm

FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

import val  # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.datasets import create_dataloader
from utils.downloads import attempt_download
from utils.general import (LOGGER, check_dataset, check_file, check_git_status, check_img_size, check_requirements,
                           check_suffix, check_yaml, colorstr, get_latest_run, increment_path, init_seeds,
                           intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle,
                           print_args, print_mutation, strip_optimizer)
from utils.loggers import Loggers
from utils.loggers.wandb.wandb_utils import check_wandb_resume
from utils.loss import ComputeLoss
from utils.metrics import fitness
from utils.plots import plot_evolve, plot_labels
from utils.torch_utils import EarlyStopping, ModelEMA, de_parallel, select_device, torch_distributed_zero_first

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # https://pytorch.org/docs/stable/elastic/run.html
RANK = int(os.getenv('RANK', -1))
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
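# Note: RANK defaults to -1 when the script is not launched with torch.distributed, so the
# "RANK in [-1, 0]" checks below select either the single-process case or the DDP main process,
# which handles logging, validation and checkpointing; WORLD_SIZE is the total number of processes.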


def train(hyp,  # path/to/hyp.yaml or hyp dictionary
          opt,
          device,
          callbacks
          ):
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze

    # Directories
    w = save_dir / 'weights'  # weights dir
    (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir
    last, best = w / 'last.pt', w / 'best.pt'

    # Hyperparameters
    if isinstance(hyp, str):
        with open(hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
    LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))

    # Save run settings
    if not evolve:
        with open(save_dir / 'hyp.yaml', 'w') as f:
            yaml.safe_dump(hyp, f, sort_keys=False)
        with open(save_dir / 'opt.yaml', 'w') as f:
            yaml.safe_dump(vars(opt), f, sort_keys=False)

    # Loggers
    data_dict = None
    if RANK in [-1, 0]:
        loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  # loggers instance
        if loggers.wandb:
            data_dict = loggers.wandb.data_dict
            if resume:
                weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp

        # Register actions
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))

    # Config
    plots = not evolve  # create plots
    cuda = device.type != 'cpu'
    init_seeds(1 + RANK)
    with torch_distributed_zero_first(LOCAL_RANK):
        data_dict = data_dict or check_dataset(data)  # check if None
    train_path, val_path = data_dict['train'], data_dict['val']
    nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
    is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt')  # COCO dataset

    # Model
    check_suffix(weights, '.pt')  # check weights
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(LOCAL_RANK):
            weights = attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
        exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
        csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32
        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(csd, strict=False)  # load
        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
    else:
        model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create

    # Freeze
    freeze = [f'model.{x}.' for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # layers to freeze
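    # e.g. --freeze 10 freezes 'model.0.' through 'model.9.' (the backbone, per the argparse help),
    # while --freeze 0 1 2 freezes exactly those layer indices.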
127
for k, v in model.named_parameters():
128
v.requires_grad = True # train all layers
129
if any(x in k for x in freeze):
130
LOGGER.info(f'freezing {k}')
131
v.requires_grad = False
132
133
# Image size
134
gs = max(int(model.stride.max()), 32) # grid size (max stride)
135
imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple
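    # The training resolution must be a multiple of the maximum stride gs, and floor=gs * 2
    # enforces a minimum of 2 * gs pixels; check_img_size adjusts imgsz to a valid multiple if needed.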

    # Batch size
    if RANK == -1 and batch_size == -1:  # single-GPU only, estimate best batch size
        batch_size = check_train_batch_size(model, imgsz)
        loggers.on_params_update({"batch_size": batch_size})

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    g0, g1, g2 = [], [], []  # optimizer parameter groups
    for v in model.modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
            g2.append(v.bias)
        if isinstance(v, nn.BatchNorm2d):  # weight (no decay)
            g0.append(v.weight)
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
            g1.append(v.weight)

    if opt.adam:
        optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']})  # add g1 with weight_decay
    optimizer.add_param_group({'params': g2})  # add g2 (biases)
    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
                f"{len(g1)} weight (with decay), {len(g0)} weight (no decay), {len(g2)} bias")
    del g0, g1, g2
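    # Summary of the groups built above: g0 holds BatchNorm weights (no weight decay), g1 holds
    # conv/linear weights (the only group with weight_decay applied), and g2 holds all biases (no decay).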

    # Scheduler
    if opt.linear_lr:
        lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear
    else:
        lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)
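    # lf maps the epoch index to an LR multiplier: the linear option decays it from 1.0 to hyp['lrf'],
    # one_cycle follows a cosine from 1.0 to hyp['lrf']; the effective lr is initial_lr * lf(epoch).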

    # EMA
    ema = ModelEMA(model) if RANK in [-1, 0] else None
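    # ModelEMA keeps an exponential moving average of the model weights on the main process;
    # the EMA weights are what get validated each epoch and saved into the checkpoints below.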

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # EMA
        if ema and ckpt.get('ema'):
            ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
            ema.updates = ckpt['updates']

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if resume:
            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
        if epochs < start_epoch:
            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, csd

    # DP mode
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        LOGGER.warning('WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n'
                       'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and RANK != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info('Using SyncBatchNorm()')

    # Trainloader
    train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, names, single_cls,
                                              hyp=hyp, augment=True, cache=opt.cache, rect=opt.rect, rank=LOCAL_RANK,
                                              workers=workers, image_weights=opt.image_weights, quad=opt.quad,
                                              prefix=colorstr('train: '), shuffle=True)
    mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max())  # max label class
    nb = len(train_loader)  # number of batches
    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'

    # Process 0
    if RANK in [-1, 0]:
        val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, names, single_cls,
                                       hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1,
                                       workers=workers, pad=0.5,
                                       prefix=colorstr('val: '))[0]

        if not resume:
            labels = np.concatenate(dataset.labels, 0)  # labels(array): (all_images_gt_num, [cls_id, poly])
            # c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, names, save_dir, imgsz)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
            model.half().float()  # pre-reduce anchor precision

        callbacks.run('on_pretrain_routine_end')

    # DDP mode
    if cuda and RANK != -1:
        model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)

    # Model attributes
    nl = de_parallel(model).model[-1].nl  # number of detection layers (to scale hyps)
    hyp['box'] *= 3 / nl  # scale to layers
    hyp['cls'] *= nc / 80 * 3 / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl  # scale to image size and layers
    hyp['theta'] *= 3 / nl
    hyp['label_smoothing'] = opt.label_smoothing
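    # The default loss gains assume 3 detection layers, 80 classes and 640 px images, so box/cls/obj/theta
    # are rescaled by 3 / nl, nc / 80 and (imgsz / 640) ** 2 to keep the loss components on a comparable
    # scale for this dataset and resolution.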
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    last_opt_step = -1
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5:.95, val_loss(box, obj, cls, theta)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
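    # Mixed precision: forward passes run under amp.autocast below, and GradScaler scales the loss
    # before backward so FP16 gradients do not underflow, unscaling them again at optimizer step time.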
    stopper = EarlyStopping(patience=opt.patience)
    compute_loss = ComputeLoss(model)  # init loss class
    LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
                f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n'
                f"Logging results to {colorstr('bold', save_dir)}\n"
                f'Starting training for {epochs} epochs...')
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional, single-GPU only)
        if opt.image_weights:
            cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
            iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx

        # Update mosaic border (optional)
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        # mloss = torch.zeros(3, device=device)  # mean losses
        mloss = torch.zeros(4, device=device)  # mean losses
        if RANK != -1:
            train_loader.sampler.set_epoch(epoch)
        pbar = enumerate(train_loader)
        # LOGGER.info(('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
        LOGGER.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'theta', 'labels', 'img_size'))
        if RANK in [-1, 0]:
            pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])
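                # accumulate also ramps from 1 to nbs / batch_size over the warmup window, so early
                # optimizer steps are taken more frequently while the learning rate is still small.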

            # Multi-scale
            if opt.multi_scale and not opt.rect:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor, img (tensor): (b, 3, height, width)
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple) [h_new, w_new]
                    label_ratio = float(ns[0]) / imgs.shape[2]
                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
                    targets[:, 2:6] *= label_ratio  # targets (tensor): (n_targets, [img_index clsid cx cy l s theta gaussian_θ_labels])
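                    # Multi-scale note: a new size in [0.5 * imgsz, 1.5 * imgsz], rounded to a gs-multiple,
                    # is drawn per batch; only the pixel-space columns cx, cy, l, s of the oriented-box
                    # targets are rescaled, while the angle columns are left unchanged.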


            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                if RANK != -1:
                    loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni - last_opt_step >= accumulate:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)
                last_opt_step = ni

            # Log
            if RANK in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                # pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
                pbar.set_description(('%10s' * 2 + '%10.4g' * 6) % (
                    f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
                callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn)
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for loggers
        scheduler.step()

        if RANK in [-1, 0]:
            # mAP
            callbacks.run('on_train_epoch_end', epoch=epoch)
            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
            final_epoch = (epoch + 1 == epochs) or stopper.possible_stop
            if not noval or final_epoch:  # Calculate mAP
                results, maps, _ = val.run(data_dict,
                                           batch_size=batch_size // WORLD_SIZE * 2,
                                           imgsz=imgsz,
                                           model=ema.ema,
                                           single_cls=single_cls,
                                           dataloader=val_loader,
                                           save_dir=save_dir,
                                           plots=False,
                                           callbacks=callbacks,
                                           compute_loss=compute_loss)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5:.95]
            if fi > best_fitness:
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr
            callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi)

            # Save model
            if (not nosave) or (final_epoch and not evolve):  # if save
                ckpt = {'epoch': epoch,
                        'best_fitness': best_fitness,
                        'model': deepcopy(de_parallel(model)).half(),
                        'ema': deepcopy(ema.ema).half(),
                        'updates': ema.updates,
                        'optimizer': optimizer.state_dict(),
                        'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None,
                        'date': datetime.now().isoformat()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0):
                    torch.save(ckpt, w / f'epoch{epoch}.pt')
                del ckpt
                callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)

            # Stop Single-GPU
            if RANK == -1 and stopper(epoch=epoch, fitness=fi):
                break

            # Stop DDP TODO: known issues https://github.com/ultralytics/yolov5/pull/4576
            # stop = stopper(epoch=epoch, fitness=fi)
            # if RANK == 0:
            #    dist.broadcast_object_list([stop], 0)  # broadcast 'stop' to all ranks

        # Stop DDP
        # with torch_distributed_zero_first(RANK):
        # if stop:
        #    break  # must break all DDP ranks

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training -----------------------------------------------------------------------------------------------------
    if RANK in [-1, 0]:
        LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
        for f in last, best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                if f is best:
                    LOGGER.info(f'\nValidating {f}...')
                    results, _, _ = val.run(data_dict,
                                            batch_size=batch_size // WORLD_SIZE * 2,
                                            imgsz=imgsz,
                                            model=attempt_load(f, device).half(),
                                            iou_thres=0.65 if is_coco else 0.60,  # best pycocotools results at 0.65
                                            single_cls=single_cls,
                                            dataloader=val_loader,
                                            save_dir=save_dir,
                                            save_json=is_coco,
                                            verbose=True,
                                            plots=True,
                                            callbacks=callbacks,
                                            compute_loss=compute_loss)  # val best model with plots
                    if is_coco:
                        callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi)

        callbacks.run('on_train_end', last, best, plots, epoch, results)
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")

    torch.cuda.empty_cache()
    return results


def parse_opt(known=False):
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default=ROOT / 'weights/yolov5n.pt', help='initial weights path')
    parser.add_argument('--cfg', type=str, default='', help='model.yaml path')
    parser.add_argument('--data', type=str, default=ROOT / 'data/DroneVehicle_poly.yaml', help='dataset.yaml path')
    parser.add_argument('--hyp', type=str, default=ROOT / 'data/hyps/obb/hyp.finetune_dota.yaml', help='hyperparameters path')
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--batch-size', type=int, default=128, help='total batch size for all GPUs, -1 for autobatch')
    parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=840, help='train, val image size (pixels)')
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--noval', action='store_true', help='only validate final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
    parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
    parser.add_argument('--device', default='1', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
    parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--workers', type=int, default=8, help='max dataloader workers (per RANK in DDP mode)')
    parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')
    parser.add_argument('--name', default='exp', help='save to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--quad', action='store_true', help='quad dataloader')
    parser.add_argument('--linear-lr', action='store_true', help='linear LR')
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
    parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')
    parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2')
    parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')

    # Weights & Biases arguments
    parser.add_argument('--entity', default=None, help='W&B: Entity')
    parser.add_argument('--upload_dataset', nargs='?', const=True, default=False, help='W&B: Upload data, "val" option')
    parser.add_argument('--bbox_interval', type=int, default=-1, help='W&B: Set bounding-box image logging interval')
    parser.add_argument('--artifact_alias', type=str, default='latest', help='W&B: Version of dataset artifact to use')

    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt
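# Illustrative invocation using this repo's defaults above (adjust paths/device to your setup):
#   $ python train.py --data data/DroneVehicle_poly.yaml --weights weights/yolov5n.pt --img 840 --device 0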


def main(opt, callbacks=Callbacks()):
    # Checks
    if RANK in [-1, 0]:
        print_args(FILE.stem, opt)
        check_git_status()
        check_requirements(exclude=['thop'])

    # Resume
    if opt.resume and not check_wandb_resume(opt) and not opt.evolve:  # resume an interrupted run
        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
        with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f:
            opt = argparse.Namespace(**yaml.safe_load(f))  # replace
        opt.cfg, opt.weights, opt.resume = '', ckpt, True  # reinstate
        LOGGER.info(f'Resuming training from {ckpt}')
    else:
        opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \
            check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project)  # checks
        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
        if opt.evolve:
            opt.project = str(ROOT / 'runs/evolve')
            opt.exist_ok, opt.resume = opt.resume, False  # pass resume to exist_ok and disable resume
        opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))

    # DDP mode
    device = select_device(opt.device, batch_size=opt.batch_size)
    if LOCAL_RANK != -1:
        assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
        assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
        assert not opt.evolve, '--evolve argument is not compatible with DDP training'
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

    # Train
    if not opt.evolve:
        train(opt.hyp, opt, device, callbacks)
        if WORLD_SIZE > 1 and RANK == 0:
            LOGGER.info('Destroying process group... ')
            dist.destroy_process_group()

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                'box': (1, 0.02, 0.2),  # box loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
                'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
                'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
                'mixup': (1, 0.0, 1.0),  # image mixup (probability)
                'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)

        with open(opt.hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
            if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                hyp['anchors'] = 3
        opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir)  # only val/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv'
        if opt.bucket:
            os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {save_dir}')  # download evolve.csv if exists

        for _ in range(opt.evolve):  # generations to evolve
            if evolve_csv.exists():  # if evolve.csv exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1)
                n = min(5, len(x))  # number of previous results to consider
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min() + 1E-6  # weights (sum > 0)
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination

                # Mutate
                mp, s = 0.8, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([meta[k][0] for k in hyp.keys()])  # gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate
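                # Mutation note: each hyperparameter is multiplied by a factor clipped to [0.3, 3.0], drawn
                # with probability mp and scaled by its gain meta[k][0]; the x[i + 7] offset skips the result
                # columns that print_mutation writes before the hyperparameter values in each evolve.csv row.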

                # Constrain to limits
                for k, v in meta.items():
                    hyp[k] = max(hyp[k], v[1])  # lower limit
                    hyp[k] = min(hyp[k], v[2])  # upper limit
                    hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation
            results = train(hyp.copy(), opt, device, callbacks)

            # Write mutation results
            print_mutation(results, hyp.copy(), save_dir, opt.bucket)

        # Plot results
        plot_evolve(evolve_csv)
        LOGGER.info(f'Hyperparameter evolution finished\n'
                    f"Results saved to {colorstr('bold', save_dir)}\n"
                    f'Use best hyperparameters example: $ python train.py --hyp {evolve_yaml}')


def run(**kwargs):
    # Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
    opt = parse_opt(True)
    for k, v in kwargs.items():
        setattr(opt, k, v)
    main(opt)


if __name__ == "__main__":
    opt = parse_opt()
    main(opt)