# GitHub Repository: hukaixuan19970627/yolov5_obb
# Path: blob/master/train.py
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Train a YOLOv5 model on a custom dataset

Usage:
    $ python path/to/train.py --data coco128.yaml --weights yolov5s.pt --img 640
"""
import argparse
import math
import os
import random
import sys
import time
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD, Adam, lr_scheduler
from tqdm import tqdm

FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

import val  # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.datasets import create_dataloader
from utils.downloads import attempt_download
from utils.general import (LOGGER, check_dataset, check_file, check_git_status, check_img_size, check_requirements,
                           check_suffix, check_yaml, colorstr, get_latest_run, increment_path, init_seeds,
                           intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle,
                           print_args, print_mutation, strip_optimizer)
from utils.loggers import Loggers
from utils.loggers.wandb.wandb_utils import check_wandb_resume
from utils.loss import ComputeLoss
from utils.metrics import fitness
from utils.plots import plot_evolve, plot_labels
from utils.torch_utils import EarlyStopping, ModelEMA, de_parallel, select_device, torch_distributed_zero_first

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # https://pytorch.org/docs/stable/elastic/run.html
RANK = int(os.getenv('RANK', -1))
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
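# Note: RANK defaults to -1 when the script is not launched with torch.distributed, so the
# "RANK in [-1, 0]" checks below select either the single-process case or the DDP main process,
# which handles logging, validation and checkpointing; WORLD_SIZE is the total number of processes.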


def train(hyp,  # path/to/hyp.yaml or hyp dictionary
          opt,
          device,
          callbacks
          ):
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze

    # Directories
    w = save_dir / 'weights'  # weights dir
    (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir
    last, best = w / 'last.pt', w / 'best.pt'

    # Hyperparameters
    if isinstance(hyp, str):
        with open(hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
    LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))

    # Save run settings
    if not evolve:
        with open(save_dir / 'hyp.yaml', 'w') as f:
            yaml.safe_dump(hyp, f, sort_keys=False)
        with open(save_dir / 'opt.yaml', 'w') as f:
            yaml.safe_dump(vars(opt), f, sort_keys=False)

    # Loggers
    data_dict = None
    if RANK in [-1, 0]:
        loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  # loggers instance
        if loggers.wandb:
            data_dict = loggers.wandb.data_dict
            if resume:
                weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp

        # Register actions
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))

    # Config
    plots = not evolve  # create plots
    cuda = device.type != 'cpu'
    init_seeds(1 + RANK)
    with torch_distributed_zero_first(LOCAL_RANK):
        data_dict = data_dict or check_dataset(data)  # check if None
    train_path, val_path = data_dict['train'], data_dict['val']
    nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
    names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
    is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt')  # COCO dataset

    # Model
    check_suffix(weights, '.pt')  # check weights
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(LOCAL_RANK):
            weights = attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
        exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
        csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32
        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(csd, strict=False)  # load
        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
    else:
        model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create

    # Freeze
    freeze = [f'model.{x}.' for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # layers to freeze
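    # e.g. --freeze 10 freezes 'model.0.' through 'model.9.' (the backbone, per the argparse help),
    # while --freeze 0 1 2 freezes exactly those layer indices.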
127
for k, v in model.named_parameters():
128
v.requires_grad = True # train all layers
129
if any(x in k for x in freeze):
130
LOGGER.info(f'freezing {k}')
131
v.requires_grad = False
132
133
# Image size
134
gs = max(int(model.stride.max()), 32) # grid size (max stride)
135
imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple
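    # The training resolution must be a multiple of the maximum stride gs, and floor=gs * 2
    # enforces a minimum of 2 * gs pixels; check_img_size adjusts imgsz to a valid multiple if needed.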

    # Batch size
    if RANK == -1 and batch_size == -1:  # single-GPU only, estimate best batch size
        batch_size = check_train_batch_size(model, imgsz)
        loggers.on_params_update({"batch_size": batch_size})

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    g0, g1, g2 = [], [], []  # optimizer parameter groups
    for v in model.modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
            g2.append(v.bias)
        if isinstance(v, nn.BatchNorm2d):  # weight (no decay)
            g0.append(v.weight)
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
            g1.append(v.weight)

    if opt.adam:
        optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']})  # add g1 with weight_decay
    optimizer.add_param_group({'params': g2})  # add g2 (biases)
    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
                f"{len(g1)} weight (with decay), {len(g0)} weight (no decay), {len(g2)} bias")
    del g0, g1, g2
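    # Summary of the groups built above: g0 holds BatchNorm weights (no weight decay), g1 holds
    # conv/linear weights (the only group with weight_decay applied), and g2 holds all biases (no decay).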

    # Scheduler
    if opt.linear_lr:
        lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear
    else:
        lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)
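    # lf maps the epoch index to an LR multiplier: the linear option decays it from 1.0 to hyp['lrf'],
    # one_cycle follows a cosine from 1.0 to hyp['lrf']; the effective lr is initial_lr * lf(epoch).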

    # EMA
    ema = ModelEMA(model) if RANK in [-1, 0] else None
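    # ModelEMA keeps an exponential moving average of the model weights on the main process;
    # the EMA weights are what get validated each epoch and saved into the checkpoints below.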

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # EMA
        if ema and ckpt.get('ema'):
            ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
            ema.updates = ckpt['updates']

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if resume:
            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
        if epochs < start_epoch:
            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, csd

    # DP mode
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        LOGGER.warning('WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n'
                       'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and RANK != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info('Using SyncBatchNorm()')

    # Trainloader
    train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, names, single_cls,
                                              hyp=hyp, augment=True, cache=opt.cache, rect=opt.rect, rank=LOCAL_RANK,
                                              workers=workers, image_weights=opt.image_weights, quad=opt.quad,
                                              prefix=colorstr('train: '), shuffle=True)
    mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max())  # max label class
    nb = len(train_loader)  # number of batches
    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'

    # Process 0
    if RANK in [-1, 0]:
        val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, names, single_cls,
                                       hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1,
                                       workers=workers, pad=0.5,
                                       prefix=colorstr('val: '))[0]

        if not resume:
            labels = np.concatenate(dataset.labels, 0)  # labels(array): (all_images_gt_num, [cls_id, poly])
            # c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, names, save_dir, imgsz)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
            model.half().float()  # pre-reduce anchor precision

        callbacks.run('on_pretrain_routine_end')

    # DDP mode
    if cuda and RANK != -1:
        model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)

    # Model attributes
    nl = de_parallel(model).model[-1].nl  # number of detection layers (to scale hyps)
    hyp['box'] *= 3 / nl  # scale to layers
    hyp['cls'] *= nc / 80 * 3 / nl  # scale to classes and layers
    hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl  # scale to image size and layers
    hyp['theta'] *= 3 / nl
    hyp['label_smoothing'] = opt.label_smoothing
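    # The default loss gains assume 3 detection layers, 80 classes and 640 px images, so box/cls/obj/theta
    # are rescaled by 3 / nl, nc / 80 and (imgsz / 640) ** 2 to keep the loss components on a comparable
    # scale for this dataset and resolution.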
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    last_opt_step = -1
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5:.95, val_loss(box, obj, cls, theta)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
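    # Mixed precision: forward passes run under amp.autocast below, and GradScaler scales the loss
    # before backward so FP16 gradients do not underflow, unscaling them again at optimizer step time.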
    stopper = EarlyStopping(patience=opt.patience)
    compute_loss = ComputeLoss(model)  # init loss class
    LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
                f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n'
                f"Logging results to {colorstr('bold', save_dir)}\n"
                f'Starting training for {epochs} epochs...')
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional, single-GPU only)
        if opt.image_weights:
            cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
            iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx

        # Update mosaic border (optional)
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        # mloss = torch.zeros(3, device=device)  # mean losses
        mloss = torch.zeros(4, device=device)  # mean losses
        if RANK != -1:
            train_loader.sampler.set_epoch(epoch)
        pbar = enumerate(train_loader)
        # LOGGER.info(('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
        LOGGER.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'theta', 'labels', 'img_size'))
        if RANK in [-1, 0]:
            pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])
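                # accumulate also ramps from 1 to nbs / batch_size over the warmup window, so early
                # optimizer steps are taken more frequently while the learning rate is still small.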

            # Multi-scale
            if opt.multi_scale and not opt.rect:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor, img (tensor): (b, 3, height, width)
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple) [h_new, w_new]
                    label_ratio = float(ns[0]) / imgs.shape[2]
                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
                    targets[:, 2:6] *= label_ratio  # targets (tensor): (n_targets, [img_index clsid cx cy l s theta gaussian_θ_labels])
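                    # Multi-scale note: a new size in [0.5 * imgsz, 1.5 * imgsz], rounded to a gs-multiple,
                    # is drawn per batch; only the pixel-space columns cx, cy, l, s of the oriented-box
                    # targets are rescaled, while the angle columns are left unchanged.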


            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                if RANK != -1:
                    loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni - last_opt_step >= accumulate:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)
                last_opt_step = ni

            # Log
            if RANK in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                # pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
                pbar.set_description(('%10s' * 2 + '%10.4g' * 6) % (
                    f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
                callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn)
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for loggers
        scheduler.step()

        if RANK in [-1, 0]:
            # mAP
            callbacks.run('on_train_epoch_end', epoch=epoch)
            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
            final_epoch = (epoch + 1 == epochs) or stopper.possible_stop
            if not noval or final_epoch:  # Calculate mAP
                results, maps, _ = val.run(data_dict,
                                           batch_size=batch_size // WORLD_SIZE * 2,
                                           imgsz=imgsz,
                                           model=ema.ema,
                                           single_cls=single_cls,
                                           dataloader=val_loader,
                                           save_dir=save_dir,
                                           plots=False,
                                           callbacks=callbacks,
                                           compute_loss=compute_loss)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5:.95]
            if fi > best_fitness:
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr
            callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi)

            # Save model
            if (not nosave) or (final_epoch and not evolve):  # if save
                ckpt = {'epoch': epoch,
                        'best_fitness': best_fitness,
                        'model': deepcopy(de_parallel(model)).half(),
                        'ema': deepcopy(ema.ema).half(),
                        'updates': ema.updates,
                        'optimizer': optimizer.state_dict(),
                        'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None,
                        'date': datetime.now().isoformat()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0):
                    torch.save(ckpt, w / f'epoch{epoch}.pt')
                del ckpt
                callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)

            # Stop Single-GPU
            if RANK == -1 and stopper(epoch=epoch, fitness=fi):
                break

            # Stop DDP TODO: known issues https://github.com/ultralytics/yolov5/pull/4576
            # stop = stopper(epoch=epoch, fitness=fi)
            # if RANK == 0:
            #    dist.broadcast_object_list([stop], 0)  # broadcast 'stop' to all ranks

        # Stop DDP
        # with torch_distributed_zero_first(RANK):
        # if stop:
        #    break  # must break all DDP ranks

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training -----------------------------------------------------------------------------------------------------
    if RANK in [-1, 0]:
        LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
        for f in last, best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                if f is best:
                    LOGGER.info(f'\nValidating {f}...')
                    results, _, _ = val.run(data_dict,
                                            batch_size=batch_size // WORLD_SIZE * 2,
                                            imgsz=imgsz,
                                            model=attempt_load(f, device).half(),
                                            iou_thres=0.65 if is_coco else 0.60,  # best pycocotools results at 0.65
                                            single_cls=single_cls,
                                            dataloader=val_loader,
                                            save_dir=save_dir,
                                            save_json=is_coco,
                                            verbose=True,
                                            plots=True,
                                            callbacks=callbacks,
                                            compute_loss=compute_loss)  # val best model with plots
                    if is_coco:
                        callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi)

        callbacks.run('on_train_end', last, best, plots, epoch, results)
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")

    torch.cuda.empty_cache()
    return results


def parse_opt(known=False):
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default=ROOT / 'weights/yolov5n.pt', help='initial weights path')
    parser.add_argument('--cfg', type=str, default='', help='model.yaml path')
    parser.add_argument('--data', type=str, default=ROOT / 'data/DroneVehicle_poly.yaml', help='dataset.yaml path')
    parser.add_argument('--hyp', type=str, default=ROOT / 'data/hyps/obb/hyp.finetune_dota.yaml', help='hyperparameters path')
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--batch-size', type=int, default=128, help='total batch size for all GPUs, -1 for autobatch')
    parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=840, help='train, val image size (pixels)')
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--noval', action='store_true', help='only validate final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
    parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
    parser.add_argument('--device', default='1', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
    parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--workers', type=int, default=8, help='max dataloader workers (per RANK in DDP mode)')
    parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')
    parser.add_argument('--name', default='exp', help='save to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--quad', action='store_true', help='quad dataloader')
    parser.add_argument('--linear-lr', action='store_true', help='linear LR')
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
    parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')
    parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2')
    parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')

    # Weights & Biases arguments
    parser.add_argument('--entity', default=None, help='W&B: Entity')
    parser.add_argument('--upload_dataset', nargs='?', const=True, default=False, help='W&B: Upload data, "val" option')
    parser.add_argument('--bbox_interval', type=int, default=-1, help='W&B: Set bounding-box image logging interval')
    parser.add_argument('--artifact_alias', type=str, default='latest', help='W&B: Version of dataset artifact to use')

    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt
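# Illustrative invocation using this repo's defaults above (adjust paths/device to your setup):
#   $ python train.py --data data/DroneVehicle_poly.yaml --weights weights/yolov5n.pt --img 840 --device 0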


def main(opt, callbacks=Callbacks()):
    # Checks
    if RANK in [-1, 0]:
        print_args(FILE.stem, opt)
        check_git_status()
        check_requirements(exclude=['thop'])

    # Resume
    if opt.resume and not check_wandb_resume(opt) and not opt.evolve:  # resume an interrupted run
        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
        with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f:
            opt = argparse.Namespace(**yaml.safe_load(f))  # replace
        opt.cfg, opt.weights, opt.resume = '', ckpt, True  # reinstate
        LOGGER.info(f'Resuming training from {ckpt}')
    else:
        opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \
            check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project)  # checks
        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
        if opt.evolve:
            opt.project = str(ROOT / 'runs/evolve')
            opt.exist_ok, opt.resume = opt.resume, False  # pass resume to exist_ok and disable resume
        opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))

    # DDP mode
    device = select_device(opt.device, batch_size=opt.batch_size)
    if LOCAL_RANK != -1:
        assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
        assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
        assert not opt.evolve, '--evolve argument is not compatible with DDP training'
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

    # Train
    if not opt.evolve:
        train(opt.hyp, opt, device, callbacks)
        if WORLD_SIZE > 1 and RANK == 0:
            LOGGER.info('Destroying process group... ')
            dist.destroy_process_group()

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                'box': (1, 0.02, 0.2),  # box loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
                'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
                'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
                'mixup': (1, 0.0, 1.0),  # image mixup (probability)
                'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)

        with open(opt.hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
            if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                hyp['anchors'] = 3
        opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir)  # only val/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv'
        if opt.bucket:
            os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {save_dir}')  # download evolve.csv if exists

        for _ in range(opt.evolve):  # generations to evolve
            if evolve_csv.exists():  # if evolve.csv exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1)
                n = min(5, len(x))  # number of previous results to consider
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min() + 1E-6  # weights (sum > 0)
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination

                # Mutate
                mp, s = 0.8, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([meta[k][0] for k in hyp.keys()])  # gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate
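                # Mutation note: each hyperparameter is multiplied by a factor clipped to [0.3, 3.0], drawn
                # with probability mp and scaled by its gain meta[k][0]; the x[i + 7] offset skips the result
                # columns that print_mutation writes before the hyperparameter values in each evolve.csv row.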

                # Constrain to limits
                for k, v in meta.items():
                    hyp[k] = max(hyp[k], v[1])  # lower limit
                    hyp[k] = min(hyp[k], v[2])  # upper limit
                    hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation
            results = train(hyp.copy(), opt, device, callbacks)

            # Write mutation results
            print_mutation(results, hyp.copy(), save_dir, opt.bucket)

        # Plot results
        plot_evolve(evolve_csv)
        LOGGER.info(f'Hyperparameter evolution finished\n'
                    f"Results saved to {colorstr('bold', save_dir)}\n"
                    f'Use best hyperparameters example: $ python train.py --hyp {evolve_yaml}')


def run(**kwargs):
    # Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
    opt = parse_opt(True)
    for k, v in kwargs.items():
        setattr(opt, k, v)
    main(opt)


if __name__ == "__main__":
    opt = parse_opt()
    main(opt)