GitHub Repository: hukaixuan19970627/yolov5_obb
Path: blob/master/utils/aws/resume.py
# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py

import os
import sys
from pathlib import Path

import torch
import yaml

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
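# Note: putting ROOT on sys.path makes repo-root packages importable here; the
# relative `train.py` in the commands below still assumes this script is launched
# from the YOLOv5 root, as the usage line above does.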

port = 0  # --master_port; incremented once per DDP job so each gets a unique port
path = Path('').resolve()  # search from the current working directory
for last in path.rglob('*/**/last.pt'):  # every last.pt checkpoint below cwd
    ckpt = torch.load(last, map_location='cpu')  # CPU is enough to inspect the checkpoint
    if ckpt['optimizer'] is None:
        continue  # optimizer state was stripped, i.e. training already finished

    # Load opt.yaml
    with open(last.parent.parent / 'opt.yaml', errors='ignore') as f:
        opt = yaml.safe_load(f)
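    # Illustrative layout: last.pt is saved under <run>/weights/, so last.parent.parent
    # is the run directory, e.g. runs/train/exp/weights/last.pt -> runs/train/exp/opt.yaml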

    # Get device count
    d = opt['device'].split(',') if opt['device'] else []  # devices; [] when --device was not set
    nd = len(d)  # number of devices
    ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1)  # distributed data parallel
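    # e.g. opt['device'] == '0,1' -> nd == 2 -> resume with DDP;
    #      opt['device'] == ''    -> nd == 0 -> DDP only if more than one CUDA device is visible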

    if ddp:  # multi-GPU
        port += 1
        cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
    else:  # single-GPU
        cmd = f'python train.py --resume {last}'
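    # Example of an emitted command (for the first DDP run found; path illustrative):
    #   python -m torch.distributed.run --nproc_per_node 2 --master_port 1 \
    #       train.py --resume runs/train/exp/weights/last.pt > /dev/null 2>&1 &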

    cmd += ' > /dev/null 2>&1 &'  # redirect output to /dev/null and run in the background
    print(cmd)
    os.system(cmd)