# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Train a YOLOv5 model on a custom dataset.

Models and datasets download automatically from the latest YOLOv5 release.
Models: https://github.com/ultralytics/yolov5/tree/master/models
Datasets: https://github.com/ultralytics/yolov5/tree/master/data
Tutorial: https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data

Usage:
    $ python path/to/train.py --data coco128.yaml --weights yolov5s.pt --img 640  # from pretrained (RECOMMENDED)
    $ python path/to/train.py --data coco128.yaml --weights '' --cfg yolov5s.yaml --img 640  # from scratch
"""
import argparse
import math
import os
import random
import sys
import time
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD, Adam, AdamW, lr_scheduler
from tqdm.auto import tqdm
FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative
import val  # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.datasets import create_dataloader
from utils.downloads import attempt_download
from utils.general import (LOGGER, check_dataset, check_file, check_git_status, check_img_size, check_requirements,
                           check_suffix, check_yaml, colorstr, get_latest_run, increment_path, init_seeds,
                           intersect_dicts, is_ascii, labels_to_class_weights, labels_to_image_weights, methods,
                           one_cycle, print_args, print_mutation, strip_optimizer)
from utils.loggers import Loggers
from utils.loggers.wandb.wandb_utils import check_wandb_resume
from utils.loss import ComputeLoss
from utils.metrics import fitness
from utils.plots import check_font, plot_evolve, plot_labels
from utils.torch_utils import EarlyStopping, ModelEMA, de_parallel, select_device, torch_distributed_zero_first
LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # https://pytorch.org/docs/stable/elastic/run.html
RANK = int(os.getenv('RANK', -1))
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))


def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or a hyp dictionary
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze
    callbacks.run('on_pretrain_routine_start')
    # Directories
    w = save_dir / 'weights'  # weights save directory
    (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir
    last, best = w / 'last.pt', w / 'best.pt'

    # Hyperparameters
    if isinstance(hyp, str):  # hyp passed as path/to/hyp.yaml
        with open(hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # parse the YAML file into a hyps dict
    LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))

    # Save run settings (hyp and opt)
    if not evolve:
        with open(save_dir / 'hyp.yaml', 'w') as f:
            yaml.safe_dump(hyp, f, sort_keys=False)
        with open(save_dir / 'opt.yaml', 'w') as f:
            yaml.safe_dump(vars(opt), f, sort_keys=False)
    # Loggers
    data_dict = None
    if RANK in [-1, 0]:
        loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  # loggers instance
        if loggers.wandb:
            data_dict = loggers.wandb.data_dict
            if resume:
                weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size

        # Register actions
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))
    # Config
    plots = not evolve and not opt.noplots  # create plots
    cuda = device.type != 'cpu'
    init_seeds(1 + RANK)  # initialize random seeds
    with torch_distributed_zero_first(LOCAL_RANK):  # load the dataset config
        data_dict = data_dict or check_dataset(data)  # check if None
    if not is_ascii(data_dict['names']):  # non-latin labels, i.e. asian, arabic, cyrillic
        check_font('Arial.Unicode.ttf', progress=True)
    train_path, val_path = data_dict['train'], data_dict['val']  # train and val set paths
    nc = 1 if single_cls else int(data_dict['nc'])  # number of classes (1 if opt.single_cls is set)
    names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names ('item' if opt.single_cls is set)
    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
    is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt')  # COCO dataset
    # Model
    check_suffix(weights, '.pt')  # check weights
    pretrained = weights.endswith('.pt')  # whether the weights file suffix is .pt, i.e. whether to start from pretrained weights
    if pretrained:  # load a pretrained model
        with torch_distributed_zero_first(LOCAL_RANK):
            weights = attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location='cpu')  # load checkpoint to CPU to avoid CUDA memory leak
        '''
        The model can be created either from opt.cfg or from ckpt['model'].yaml.
        The difference is whether this run is a resume: on resume, opt.cfg is set to '',
        so the model is built from ckpt['model'].yaml. This also decides whether the
        anchor keys are excluded below: checkpoints store their anchors, so if a user
        defines custom anchors and then resumes, the checkpoint's COCO-based anchors
        would otherwise overwrite them. intersect_dicts handles this by skipping the
        keys listed in exclude.
        '''
        model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create model from opt.cfg or ckpt['model'].yaml
        exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
        csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32
        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect checkpoint and model state_dicts
        model.load_state_dict(csd, strict=False)  # load the pretrained weights
        # Report how many checkpoint entries were transferred into the created model;
        # with a custom cfg/anchors, two entries fewer are loaded (anchors, anchor_grid)
        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
    else:
        # create a new model and train from scratch; ch is the number of input image channels
        model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
    # Freeze
    """
    Layers are frozen by listing their names: any parameter whose name matches the freeze list
    gets requires_grad = False. The authors do not recommend freezing layers, since in their
    experiments frozen layers did not yield better performance. requires_grad is first set to
    True for all parameters so that the optimizer parameter grouping below works correctly.
    """
    freeze = [f'model.{x}.' for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # layers to freeze
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):  # if this layer is in the freeze list, disable its gradients
            LOGGER.info(f'freezing {k}')
            v.requires_grad = False
    # Image size
    gs = max(int(model.stride.max()), 32)  # grid size (max stride)
    imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2)  # verify that imgsz is a multiple of the max stride gs

    # Batch size
    if RANK == -1 and batch_size == -1:  # single-GPU only, estimate best batch size
        batch_size = check_train_batch_size(model, imgsz)
        loggers.on_params_update({"batch_size": batch_size})
    # Optimizer
    # nbs is the nominal batch size: e.g. with the default opt.batch_size of 16 and nbs of 64,
    # gradients are accumulated over 64 / 16 = 4 batches before each model update,
    # effectively enlarging the batch size
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay to the effective batch size
    LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")
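    # Worked example of the scaling above (comments only; the 0.0005 value assumes the
    # default weight_decay in hyp.scratch-low.yaml):
    #   batch_size=16  -> accumulate = max(round(64 / 16), 1) = 4
    #                     weight_decay = 0.0005 * 16 * 4 / 64 = 0.0005  (unchanged while batch_size * accumulate == nbs)
    #   batch_size=128 -> accumulate = 1, weight_decay = 0.0005 * 128 / 64 = 0.001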
    g = [], [], []  # optimizer parameter groups
    bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k)  # normalization layers, i.e. BatchNorm2d()
    # Split the parameters into three groups for optimization:
    # g[0] conv/linear weights (decayed), g[1] normalization weights (no decay), g[2] biases (no decay)
    for v in model.modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
            g[2].append(v.bias)
        if isinstance(v, bn):  # weight (no decay), i.e. BatchNorm weights
            g[1].append(v.weight)
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
            g[0].append(v.weight)

    # Select the optimizer; g[2] (biases) is handed to the constructor
    if opt.optimizer == 'Adam':
        optimizer = Adam(g[2], lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    elif opt.optimizer == 'AdamW':
        optimizer = AdamW(g[2], lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = SGD(g[2], lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': g[0], 'weight_decay': hyp['weight_decay']})  # add g0 with weight_decay
    optimizer.add_param_group({'params': g[1]})  # add g1 (BatchNorm2d weights, no decay)
    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
                f"{len(g[1])} weight (no decay), {len(g[0])} weight, {len(g[2])} bias")
    del g
    # Scheduler (learning-rate decay)
    if opt.cos_lr:
        lf = one_cycle(1, hyp['lrf'], epochs)  # cosine annealing 1 -> hyp['lrf']
    else:
        lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)
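    # Sketch of the two lf shapes (comments only; one_cycle lives in utils.general, so this mirrors
    # it rather than defining it). Both map epoch x in [0, epochs] to a multiplier on each group's
    # initial lr:
    #   cosine: lf(x) = (1 - cos(pi * x / epochs)) / 2 * (lrf - 1) + 1   # 1 -> lrf along a half-cosine
    #   linear: lf(x) = (1 - x / epochs) * (1 - lrf) + lrf               # 1 -> lrf along a straight line
    #   e.g. lrf=0.01, epochs=300: lf(0)=1.0, lf(150)=0.505, lf(300)=0.01 for both schedules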
    # EMA (exponential moving average of the model weights)
    ema = ModelEMA(model) if RANK in [-1, 0] else None
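    # How the EMA evolves, as a sketch (comments only; utils.torch_utils.ModelEMA is the real thing,
    # with defaults around decay=0.9999 and tau=2000). After each optimizer step:
    #   d = decay * (1 - exp(-updates / tau))      # ramps from 0 toward decay early in training
    #   ema_param = d * ema_param + (1 - d) * model_param
    # Validation and checkpointing below use ema.ema, the smoothed copy of the model.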
    # Resume: initialize the starting epoch and the best fitness so far.
    # best_fitness is the weighted sum of [precision, recall, mAP@0.5, mAP@0.5:0.95]
    # with coefficients [0.0, 0.0, 0.1, 0.9]; best.pt is saved whenever it improves.
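    # A sketch of that fitness metric (comments only; the real function is utils.metrics.fitness):
    #   def fitness(x):                   # x: (n, 4+) array of [P, R, mAP@0.5, mAP@0.5:0.95, ...] rows
    #       w = [0.0, 0.0, 0.1, 0.9]      # relative weights
    #       return (x[:, :4] * w).sum(1)  # mAP@0.5:0.95 dominates the score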
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # EMA
        if ema and ckpt.get('ema'):
            ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
            ema.updates = ckpt['updates']

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if resume:
            # guard against resuming a run whose training has already finished
            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
        # If the new epochs setting is smaller than the epochs already trained, treat it as the
        # number of additional fine-tuning epochs rather than the total
        if epochs < start_epoch:
            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
            epochs += ckpt['epoch']  # finetune additional epochs
        del ckpt, csd
    # DP mode: DataParallel, single-machine multi-GPU only. RANK is the process index; RANK == -1
    # with more than one visible GPU triggers DP. DP is generally discouraged because DDP performs
    # better; with RANK == -1 and a single GPU, training is not distributed at all.
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        LOGGER.warning('WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n'
                       'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm: synchronize batch-norm statistics across GPUs
    if opt.sync_bn and cuda and RANK != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info('Using SyncBatchNorm()')
    opt.cache = 'val'  # user override: cache only the val set (the train loader below skips caching when opt.cache == 'val')
    # Trainloader (training-set dataloader)
    train_loader, dataset = create_dataloader(train_path,
                                              imgsz,
                                              batch_size // WORLD_SIZE,
                                              gs,
                                              single_cls,
                                              hyp=hyp,
                                              augment=True,
                                              cache=None if opt.cache == 'val' else opt.cache,
                                              rect=opt.rect,
                                              rank=LOCAL_RANK,
                                              workers=workers,
                                              image_weights=opt.image_weights,
                                              quad=opt.quad,
                                              prefix=colorstr('train: '),
                                              shuffle=True)
    # The largest class id in the labels must be smaller than the number of classes
    mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max())  # max label class
    nb = len(train_loader)  # number of batches
    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'
    # Process 0 (val loader)
    if RANK in [-1, 0]:
        val_loader = create_dataloader(val_path,
                                       imgsz,
                                       batch_size // WORLD_SIZE * 2,
                                       gs,
                                       single_cls,
                                       hyp=hyp,
                                       cache=None if noval else opt.cache,
                                       rect=True,
                                       rank=-1,
                                       workers=workers * 2,
                                       pad=0.5,
                                       prefix=colorstr('val: '))[0]

        if not resume:
            labels = np.concatenate(dataset.labels, 0)
            # c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, names, save_dir)

            # Anchors
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
            model.half().float()  # pre-reduce anchor precision

        callbacks.run('on_pretrain_routine_end')
    # DDP mode
    if cuda and RANK != -1:
        model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)

    # Model attributes
    nl = de_parallel(model).model[-1].nl  # number of detection layers (to scale hyps)
    hyp['box'] *= 3 / nl  # scale box loss gain to the number of layers
    hyp['cls'] *= nc / 80 * 3 / nl  # scale cls loss gain to the dataset's class count and layers
    hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl  # scale obj loss gain to image size and layers
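    # Worked example (comments only; gains assume the hyp.scratch-low.yaml defaults box=0.05, cls=0.5, obj=1.0):
    #   nl=3, nc=80, imgsz=640  -> box=0.05*3/3=0.05, cls=0.5*80/80*3/3=0.5, obj=1.0*(640/640)**2*3/3=1.0
    #   nl=3, nc=2,  imgsz=1280 -> cls=0.5*2/80=0.0125, obj=1.0*(1280/640)**2=4.0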
    hyp['label_smoothing'] = opt.label_smoothing
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights computed from the training labels (inversely proportional to class frequency)
    model.names = names  # attach class names
    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 100)  # number of warmup iterations, max(3 epochs, 100 iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    last_opt_step = -1
    # initialize the per-class mAPs and the results tuple
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    # Set the epoch the scheduler has already reached, so that after an interrupted run
    # --resume continues the learning-rate decay exactly where the previous run left off
    scheduler.last_epoch = start_epoch - 1  # do not move
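    # Resume example (comments only): resuming at start_epoch=100 sets scheduler.last_epoch=99,
    # so the first scheduler.step() at the end of that epoch advances each group's lr to
    # initial_lr * lf(100) instead of restarting the schedule from lf(0).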
    scaler = amp.GradScaler(enabled=cuda)  # mixed-precision training: instantiate a GradScaler once before training begins
    stopper = EarlyStopping(patience=opt.patience)
    compute_loss = ComputeLoss(model)  # init loss class
    callbacks.run('on_train_start')
    # Log the train/val image sizes, the number of dataloader worker processes,
    # where results are saved, and how many epochs will be trained
    LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
                f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n'
                f"Logging results to {colorstr('bold', save_dir)}\n"
                f'Starting training for {epochs} epochs...')
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        callbacks.run('on_train_epoch_start')
        model.train()

        # Update image weights (optional, single-GPU only)
        if opt.image_weights:
            # Class-balancing strategy: combine the class weights attached to model.class_weights
            # with the per-class mAPs and each image's class content, then draw this epoch's image
            # indices with random.choices so images of poorly performing classes are sampled more often
            cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
            iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
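        # Sketch of the sampling math (comments only; labels_to_image_weights lives in utils.general):
        #   cw[c] grows when class c is rare (class_weights) or has a low mAP so far ((1 - maps)**2);
        #   iw[j] sums cw over the labels in image j, so images rich in weak classes weigh more;
        #   random.choices then draws dataset.n indices (with replacement) proportionally to iw.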
        # Update mosaic border (optional)
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(3, device=device)  # mean losses printed during training
        if RANK != -1:
            # In DDP mode the sampler reshuffles the data each epoch: it seeds its RNG with
            # epoch + seed, so the shuffle differs every epoch
            train_loader.sampler.set_epoch(epoch)
        pbar = enumerate(train_loader)
        LOGGER.info(('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
        if RANK in (-1, 0):
            pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')  # progress bar
        optimizer.zero_grad()  # zero the gradients
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            callbacks.run('on_train_batch_start')
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            # During the first nw iterations, accumulate and the learning rates are
            # interpolated as follows
            if ni <= nw:
                xi = [0, nw]  # x interp
                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())  # update gradients every `accumulate` batches
                for j, x in enumerate(optimizer.param_groups):
                    # the bias lr falls from warmup_bias_lr (0.1) to the scheduled lr0 * lf(epoch);
                    # all other group lrs rise from 0.0 to lr0 * lf(epoch),
                    # where lf is the decay function defined above
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:  # momentum also ramps from warmup_momentum to hyp['momentum']
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])
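                # Worked example of the interpolation (comments only; assumes nw=1000,
                # warmup_bias_lr=0.1 and lr0 * lf(epoch)=0.01 for the bias group):
                #   ni=0    -> lr = np.interp(0,    [0, 1000], [0.1, 0.01]) = 0.1
                #   ni=500  -> lr = 0.055   # halfway between the two endpoints
                #   ni=1000 -> lr = 0.01    # warmup hands over to the scheduler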
            # Multi-scale training
            if opt.multi_scale:
                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5 + gs)) // gs * gs  # random size in [imgsz*0.5, imgsz*1.5] snapped to a gs-multiple
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
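            # Worked example (comments only): imgsz=640, gs=32 -> sz is drawn from [320, 992] and
            # snapped down to a multiple of 32, e.g. 817 // 32 * 32 = 800; then sf = 800 / 640 = 1.25,
            # so a 640x640 batch is resized to 800x800 (math.ceil keeps each side a gs-multiple).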
            # Forward (mixed precision)
            with amp.autocast(enabled=cuda):  # enter the autocast context manager
                pred = model(imgs)  # forward pass
                # Compute the loss: classification, objectness and box-regression terms.
                # loss is the total loss; loss_items holds the individual box, objectness
                # and classification losses.
                loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                if RANK != -1:
                    # gradients are averaged across devices in DDP, so rescale the loss
                    loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()  # backward pass on the scaled loss (loss scaling for AMP)

            # Optimize
            if ni - last_opt_step >= accumulate:  # step only after `accumulate` backward passes have accumulated gradients
                # scaler.step() first unscales the gradients;
                # if none of them are inf or NaN it calls optimizer.step() to update the weights,
                # otherwise the step is skipped so the weights are never corrupted
                scaler.step(optimizer)  # optimizer.step
                scaler.update()  # decide whether to grow or shrink the scale factor
                optimizer.zero_grad()  # zero the gradients
                if ema:
                    ema.update(model)
                last_opt_step = ni

            # Log
            if RANK in (-1, 0):
                # show epoch, GPU memory, mean losses, number of targets and image size
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                pbar.set_description(('%10s' * 2 + '%10.4g' * 5) %
                                     (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
                callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots)
                if callbacks.stop_training:
                    return
            # end batch ------------------------------------------------------------------------------------------------
        # Scheduler (learning-rate decay)
        lr = [x['lr'] for x in optimizer.param_groups]  # for loggers
        scheduler.step()  # adjust the lr once per epoch

        if RANK in (-1, 0):
            # mAP
            callbacks.run('on_train_epoch_end', epoch=epoch)
            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])  # copy the listed attributes onto the EMA model
            final_epoch = (epoch + 1 == epochs) or stopper.possible_stop  # is this the last epoch?
            # Evaluate on the val set and compute mAP etc.; the EMA model is used for validation
            if not noval or final_epoch:  # Calculate mAP
                results, maps, _ = val.run(data_dict,
                                           batch_size=batch_size // WORLD_SIZE * 2,
                                           imgsz=imgsz,
                                           model=ema.ema,
                                           single_cls=single_cls,
                                           dataloader=val_loader,
                                           save_dir=save_dir,
                                           plots=False,
                                           callbacks=callbacks,
                                           compute_loss=compute_loss)
            # Update best mAP (best_fitness)
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
            if fi > best_fitness:
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr
            callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi)
            # Save model
            '''
            Save a checkpointed model for inference or for resuming training.
            Alongside the model itself, the checkpoint stores the epoch, results,
            optimizer state and more. The optimizer is stripped from the final
            checkpoints once training completes, and the saved model is the EMA model.
            '''
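            # How such a checkpoint might be reloaded for inference, as a hedged sketch
            # (commented out, not part of training; the path is illustrative and it prefers
            # the EMA weights when present):
            #   ckpt = torch.load('runs/train/exp/weights/best.pt', map_location='cpu')
            #   model = (ckpt.get('ema') or ckpt['model']).float().eval()
            #   print(f"epoch {ckpt['epoch']}, fitness {ckpt['best_fitness']}")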
            if (not nosave) or (final_epoch and not evolve):  # if save
                ckpt = {
                    'epoch': epoch,
                    'best_fitness': best_fitness,
                    'model': deepcopy(de_parallel(model)).half(),
                    'ema': deepcopy(ema.ema).half(),
                    'updates': ema.updates,
                    'optimizer': optimizer.state_dict(),
                    'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None,
                    'date': datetime.now().isoformat()}

                # Save last and best, then delete the ckpt dict
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0):
                    torch.save(ckpt, w / f'epoch{epoch}.pt')
                del ckpt
                callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)
            # Stop Single-GPU
            if RANK == -1 and stopper(epoch=epoch, fitness=fi):
                break

            # Stop DDP TODO: known issues https://github.com/ultralytics/yolov5/pull/4576
            # stop = stopper(epoch=epoch, fitness=fi)
            # if RANK == 0:
            #     dist.broadcast_object_list([stop], 0)  # broadcast 'stop' to all ranks

        # Stop DDP
        # with torch_distributed_zero_first(RANK):
        #     if stop:
        #         break  # must break all DDP ranks

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training -----------------------------------------------------------------------------------------------------
    if RANK in (-1, 0):
        LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
        for f in last, best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                if f is best:  # validate the best model
                    LOGGER.info(f'\nValidating {f}...')
                    results, _, _ = val.run(
                        data_dict,
                        batch_size=batch_size // WORLD_SIZE * 2,
                        imgsz=imgsz,
                        model=attempt_load(f, device).half(),
                        iou_thres=0.65 if is_coco else 0.60,  # best pycocotools results at 0.65
                        single_cls=single_cls,
                        dataloader=val_loader,
                        save_dir=save_dir,
                        save_json=is_coco,
                        verbose=True,
                        plots=plots,
                        callbacks=callbacks,
                        compute_loss=compute_loss)  # val best model with plots
                    if is_coco:
                        callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi)

        callbacks.run('on_train_end', last, best, plots, epoch, results)
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")

    torch.cuda.empty_cache()  # free GPU memory
    return results


def parse_opt(known=False):
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default=ROOT / 'yolov5s.pt', help='initial weights path')  # pretrained weights
    parser.add_argument('--cfg', type=str, default='', help='model.yaml path')  # model config / network structure
    parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='dataset.yaml path')  # dataset config: paths, class names, etc.
    parser.add_argument('--hyp', type=str, default=ROOT / 'data/hyps/hyp.scratch-low.yaml', help='hyperparameters path')  # hyperparameters file
    parser.add_argument('--epochs', type=int, default=300)  # total training epochs
    parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs, -1 for autobatch')
    parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='train, val image size (pixels)')
    parser.add_argument('--rect', action='store_true', help='rectangular training')  # off by default
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--noval', action='store_true', help='only validate final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable AutoAnchor')  # off by default
    parser.add_argument('--noplots', action='store_true', help='save no plot files')
    parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')  # hyperparameter evolution, off by default
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')  # Google Cloud Storage bucket
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')  # weighted image sampling, off by default
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')  # training device
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')  # multi-scale training, off by default
    parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')  # off by default
    parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'AdamW'], default='SGD', help='optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')  # cross-GPU synchronized BN
    parser.add_argument('--workers', type=int, default=8, help='max dataloader workers (per RANK in DDP mode)')
    parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')  # run project directory
    parser.add_argument('--name', default='exp', help='save to project/name')  # run name
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--quad', action='store_true', help='quad dataloader')
    parser.add_argument('--cos-lr', action='store_true', help='cosine LR scheduler')
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
    parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')  # stop once the model has stopped improving for this long
    parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2')  # freeze layers for transfer learning
    parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')  # GPU index per process
    # Weights & Biases arguments
    parser.add_argument('--entity', default=None, help='W&B: Entity')
    parser.add_argument('--upload_dataset', nargs='?', const=True, default=False, help='W&B: Upload data, "val" option')
    parser.add_argument('--bbox_interval', type=int, default=-1, help='W&B: Set bounding-box image logging interval')
    parser.add_argument('--artifact_alias', type=str, default='latest', help='W&B: Version of dataset artifact to use')

    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt


def main(opt, callbacks=Callbacks()):
    # Checks
    if RANK in (-1, 0):  # RANK is the process index used for inter-process communication; rank 0 is the master node
        print_args(vars(opt))  # print the argument dict
        check_git_status()  # check that the code is up to date
        check_requirements(exclude=['thop'])  # check python package requirements

    # Resume
    if opt.resume and not check_wandb_resume(opt) and not opt.evolve:  # resume an interrupted training run
        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # if resume is a str it is a checkpoint path; otherwise get_latest_run() finds the most recent last.pt under runs/
        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
        with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f:  # when resuming, replace the current opt with the saved one
            opt = argparse.Namespace(**yaml.safe_load(f))  # replace
        opt.cfg, opt.weights, opt.resume = '', ckpt, True  # reinstate
        LOGGER.info(f'Resuming training from {ckpt}')
    else:  # train from the given configs
        opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \
            check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project)  # check config files
        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
        if opt.evolve:  # evolution: redirect the project directory
            if opt.project == str(ROOT / 'runs/train'):  # if default project name, rename to runs/evolve
                opt.project = str(ROOT / 'runs/evolve')
            opt.exist_ok, opt.resume = opt.resume, False  # pass resume to exist_ok and disable resume
        if opt.name == 'cfg':  # run name
            opt.name = Path(opt.cfg).stem  # use model.yaml as name
        opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # incrementally named run dirs: exp, exp2, ...
    # DDP mode
    device = select_device(opt.device, batch_size=opt.batch_size)  # select device
    if LOCAL_RANK != -1:  # launched under DDP. LOCAL_RANK is the GPU index within a process, set implicitly: e.g. rank=3 with local_rank=0 is the first GPU of process 3
        msg = 'is not compatible with YOLOv5 Multi-GPU DDP training'
        assert not opt.image_weights, f'--image-weights {msg}'
        assert not opt.evolve, f'--evolve {msg}'
        assert opt.batch_size != -1, f'AutoBatch with --batch-size -1 {msg}, please pass a valid --batch-size'
        assert opt.batch_size % WORLD_SIZE == 0, f'--batch-size {opt.batch_size} must be multiple of WORLD_SIZE'
        assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)  # bind this process to its GPU
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
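    # Typical DDP launch for this script (comment only; the flags and dataset are illustrative);
    # the launcher is what sets LOCAL_RANK/RANK/WORLD_SIZE in the environment:
    #   $ python -m torch.distributed.run --nproc_per_node 2 train.py \
    #         --data coco128.yaml --weights yolov5s.pt --device 0,1 --batch-size 64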
    # Train: if not evolving hyperparameters, call train() directly
    if not opt.evolve:
        train(opt.hyp, opt, device, callbacks)
        if WORLD_SIZE > 1 and RANK == 0:
            LOGGER.info('Destroying process group... ')
            dist.destroy_process_group()
    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata: (mutation scale 0-1, lower_limit, upper_limit)
        meta = {
            'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
            'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
            'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
            'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
            'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
            'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
            'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
            'box': (1, 0.02, 0.2),  # box loss gain
            'cls': (1, 0.2, 4.0),  # cls loss gain
            'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
            'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
            'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
            'iou_t': (0, 0.1, 0.7),  # IoU training threshold
            'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
            'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
            'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
            'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
            'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
            'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
            'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
            'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
            'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
            'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
            'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
            'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
            'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
            'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
            'mixup': (1, 0.0, 1.0),  # image mixup (probability)
            'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)
        with open(opt.hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
            if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                hyp['anchors'] = 3
        opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir)  # only val/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv'
        if opt.bucket:
            os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {evolve_csv}')  # download evolve.csv if exists
        for _ in range(opt.evolve):  # generations to evolve
            if evolve_csv.exists():  # if evolve.csv exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1)  # load previous results from evolve.csv
                n = min(5, len(x))  # number of previous results to consider (at most the top 5)
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min() + 1E-6  # weights (sum > 0) derived from the results
                # derive the base hyps according to the selection method
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination
                # Mutate
                mp, s = 0.8, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([meta[k][0] for k in hyp.keys()])  # per-hyp mutation gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
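                # Reading that mutation line (comments only): each hyp mutates with probability mp;
                # a selected hyp gets a multiplicative factor 1 + gain * N(0,1) * U(0,1) * s, and the
                # clip means every hyp is scaled by a factor between 0.3x and 3.0x per generation.
                # A gain of 0 in meta (e.g. iou_t, fl_gamma, fliplr) pins that hyp and exempts it.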
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate (the first 7 csv columns are result metrics; hyps start at column 7)

            # Constrain hyps to their limits
            for k, v in meta.items():
                hyp[k] = max(hyp[k], v[1])  # lower limit
                hyp[k] = min(hyp[k], v[2])  # upper limit
                hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation
            results = train(hyp.copy(), opt, device, callbacks)
            callbacks = Callbacks()

            # Write the mutation results (results and the corresponding hyps) to evolve.csv
            print_mutation(results, hyp.copy(), save_dir, opt.bucket)

        # Plot results
        plot_evolve(evolve_csv)
        LOGGER.info(f'Hyperparameter evolution finished {opt.evolve} generations\n'
                    f"Results saved to {colorstr('bold', save_dir)}\n"
                    f'Usage example: $ python train.py --hyp {evolve_yaml}')


def run(**kwargs):
    # Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
    opt = parse_opt(True)
    for k, v in kwargs.items():
        setattr(opt, k, v)
    main(opt)
    return opt


if __name__ == "__main__":
    opt = parse_opt()
    main(opt)

# Example launch used for this project:
# CUDA_VISIBLE_DEVICES="1,2" python train.py --data ../../data/helmet_fall_phone_delete_work/helmet_fall_phone.yaml --weights weights/yolov5l6.pt --img 1280 --hyp data/hyps/hyp.scratch-high.yaml --multi-scale --epochs 50 --name helmet_fall_phone_delete_work_2 --batch-size 8