# CV / ROBOT_IMAGE_PROCESS_DETECTION_MODEL_TRAIN_v2.0.1
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Loss functions
"""

import torch
import torch.nn as nn

from utils.metrics import bbox_iou
from utils.torch_utils import de_parallel

# Label smoothing
def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
    """Return the label-smoothed BCE targets as a ``(positive, negative)`` pair.

    :param eps: smoothing amount; ``eps=0`` yields the hard targets ``(1.0, 0.0)``.
    :return: the tuple ``(1.0 - 0.5 * eps, 0.5 * eps)``.
    """
    negative = 0.5 * eps
    positive = 1.0 - negative
    return positive, negative


class BCEBlurWithLogitsLoss(nn.Module):
    """BCE-with-logits loss with reduced missing-label effects.

    Each element of the plain BCE loss is multiplied by
    ``1 - exp((sigmoid(pred) - true - 1) / (alpha + 1e-4))``. The factor goes to
    zero as ``sigmoid(pred) - true`` approaches 1 (a confident positive
    prediction on a label of 0), so such elements contribute little to the mean.
    """

    def __init__(self, alpha=0.05):
        """
        :param alpha: scale of the exponential in the blur factor; smaller values
            make the factor fall off more sharply.
        """
        super().__init__()
        self.alpha = alpha
        # Element-wise losses are required so every element can be re-weighted.
        self.loss_fcn = nn.BCEWithLogitsLoss(reduction='none')  # must be nn.BCEWithLogitsLoss()

    def forward(self, pred, true):
        """Return the mean blurred BCE loss of logits ``pred`` against targets ``true``."""
        elementwise = self.loss_fcn(pred, true)
        # Signed gap between predicted probability and target; only the positive
        # side (prediction above the label) is damped.
        gap = torch.sigmoid(pred) - true
        blur = 1 - torch.exp((gap - 1) / (self.alpha + 1e-4))
        return (elementwise * blur).mean()


class FocalLoss(nn.Module):
    """Wrap focal loss around an existing criterion.

    Usage: ``criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)``.

    The wrapped criterion's ``reduction`` is remembered and then overwritten with
    ``'none'`` (on the object that was passed in) so the focal weighting can be
    applied per element; the remembered reduction is applied afterwards.
    """

    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
        """
        :param loss_fcn: criterion to wrap; must be ``nn.BCEWithLogitsLoss()``.
        :param gamma: exponent of the modulating factor ``(1 - p_t) ** gamma``.
        :param alpha: weight for positive targets; negatives get ``1 - alpha``.
        """
        super().__init__()
        self.gamma, self.alpha = gamma, alpha
        self.reduction = loss_fcn.reduction  # keep the caller's reduction before overriding it
        loss_fcn.reduction = 'none'  # required to apply FL to each element
        self.loss_fcn = loss_fcn

    def forward(self, pred, true):
        """Return the focal-weighted loss of logits ``pred`` against targets ``true``."""
        bce = self.loss_fcn(pred, true)

        # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
        prob = torch.sigmoid(pred)  # prob from logits
        negative = 1 - true
        p_t = true * prob + negative * (1 - prob)  # probability assigned to the target value
        alpha_weight = true * self.alpha + negative * (1 - self.alpha)
        focusing = (1.0 - p_t) ** self.gamma
        weighted = bce * (alpha_weight * focusing)

        if self.reduction == 'mean':
            return weighted.mean()
        if self.reduction == 'sum':
            return weighted.sum()
        return weighted  # 'none'


class QFocalLoss(nn.Module):
    """Wrap quality focal loss around an existing criterion.

    Usage: ``criteria = QFocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)``.

    Differs from ``FocalLoss`` only in the modulating factor, which here is
    ``|true - sigmoid(pred)| ** gamma``. The wrapped criterion's ``reduction`` is
    remembered, overwritten with ``'none'`` on the passed-in object, and applied
    after the per-element weighting.
    """

    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
        """
        :param loss_fcn: criterion to wrap; must be ``nn.BCEWithLogitsLoss()``.
        :param gamma: exponent of the modulating factor.
        :param alpha: weight for positive targets; negatives get ``1 - alpha``.
        """
        super().__init__()
        self.gamma, self.alpha = gamma, alpha
        self.reduction = loss_fcn.reduction  # keep the caller's reduction before overriding it
        loss_fcn.reduction = 'none'  # required to apply FL to each element
        self.loss_fcn = loss_fcn

    def forward(self, pred, true):
        """Return the quality-focal-weighted loss of logits ``pred`` against targets ``true``."""
        bce = self.loss_fcn(pred, true)

        prob = torch.sigmoid(pred)  # prob from logits
        alpha_weight = true * self.alpha + (1 - true) * (1 - self.alpha)
        focusing = torch.abs(true - prob) ** self.gamma  # distance between target and prediction
        weighted = bce * (alpha_weight * focusing)

        if self.reduction == 'mean':
            return weighted.mean()
        if self.reduction == 'sum':
            return weighted.sum()
        return weighted  # 'none'

# Compute the total loss (classification loss + objectness loss + box regression loss)
class ComputeLoss:
    """Compute the detection training loss: box regression + objectness + classification.

    An instance is built once from the model (to read its hyperparameters and
    Detect() layer geometry) and then called with the raw layer predictions and
    the ground-truth targets of each batch.
    """

    # When True, matched targets are reordered by ascending IoU before their IoU
    # is written into the objectness target tensor.
    sort_obj_iou = False

    def __init__(self, model, autobalance=False):
        """
        :param model: detection model; must expose ``hyp`` (hyperparameter dict with
            ``cls_pw``, ``obj_pw``, ``fl_gamma`` and optionally ``label_smoothing``)
            and have a Detect() module as the last entry of ``model.model``.
        :param autobalance: when True, the per-layer objectness weights in
            ``self.balance`` are updated on every call and normalised by the
            stride-16 layer's weight.
        """
        device = next(model.parameters()).device  # get model device
        h = model.hyp  # hyperparameters

        # Define criteria
        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device))
        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device))

        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
        self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0))  # positive, negative BCE targets

        # Focal loss: the BCE criteria are wrapped only when fl_gamma > 0
        g = h['fl_gamma']  # focal loss gamma
        if g > 0:
            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)

        m = de_parallel(model).model[-1]  # Detect() module
        # Per-layer objectness loss weights: a 3-entry list for 3 detection layers,
        # otherwise the 5-entry P3-P7 list.
        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
        self.ssi = list(m.stride).index(16) if autobalance else 0  # stride 16 index
        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance
        self.na = m.na  # number of anchors
        self.nc = m.nc  # number of classes
        self.nl = m.nl  # number of layers
        self.anchors = m.anchors
        self.device = device

    def __call__(self, p, targets):  # predictions, targets
        """Compute the loss for one batch.

        :param p: list of per-layer network outputs; ``p[i]`` is indexed as
            ``[image, anchor, grid_y, grid_x, channel]`` with ``nc + 5`` channels
            (xy, wh, objectness, class logits).
        :param targets: tensor of shape ``(nt, 6)`` with columns
            ``(image, class, x, y, w, h)``; xywh are scaled by the grid size in
            ``build_targets``, so they are presumably normalised to 0-1.
        :return: tuple ``(loss, loss_items)`` where ``loss`` is
            ``(lbox + lobj + lcls) * batch_size`` and ``loss_items`` is the detached
            tensor ``[lbox, lobj, lcls]`` (each already scaled by its hyperparameter).
        """
        # Initialise the three loss components
        lcls = torch.zeros(1, device=self.device)  # class loss
        lbox = torch.zeros(1, device=self.device)  # box loss
        lobj = torch.zeros(1, device=self.device)  # object loss
        tcls, tbox, indices, anchors = self.build_targets(p, targets)  # target classes, boxes, indices, anchors

        # Losses, one detection layer at a time
        for i, pi in enumerate(p):  # layer index, layer predictions
            # b: image index within the batch, a: matched anchor index on this layer,
            # gj/gi: grid row/column of the cell responsible for the target
            b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
            tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device)  # target obj

            n = b.shape[0]  # number of targets
            if n:
                # Predictions at the matched (image, anchor, cell) positions
                pxy, pwh, _, pcls = pi[b, a, gj, gi].split((2, 2, 1, self.nc), 1)  # target-subset of predictions

                # Regression
                pxy = pxy.sigmoid() * 2 - 0.5  # centre offset in (-0.5, 1.5) relative to the cell
                pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i]  # size in (0, 4) x anchor size
                pbox = torch.cat((pxy, pwh), 1)  # predicted box
                iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze()  # iou(prediction, target), CIoU variant
                lbox += (1.0 - iou).mean()  # iou loss

                # Objectness
                iou = iou.detach().clamp(0).type(tobj.dtype)
                if self.sort_obj_iou:
                    j = iou.argsort()
                    b, a, gj, gi, iou = b[j], a[j], gj[j], gi[j], iou[j]
                if self.gr < 1:
                    iou = (1.0 - self.gr) + self.gr * iou
                # Positive cells get their (clamped) IoU as objectness target; the rest stay 0
                tobj[b, a, gj, gi] = iou  # iou ratio

                # Classification
                if self.nc > 1:  # cls loss (only if multiple classes)
                    t = torch.full_like(pcls, self.cn, device=self.device)  # targets
                    t[range(n), tcls[i]] = self.cp
                    lcls += self.BCEcls(pcls, t)  # BCE

            obji = self.BCEobj(pi[..., 4], tobj)
            lobj += obji * self.balance[i]  # obj loss
            if self.autobalance:
                self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item()

        if self.autobalance:
            self.balance = [x / self.balance[self.ssi] for x in self.balance]
        # Scale each component by its hyperparameter gain
        lbox *= self.hyp['box']
        lobj *= self.hyp['obj']
        lcls *= self.hyp['cls']
        bs = tobj.shape[0]  # batch size

        return (lbox + lobj + lcls) * bs, torch.cat((lbox, lobj, lcls)).detach()

    def build_targets(self, p, targets):
        """Build the positive-sample targets used by ``__call__``.

        For every detection layer:

        1. Targets are scaled to the layer's grid and compared with each of the
           layer's anchors by width/height ratio (not IoU). An (anchor, target)
           pair is kept only if ``max(r, 1 / r) < hyp['anchor_t']`` for both
           width and height.
        2. Each kept pair is assigned to the grid cell containing the box centre
           and, depending on which half of the cell the centre lies in, to
           neighbouring cells as well. The returned positives can therefore
           outnumber the input targets.

        :param p: list of per-layer predictions; only ``p[i].shape`` is read
            (index 2 is the grid height, index 3 the grid width).
        :param targets: tensor ``(nt, 6)`` with columns ``(image, class, x, y, w, h)``.
        :return: ``(tcls, tbox, indices, anch)``, four lists with one entry per layer:
            ``tcls[i]`` class indices, ``tbox[i]`` boxes as ``(dx, dy, w, h)`` in grid
            units with the centre relative to the assigned cell, ``indices[i]`` the
            tuple ``(image, anchor, grid_y, grid_x)`` and ``anch[i]`` the matched
            anchor sizes.
        """
        # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
        na, nt = self.na, targets.shape[0]  # number of anchors, targets
        tcls, tbox, indices, anch = [], [], [], []
        gain = torch.ones(7, device=self.device)  # normalized to gridspace gain
        # ai has shape (na, nt): row k holds anchor index k for every target
        ai = torch.arange(na, device=self.device).float().view(na, 1).repeat(1, nt)  # same as .repeat_interleave(nt)
        # Repeat the targets once per anchor and append the anchor index as a 7th column
        targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None]), 2)  # append anchor indices

        g = 0.5  # bias
        # Offsets are SUBTRACTED from the box centre before truncation, so e.g.
        # [1, 0] * g selects the cell to the left and [-1, 0] * g the cell to the right.
        off = torch.tensor(
            [
                [0, 0],  # cell containing the centre
                [1, 0],  # j: neighbour at x - 1
                [0, 1],  # k: neighbour at y - 1
                [-1, 0],  # l: neighbour at x + 1
                [0, -1],  # m: neighbour at y + 1
            ],
            device=self.device).float() * g  # offsets

        for i in range(self.nl):  # one pass per detection layer
            anchors, shape = self.anchors[i], p[i].shape
            gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain (grid w, h, w, h)

            # Match targets to anchors
            t = targets * gain  # shape(na,nt,7), xywh now in grid units
            if nt:
                # Matches: keep pairs whose w and h are both within a factor of
                # hyp['anchor_t'] of the anchor's w and h
                r = t[..., 4:6] / anchors[:, None]  # wh ratio
                j = torch.max(r, 1 / r).max(2)[0] < self.hyp['anchor_t']  # compare
                t = t[j]  # filter, shape (M, 7)

                # Offsets
                gxy = t[:, 2:4]  # grid xy
                gxi = gain[[2, 3]] - gxy  # inverse: distance from the far (right/bottom) grid edge
                # j/k: centre in the lower-coordinate half of its cell (and more than
                # one cell from the left/top edge) -> also use the x - 1 / y - 1 cell.
                # l/m: the same test measured from the far edge -> x + 1 / y + 1 cell.
                j, k = ((gxy % 1 < g) & (gxy > 1)).T
                l, m = ((gxi % 1 < g) & (gxi > 1)).T
                j = torch.stack((torch.ones_like(j), j, k, l, m))
                t = t.repeat((5, 1, 1))[j]  # one copy per selected cell
                offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
            else:
                t = targets[0]
                offsets = 0

            # Define
            bc, gxy, gwh, a = t.chunk(4, 1)  # (image, class), grid xy, grid wh, anchors
            a, (b, c) = a.long().view(-1), bc.long().T  # anchors, image, class
            gij = (gxy - offsets).long()  # index of the assigned cell
            gi, gj = gij.T  # grid indices (views into gij)

            # Append
            # Clamp with integer bounds taken from the layer shape: clamping the
            # int64 indices in place against a float tensor (gain[...] - 1) fails on
            # PyTorch versions that type-promote clamp bounds. The clamp is in place
            # and gi/gj are views, so gij used for tbox below is clamped as well.
            indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, anchor, grid
            tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
            anch.append(anchors[a])  # anchors
            tcls.append(c)  # class

        return tcls, tbox, indices, anch