#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
"""
Data augmentation functionality. Passed as callable transformations to
Dataset classes.

The data augmentation procedures were interpreted from @weiliu89's SSD paper
http://arxiv.org/abs/1512.02325
"""

import cv2
import numpy as np

import torch
from torchvision import transforms
from yolox.utils import xyxy2cxcywh

import time
import math
import random


def augment_hsv(img, hgain=0.015, sgain=0.7, vgain=0.4):
    """Randomly jitter hue, saturation and value of a BGR image in place.

    Args:
        img: BGR image; expected to be uint8, since the lookup tables built
            here have 256 entries. The result is written back into ``img``.
        hgain: Maximum relative hue gain; the factor is drawn from
            ``[1 - hgain, 1 + hgain]``.
        sgain: Maximum relative saturation gain (same sampling scheme).
        vgain: Maximum relative value gain (same sampling scheme).

    Returns:
        None. ``img`` is modified in place.
    """
    r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1  # random gains
    hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
    dtype = img.dtype  # uint8

    # One lookup table per channel; hue wraps at 180, the others saturate.
    x = np.arange(0, 256, dtype=np.int16)
    lut_hue = ((x * r[0]) % 180).astype(dtype)
    lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
    lut_val = np.clip(x * r[2], 0, 255).astype(dtype)

    img_hsv = cv2.merge(
        (cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))
    ).astype(dtype)
    cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)  # no return needed


def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.2):
    """Select the boxes that are still usable after an augmentation.

    Args:
        box1: Boxes before augmentation, shape (4, n), rows x1, y1, x2, y2.
        box2: Boxes after augmentation, shape (4, n), same row layout.
        wh_thr: Minimum width and height (pixels) of the augmented box.
        ar_thr: Maximum aspect ratio of the augmented box.
        area_thr: Minimum ratio of augmented area to original area.

    Returns:
        Boolean array of length n; True where the augmented box passes all
        three checks.
    """
    w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
    w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
    # The epsilon terms guard against division by zero for degenerate boxes.
    ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16))  # aspect ratio
    return (
        (w2 > wh_thr)
        & (h2 > wh_thr)
        & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr)
        & (ar < ar_thr)
    )  # candidates


def random_perspective(
    img,
    targets=(),
    degrees=10,
    translate=0.1,
    scale=0.1,
    shear=10,
    perspective=0.0,
    border=(0, 0),
):
    """Apply a random rotation/scale/shear/translation to an image and boxes.

    Args:
        img: Input image, shape (h, w, c).
        targets: Array whose first four columns are x1, y1, x2, y2 in pixels;
            any further columns are carried through unchanged. An empty
            sequence means "no boxes".
        degrees: Rotation is drawn from ``[-degrees, degrees]``.
        translate: Translation is drawn from
            ``[0.5 - translate, 0.5 + translate]`` times the output size.
        scale: Either a ``(low, high)`` pair giving the range of the scale
            factor, or a scalar ``d`` meaning the range ``[1 - d, 1 + d]``.
        shear: Shear angle (degrees) is drawn from ``[-shear, shear]`` for
            each axis.
        perspective: If truthy, warp with ``cv2.warpPerspective`` instead of
            ``cv2.warpAffine``.
        border: (y, x) border added on each side of the output canvas;
            negative values crop.

    Returns:
        Tuple ``(img, targets)`` with the warped image and the transformed,
        filtered targets.
    """
    height = img.shape[0] + border[0] * 2  # shape(h,w,c)
    width = img.shape[1] + border[1] * 2

    # Center: move the image centre to the origin
    C = np.eye(3)
    C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
    C[1, 2] = -img.shape[0] / 2  # y translation (pixels)

    # Rotation and Scale
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    # The scalar default used to crash on ``scale[0]``; accept both forms.
    if np.ndim(scale) == 0:
        scale_lo, scale_hi = 1 - scale, 1 + scale
    else:
        scale_lo, scale_hi = scale[0], scale[1]
    s = random.uniform(scale_lo, scale_hi)
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)

    # Shear
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)

    # Translation
    T = np.eye(3)
    T[0, 2] = (
        random.uniform(0.5 - translate, 0.5 + translate) * width
    )  # x translation (pixels)
    T[1, 2] = (
        random.uniform(0.5 - translate, 0.5 + translate) * height
    )  # y translation (pixels)

    # Combined rotation matrix
    M = T @ S @ R @ C  # order of operations (right to left) is IMPORTANT

    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():
        # image changed
        if perspective:
            img = cv2.warpPerspective(
                img, M, dsize=(width, height), borderValue=(114, 114, 114)
            )
        else:  # affine
            img = cv2.warpAffine(
                img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)
            )

    # Transform label coordinates
    n = len(targets)
    if n:
        # warp the four corners of every box
        xy = np.ones((n * 4, 3))
        xy[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
            n * 4, 2
        )  # x1y1, x2y2, x1y2, x2y1
        xy = xy @ M.T  # transform
        if perspective:
            xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)  # rescale
        else:  # affine
            xy = xy[:, :2].reshape(n, 8)

        # create new axis-aligned boxes around the warped corners
        x = xy[:, [0, 2, 4, 6]]
        y = xy[:, [1, 3, 5, 7]]
        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T

        # Boxes are not clipped to the canvas here; boxes lying entirely
        # outside it are dropped by the mask further down.

        # filter candidates
        i = box_candidates(box1=targets[:, :4].T * s, box2=xy.T)
        targets = targets[i]  # boolean indexing copies, so the input is untouched
        targets[:, :4] = xy[i]

        # keep only boxes that still overlap the output canvas
        inside = (
            (targets[:, 0] < width)
            & (targets[:, 2] > 0)
            & (targets[:, 1] < height)
            & (targets[:, 3] > 0)
        )
        targets = targets[inside]

    return img, targets


def _distort(image):
    """Return a photometrically distorted copy of a BGR image.

    Each of brightness shift, contrast gain, hue shift and saturation gain is
    applied independently with probability 1/2.
    """

    def _convert(image, alpha=1, beta=0):
        # In-place ``image = clip(image * alpha + beta, 0, 255)``.
        tmp = image.astype(float) * alpha + beta
        tmp[tmp < 0] = 0
        tmp[tmp > 255] = 255
        image[:] = tmp

    image = image.copy()

    if random.randrange(2):
        _convert(image, beta=random.uniform(-32, 32))

    if random.randrange(2):
        _convert(image, alpha=random.uniform(0.5, 1.5))

    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    if random.randrange(2):
        tmp = image[:, :, 0].astype(int) + random.randint(-18, 18)
        tmp %= 180  # hue wraps around at 180
        image[:, :, 0] = tmp

    if random.randrange(2):
        _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5))

    image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)

    return image


def _mirror(image, boxes):
    """Flip image and boxes horizontally with probability 1/2.

    Args:
        image: Image of shape (h, w, c).
        boxes: Array of shape (n, 4) with columns x1, y1, x2, y2.

    Returns:
        Tuple ``(image, boxes)``. When flipped, ``image`` is a reversed view
        of the input and ``boxes`` is a new array; otherwise both inputs are
        returned as they were passed in.
    """
    _, width, _ = image.shape
    if random.randrange(2):
        image = image[:, ::-1]
        boxes = boxes.copy()
        # new x1 = width - old x2, new x2 = width - old x1
        boxes[:, 0::2] = width - boxes[:, 2::-2]
    return image, boxes


def preproc(img, input_size, mean, std, swap=(2, 0, 1)):
    """Letterbox-resize an image and normalise it for the network.

    The image is resized to fit ``input_size`` with its aspect ratio kept,
    placed in the top-left corner of a canvas filled with 114, scaled to
    [0, 1] and optionally normalised with ``mean`` and ``std``.

    Args:
        img: Image of shape (h, w, 3), or a 2-D (h, w) image.
        input_size: Target (height, width).
        mean: Per-channel mean subtracted after scaling to [0, 1], or None to
            skip the subtraction.
        std: Per-channel divisor applied after the mean subtraction, or None
            to skip the division.
        swap: Axis order passed to ``transpose`` for 3-D input.

    Returns:
        Tuple ``(padded_img, r)``: the contiguous float32 image and the
        resize ratio that was applied.
    """
    is_color = len(img.shape) == 3
    if is_color:
        padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0
    else:
        padded_img = np.ones(input_size) * 114.0

    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    new_h, new_w = int(img.shape[0] * r), int(img.shape[1] * r)
    resized_img = cv2.resize(
        img,
        (new_w, new_h),
        interpolation=cv2.INTER_LINEAR,
    ).astype(np.float32)
    padded_img[:new_h, :new_w] = resized_img

    if is_color:
        # Reverse the channel order (BGR <-> RGB); 2-D input has no channel
        # axis, so this step is skipped for it.
        padded_img = padded_img[:, :, ::-1]
    padded_img /= 255.0
    # None is the default of both transform classes in this file, so it has
    # to mean "skip this step" rather than raise.
    if mean is not None:
        padded_img = padded_img - mean
    if std is not None:
        padded_img = padded_img / std
    if is_color:
        padded_img = padded_img.transpose(swap)
    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
    return padded_img, r


class TrainTransform:
    """Training-time transform: photometric distortion, mirror, letterbox.

    Arguments:
        p (float): stored on the instance; not read by this class
        rgb_means: per-channel mean handed to ``preproc`` (None to skip)
        std: per-channel std handed to ``preproc`` (None to skip)
        max_labels (int): number of rows in the returned, zero-padded labels
    """

    def __init__(self, p=0.5, rgb_means=None, std=None, max_labels=100):
        self.means = rgb_means
        self.std = std
        self.p = p
        self.max_labels = max_labels

    def __call__(self, image, targets, input_dim):
        """Augment one sample.

        Args:
            image: BGR image of shape (h, w, c).
            targets: Array with columns x1, y1, x2, y2, label, id.
            input_dim: Network input (height, width).

        Returns:
            Tuple ``(image, labels)``: the preprocessed float32 image and a
            float32 array of shape (max_labels, 6) whose rows are
            label, cx, cy, w, h, id, padded with zeros.
        """
        boxes = targets[:, :4].copy()
        labels = targets[:, 4].copy()
        ids = targets[:, 5].copy()
        if len(boxes) == 0:
            # No annotations: only preprocess the image.
            targets = np.zeros((self.max_labels, 6), dtype=np.float32)
            image, _ = preproc(image, input_dim, self.means, self.std)
            image = np.ascontiguousarray(image, dtype=np.float32)
            return image, targets

        # Untouched copies, used as a fallback if augmentation removes
        # every box.
        image_o = image.copy()
        targets_o = targets.copy()
        boxes_o = targets_o[:, :4]
        labels_o = targets_o[:, 4]
        ids_o = targets_o[:, 5]
        # bbox_o: [xyxy] to [c_x,c_y,w,h]
        boxes_o = xyxy2cxcywh(boxes_o)

        image_t = _distort(image)
        image_t, boxes = _mirror(image_t, boxes)
        image_t, r_ = preproc(image_t, input_dim, self.means, self.std)
        # boxes [xyxy] 2 [cx,cy,w,h]
        boxes = xyxy2cxcywh(boxes)
        boxes *= r_

        # Drop boxes whose shorter side is at most one pixel after resizing.
        mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 1
        boxes_t = boxes[mask_b]
        labels_t = labels[mask_b]
        ids_t = ids[mask_b]

        if len(boxes_t) == 0:
            # Everything was filtered out: fall back to the original sample.
            image_t, r_o = preproc(image_o, input_dim, self.means, self.std)
            boxes_o *= r_o
            boxes_t = boxes_o
            labels_t = labels_o
            ids_t = ids_o

        labels_t = np.expand_dims(labels_t, 1)
        ids_t = np.expand_dims(ids_t, 1)

        targets_t = np.hstack((labels_t, boxes_t, ids_t))
        padded_labels = np.zeros((self.max_labels, 6))
        # Keep at most max_labels rows; the remaining rows stay zero.
        count = min(len(targets_t), self.max_labels)
        padded_labels[:count] = targets_t[:count]
        padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32)
        image_t = np.ascontiguousarray(image_t, dtype=np.float32)
        return image_t, padded_labels


class ValTransform:
    """
    Defines the transformations that should be applied to a test image
    for input into the network

    letterbox resize -> normalize -> channel swap

    Arguments:
        rgb_means ((float,float,float)): per-channel mean handed to
            ``preproc`` (None to skip)
        std ((float,float,float)): per-channel std handed to ``preproc``
            (None to skip)
        swap ((int,int,int)): final order of channels

    Returns:
        transform (transform) : callable transform to be applied to
            test/val data
    """

    def __init__(self, rgb_means=None, std=None, swap=(2, 0, 1)):
        self.means = rgb_means
        self.swap = swap
        self.std = std

    # assume input is cv2 img for now
    def __call__(self, img, res, input_size):
        """Preprocess ``img``; ``res`` is ignored and a (1, 5) zero array is
        returned in place of targets."""
        img, _ = preproc(img, input_size, self.means, self.std, self.swap)
        return img, np.zeros((1, 5))