operators.py 127 KB


  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # function:
  15. # operators to process sample,
  16. # eg: decode/resize/crop image
  17. from __future__ import absolute_import
  18. from __future__ import print_function
  19. from __future__ import division
  20. try:
  21. from collections.abc import Sequence
  22. except Exception:
  23. from collections import Sequence
  24. from numbers import Number, Integral
  25. import uuid
  26. import random
  27. import math
  28. import numpy as np
  29. import os
  30. import copy
  31. import logging
  32. import cv2
  33. from PIL import Image, ImageDraw
  34. import pickle
  35. import threading
  36. MUTEX = threading.Lock()
  37. from ppdet.core.workspace import serializable
  38. from ppdet.modeling import bbox_utils
  39. from ..reader import Compose
  40. from .op_helper import (satisfy_sample_constraint, filter_and_process,
  41. generate_sample_bbox, clip_bbox, data_anchor_sampling,
  42. satisfy_sample_constraint_coverage, crop_image_sampling,
  43. generate_sample_bbox_square, bbox_area_sampling,
  44. is_poly, get_border)
  45. from ppdet.utils.logger import setup_logger
  46. from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform
# Module-level logger used by the operators below to emit warnings.
logger = setup_logger(__name__)
# Class names of every operator registered through ``register_op``.
registered_ops = []
  49. def register_op(cls):
  50. registered_ops.append(cls.__name__)
  51. if not hasattr(BaseOperator, cls.__name__):
  52. setattr(BaseOperator, cls.__name__, cls)
  53. else:
  54. raise KeyError("The {} class has been registered.".format(cls.__name__))
  55. return serializable(cls)
  56. class BboxError(ValueError):
  57. pass
  58. class ImageError(ValueError):
  59. pass
  60. class BaseOperator(object):
  61. def __init__(self, name=None):
  62. if name is None:
  63. name = self.__class__.__name__
  64. self._id = name + '_' + str(uuid.uuid4())[-6:]
  65. def apply(self, sample, context=None):
  66. """ Process a sample.
  67. Args:
  68. sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
  69. context (dict): info about this sample processing
  70. Returns:
  71. result (dict): a processed sample
  72. """
  73. return sample
  74. def __call__(self, sample, context=None):
  75. """ Process a sample.
  76. Args:
  77. sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
  78. context (dict): info about this sample processing
  79. Returns:
  80. result (dict): a processed sample
  81. """
  82. if isinstance(sample, Sequence):
  83. for i in range(len(sample)):
  84. sample[i] = self.apply(sample[i], context)
  85. else:
  86. sample = self.apply(sample, context)
  87. return sample
  88. def __str__(self):
  89. return str(self._id)
  90. @register_op
  91. class Decode(BaseOperator):
  92. def __init__(self):
  93. """ Transform the image data to numpy format following the rgb format
  94. """
  95. super(Decode, self).__init__()
  96. def apply(self, sample, context=None):
  97. """ load image if 'im_file' field is not empty but 'image' is"""
  98. if 'image' not in sample:
  99. with open(sample['im_file'], 'rb') as f:
  100. sample['image'] = f.read()
  101. sample.pop('im_file')
  102. im = sample['image']
  103. data = np.frombuffer(im, dtype='uint8')
  104. im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
  105. if 'keep_ori_im' in sample and sample['keep_ori_im']:
  106. sample['ori_image'] = im
  107. im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
  108. sample['image'] = im
  109. if 'h' not in sample:
  110. sample['h'] = im.shape[0]
  111. elif sample['h'] != im.shape[0]:
  112. logger.warning(
  113. "The actual image height: {} is not equal to the "
  114. "height: {} in annotation, and update sample['h'] by actual "
  115. "image height.".format(im.shape[0], sample['h']))
  116. sample['h'] = im.shape[0]
  117. if 'w' not in sample:
  118. sample['w'] = im.shape[1]
  119. elif sample['w'] != im.shape[1]:
  120. logger.warning(
  121. "The actual image width: {} is not equal to the "
  122. "width: {} in annotation, and update sample['w'] by actual "
  123. "image width.".format(im.shape[1], sample['w']))
  124. sample['w'] = im.shape[1]
  125. sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
  126. sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
  127. return sample
  128. def _make_dirs(dirname):
  129. try:
  130. from pathlib import Path
  131. except ImportError:
  132. from pathlib2 import Path
  133. Path(dirname).mkdir(exist_ok=True)
  134. @register_op
  135. class DecodeCache(BaseOperator):
  136. def __init__(self, cache_root=None):
  137. '''decode image and caching
  138. '''
  139. super(DecodeCache, self).__init__()
  140. self.use_cache = False if cache_root is None else True
  141. self.cache_root = cache_root
  142. if cache_root is not None:
  143. _make_dirs(cache_root)
  144. def apply(self, sample, context=None):
  145. if self.use_cache and os.path.exists(
  146. self.cache_path(self.cache_root, sample['im_file'])):
  147. path = self.cache_path(self.cache_root, sample['im_file'])
  148. im = self.load(path)
  149. else:
  150. if 'image' not in sample:
  151. with open(sample['im_file'], 'rb') as f:
  152. sample['image'] = f.read()
  153. im = sample['image']
  154. data = np.frombuffer(im, dtype='uint8')
  155. im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
  156. if 'keep_ori_im' in sample and sample['keep_ori_im']:
  157. sample['ori_image'] = im
  158. im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
  159. if self.use_cache and not os.path.exists(
  160. self.cache_path(self.cache_root, sample['im_file'])):
  161. path = self.cache_path(self.cache_root, sample['im_file'])
  162. self.dump(im, path)
  163. sample['image'] = im
  164. sample['h'] = im.shape[0]
  165. sample['w'] = im.shape[1]
  166. sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
  167. sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
  168. sample.pop('im_file')
  169. return sample
  170. @staticmethod
  171. def cache_path(dir_oot, im_file):
  172. return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl')
  173. @staticmethod
  174. def load(path):
  175. with open(path, 'rb') as f:
  176. im = pickle.load(f)
  177. return im
  178. @staticmethod
  179. def dump(obj, path):
  180. MUTEX.acquire()
  181. try:
  182. with open(path, 'wb') as f:
  183. pickle.dump(obj, f)
  184. except Exception as e:
  185. logger.warning('dump {} occurs exception {}'.format(path, str(e)))
  186. finally:
  187. MUTEX.release()
  188. @register_op
  189. class SniperDecodeCrop(BaseOperator):
  190. def __init__(self):
  191. super(SniperDecodeCrop, self).__init__()
  192. def __call__(self, sample, context=None):
  193. if 'image' not in sample:
  194. with open(sample['im_file'], 'rb') as f:
  195. sample['image'] = f.read()
  196. sample.pop('im_file')
  197. im = sample['image']
  198. data = np.frombuffer(im, dtype='uint8')
  199. im = cv2.imdecode(data, cv2.IMREAD_COLOR) # BGR mode, but need RGB mode
  200. if 'keep_ori_im' in sample and sample['keep_ori_im']:
  201. sample['ori_image'] = im
  202. im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
  203. chip = sample['chip']
  204. x1, y1, x2, y2 = [int(xi) for xi in chip]
  205. im = im[max(y1, 0):min(y2, im.shape[0]), max(x1, 0):min(x2, im.shape[
  206. 1]), :]
  207. sample['image'] = im
  208. h = im.shape[0]
  209. w = im.shape[1]
  210. # sample['im_info'] = [h, w, 1.0]
  211. sample['h'] = h
  212. sample['w'] = w
  213. sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
  214. sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
  215. return sample
  216. @register_op
  217. class Permute(BaseOperator):
  218. def __init__(self):
  219. """
  220. Change the channel to be (C, H, W)
  221. """
  222. super(Permute, self).__init__()
  223. def apply(self, sample, context=None):
  224. im = sample['image']
  225. im = im.transpose((2, 0, 1))
  226. sample['image'] = im
  227. return sample
  228. @register_op
  229. class Lighting(BaseOperator):
  230. """
  231. Lighting the image by eigenvalues and eigenvectors
  232. Args:
  233. eigval (list): eigenvalues
  234. eigvec (list): eigenvectors
  235. alphastd (float): random weight of lighting, 0.1 by default
  236. """
  237. def __init__(self, eigval, eigvec, alphastd=0.1):
  238. super(Lighting, self).__init__()
  239. self.alphastd = alphastd
  240. self.eigval = np.array(eigval).astype('float32')
  241. self.eigvec = np.array(eigvec).astype('float32')
  242. def apply(self, sample, context=None):
  243. alpha = np.random.normal(scale=self.alphastd, size=(3, ))
  244. sample['image'] += np.dot(self.eigvec, self.eigval * alpha)
  245. return sample
  246. @register_op
  247. class RandomErasingImage(BaseOperator):
  248. def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3):
  249. """
  250. Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896
  251. Args:
  252. prob (float): probability to carry out random erasing
  253. lower (float): lower limit of the erasing area ratio
  254. higher (float): upper limit of the erasing area ratio
  255. aspect_ratio (float): aspect ratio of the erasing region
  256. """
  257. super(RandomErasingImage, self).__init__()
  258. self.prob = prob
  259. self.lower = lower
  260. self.higher = higher
  261. self.aspect_ratio = aspect_ratio
  262. def apply(self, sample, context=None):
  263. gt_bbox = sample['gt_bbox']
  264. im = sample['image']
  265. if not isinstance(im, np.ndarray):
  266. raise TypeError("{}: image is not a numpy array.".format(self))
  267. if len(im.shape) != 3:
  268. raise ImageError("{}: image is not 3-dimensional.".format(self))
  269. for idx in range(gt_bbox.shape[0]):
  270. if self.prob <= np.random.rand():
  271. continue
  272. x1, y1, x2, y2 = gt_bbox[idx, :]
  273. w_bbox = x2 - x1
  274. h_bbox = y2 - y1
  275. area = w_bbox * h_bbox
  276. target_area = random.uniform(self.lower, self.higher) * area
  277. aspect_ratio = random.uniform(self.aspect_ratio,
  278. 1 / self.aspect_ratio)
  279. h = int(round(math.sqrt(target_area * aspect_ratio)))
  280. w = int(round(math.sqrt(target_area / aspect_ratio)))
  281. if w < w_bbox and h < h_bbox:
  282. off_y1 = random.randint(0, int(h_bbox - h))
  283. off_x1 = random.randint(0, int(w_bbox - w))
  284. im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int(
  285. x1 + off_x1 + w), :] = 0
  286. sample['image'] = im
  287. return sample
  288. @register_op
  289. class NormalizeImage(BaseOperator):
  290. def __init__(self, mean=[0.485, 0.456, 0.406], std=[1, 1, 1],
  291. is_scale=True):
  292. """
  293. Args:
  294. mean (list): the pixel mean
  295. std (list): the pixel variance
  296. """
  297. super(NormalizeImage, self).__init__()
  298. self.mean = mean
  299. self.std = std
  300. self.is_scale = is_scale
  301. if not (isinstance(self.mean, list) and isinstance(self.std, list) and
  302. isinstance(self.is_scale, bool)):
  303. raise TypeError("{}: input type is invalid.".format(self))
  304. from functools import reduce
  305. if reduce(lambda x, y: x * y, self.std) == 0:
  306. raise ValueError('{}: std is invalid!'.format(self))
  307. def apply(self, sample, context=None):
  308. """Normalize the image.
  309. Operators:
  310. 1.(optional) Scale the image to [0,1]
  311. 2. Each pixel minus mean and is divided by std
  312. """
  313. im = sample['image']
  314. im = im.astype(np.float32, copy=False)
  315. mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
  316. std = np.array(self.std)[np.newaxis, np.newaxis, :]
  317. if self.is_scale:
  318. im = im / 255.0
  319. im -= mean
  320. im /= std
  321. sample['image'] = im
  322. return sample
  323. @register_op
  324. class GridMask(BaseOperator):
  325. def __init__(self,
  326. use_h=True,
  327. use_w=True,
  328. rotate=1,
  329. offset=False,
  330. ratio=0.5,
  331. mode=1,
  332. prob=0.7,
  333. upper_iter=360000):
  334. """
  335. GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086
  336. Args:
  337. use_h (bool): whether to mask vertically
  338. use_w (boo;): whether to mask horizontally
  339. rotate (float): angle for the mask to rotate
  340. offset (float): mask offset
  341. ratio (float): mask ratio
  342. mode (int): gridmask mode
  343. prob (float): max probability to carry out gridmask
  344. upper_iter (int): suggested to be equal to global max_iter
  345. """
  346. super(GridMask, self).__init__()
  347. self.use_h = use_h
  348. self.use_w = use_w
  349. self.rotate = rotate
  350. self.offset = offset
  351. self.ratio = ratio
  352. self.mode = mode
  353. self.prob = prob
  354. self.upper_iter = upper_iter
  355. from .gridmask_utils import Gridmask
  356. self.gridmask_op = Gridmask(
  357. use_h,
  358. use_w,
  359. rotate=rotate,
  360. offset=offset,
  361. ratio=ratio,
  362. mode=mode,
  363. prob=prob,
  364. upper_iter=upper_iter)
  365. def apply(self, sample, context=None):
  366. sample['image'] = self.gridmask_op(sample['image'], sample['curr_iter'])
  367. return sample
  368. @register_op
  369. class RandomDistort(BaseOperator):
  370. """Random color distortion.
  371. Args:
  372. hue (list): hue settings. in [lower, upper, probability] format.
  373. saturation (list): saturation settings. in [lower, upper, probability] format.
  374. contrast (list): contrast settings. in [lower, upper, probability] format.
  375. brightness (list): brightness settings. in [lower, upper, probability] format.
  376. random_apply (bool): whether to apply in random (yolo) or fixed (SSD)
  377. order.
  378. count (int): the number of doing distrot
  379. random_channel (bool): whether to swap channels randomly
  380. """
  381. def __init__(self,
  382. hue=[-18, 18, 0.5],
  383. saturation=[0.5, 1.5, 0.5],
  384. contrast=[0.5, 1.5, 0.5],
  385. brightness=[0.5, 1.5, 0.5],
  386. random_apply=True,
  387. count=4,
  388. random_channel=False):
  389. super(RandomDistort, self).__init__()
  390. self.hue = hue
  391. self.saturation = saturation
  392. self.contrast = contrast
  393. self.brightness = brightness
  394. self.random_apply = random_apply
  395. self.count = count
  396. self.random_channel = random_channel
  397. def apply_hue(self, img):
  398. low, high, prob = self.hue
  399. if np.random.uniform(0., 1.) < prob:
  400. return img
  401. img = img.astype(np.float32)
  402. # it works, but result differ from HSV version
  403. delta = np.random.uniform(low, high)
  404. u = np.cos(delta * np.pi)
  405. w = np.sin(delta * np.pi)
  406. bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]])
  407. tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321],
  408. [0.211, -0.523, 0.311]])
  409. ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647],
  410. [1.0, -1.107, 1.705]])
  411. t = np.dot(np.dot(ityiq, bt), tyiq).T
  412. img = np.dot(img, t)
  413. return img
  414. def apply_saturation(self, img):
  415. low, high, prob = self.saturation
  416. if np.random.uniform(0., 1.) < prob:
  417. return img
  418. delta = np.random.uniform(low, high)
  419. img = img.astype(np.float32)
  420. # it works, but result differ from HSV version
  421. gray = img * np.array([[[0.299, 0.587, 0.114]]], dtype=np.float32)
  422. gray = gray.sum(axis=2, keepdims=True)
  423. gray *= (1.0 - delta)
  424. img *= delta
  425. img += gray
  426. return img
  427. def apply_contrast(self, img):
  428. low, high, prob = self.contrast
  429. if np.random.uniform(0., 1.) < prob:
  430. return img
  431. delta = np.random.uniform(low, high)
  432. img = img.astype(np.float32)
  433. img *= delta
  434. return img
  435. def apply_brightness(self, img):
  436. low, high, prob = self.brightness
  437. if np.random.uniform(0., 1.) < prob:
  438. return img
  439. delta = np.random.uniform(low, high)
  440. img = img.astype(np.float32)
  441. img += delta
  442. return img
  443. def apply(self, sample, context=None):
  444. img = sample['image']
  445. if self.random_apply:
  446. functions = [
  447. self.apply_brightness, self.apply_contrast,
  448. self.apply_saturation, self.apply_hue
  449. ]
  450. distortions = np.random.permutation(functions)[:self.count]
  451. for func in distortions:
  452. img = func(img)
  453. sample['image'] = img
  454. return sample
  455. img = self.apply_brightness(img)
  456. mode = np.random.randint(0, 2)
  457. if mode:
  458. img = self.apply_contrast(img)
  459. img = self.apply_saturation(img)
  460. img = self.apply_hue(img)
  461. if not mode:
  462. img = self.apply_contrast(img)
  463. if self.random_channel:
  464. if np.random.randint(0, 2):
  465. img = img[..., np.random.permutation(3)]
  466. sample['image'] = img
  467. return sample
  468. @register_op
  469. class AutoAugment(BaseOperator):
  470. def __init__(self, autoaug_type="v1"):
  471. """
  472. Args:
  473. autoaug_type (str): autoaug type, support v0, v1, v2, v3, test
  474. """
  475. super(AutoAugment, self).__init__()
  476. self.autoaug_type = autoaug_type
  477. def apply(self, sample, context=None):
  478. """
  479. Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172
  480. """
  481. im = sample['image']
  482. gt_bbox = sample['gt_bbox']
  483. if not isinstance(im, np.ndarray):
  484. raise TypeError("{}: image is not a numpy array.".format(self))
  485. if len(im.shape) != 3:
  486. raise ImageError("{}: image is not 3-dimensional.".format(self))
  487. if len(gt_bbox) == 0:
  488. return sample
  489. height, width, _ = im.shape
  490. norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32)
  491. norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height)
  492. norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width)
  493. norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height)
  494. norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width)
  495. from .autoaugment_utils import distort_image_with_autoaugment
  496. im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox,
  497. self.autoaug_type)
  498. gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width)
  499. gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height)
  500. gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width)
  501. gt_bbox[:, 3] = norm_gt_bbox[:, 2] * float(height)
  502. sample['image'] = im
  503. sample['gt_bbox'] = gt_bbox
  504. return sample
  505. @register_op
  506. class RandomFlip(BaseOperator):
  507. def __init__(self, prob=0.5):
  508. """
  509. Args:
  510. prob (float): the probability of flipping image
  511. """
  512. super(RandomFlip, self).__init__()
  513. self.prob = prob
  514. if not (isinstance(self.prob, float)):
  515. raise TypeError("{}: input type is invalid.".format(self))
  516. def apply_segm(self, segms, height, width):
  517. def _flip_poly(poly, width):
  518. flipped_poly = np.array(poly)
  519. flipped_poly[0::2] = width - np.array(poly[0::2])
  520. return flipped_poly.tolist()
  521. def _flip_rle(rle, height, width):
  522. if 'counts' in rle and type(rle['counts']) == list:
  523. rle = mask_util.frPyObjects(rle, height, width)
  524. mask = mask_util.decode(rle)
  525. mask = mask[:, ::-1]
  526. rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
  527. return rle
  528. flipped_segms = []
  529. for segm in segms:
  530. if is_poly(segm):
  531. # Polygon format
  532. flipped_segms.append([_flip_poly(poly, width) for poly in segm])
  533. else:
  534. # RLE format
  535. import pycocotools.mask as mask_util
  536. flipped_segms.append(_flip_rle(segm, height, width))
  537. return flipped_segms
  538. def apply_keypoint(self, gt_keypoint, width):
  539. for i in range(gt_keypoint.shape[1]):
  540. if i % 2 == 0:
  541. old_x = gt_keypoint[:, i].copy()
  542. gt_keypoint[:, i] = width - old_x
  543. return gt_keypoint
  544. def apply_image(self, image):
  545. return image[:, ::-1, :]
  546. def apply_bbox(self, bbox, width):
  547. oldx1 = bbox[:, 0].copy()
  548. oldx2 = bbox[:, 2].copy()
  549. bbox[:, 0] = width - oldx2
  550. bbox[:, 2] = width - oldx1
  551. return bbox
  552. def apply_rbox(self, bbox, width):
  553. oldx1 = bbox[:, 0].copy()
  554. oldx2 = bbox[:, 2].copy()
  555. oldx3 = bbox[:, 4].copy()
  556. oldx4 = bbox[:, 6].copy()
  557. bbox[:, 0] = width - oldx1
  558. bbox[:, 2] = width - oldx2
  559. bbox[:, 4] = width - oldx3
  560. bbox[:, 6] = width - oldx4
  561. bbox = [bbox_utils.get_best_begin_point_single(e) for e in bbox]
  562. return bbox
  563. def apply(self, sample, context=None):
  564. """Filp the image and bounding box.
  565. Operators:
  566. 1. Flip the image numpy.
  567. 2. Transform the bboxes' x coordinates.
  568. (Must judge whether the coordinates are normalized!)
  569. 3. Transform the segmentations' x coordinates.
  570. (Must judge whether the coordinates are normalized!)
  571. Output:
  572. sample: the image, bounding box and segmentation part
  573. in sample are flipped.
  574. """
  575. if np.random.uniform(0, 1) < self.prob:
  576. im = sample['image']
  577. height, width = im.shape[:2]
  578. im = self.apply_image(im)
  579. if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
  580. sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width)
  581. if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
  582. sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height,
  583. width)
  584. if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
  585. sample['gt_keypoint'] = self.apply_keypoint(
  586. sample['gt_keypoint'], width)
  587. if 'semantic' in sample and sample['semantic']:
  588. sample['semantic'] = sample['semantic'][:, ::-1]
  589. if 'gt_segm' in sample and sample['gt_segm'].any():
  590. sample['gt_segm'] = sample['gt_segm'][:, :, ::-1]
  591. if 'gt_rbox2poly' in sample and sample['gt_rbox2poly'].any():
  592. sample['gt_rbox2poly'] = self.apply_rbox(sample['gt_rbox2poly'],
  593. width)
  594. sample['flipped'] = True
  595. sample['image'] = im
  596. return sample
  597. @register_op
  598. class Resize(BaseOperator):
  599. def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):
  600. """
  601. Resize image to target size. if keep_ratio is True,
  602. resize the image's long side to the maximum of target_size
  603. if keep_ratio is False, resize the image to target size(h, w)
  604. Args:
  605. target_size (int|list): image target size
  606. keep_ratio (bool): whether keep_ratio or not, default true
  607. interp (int): the interpolation method
  608. """
  609. super(Resize, self).__init__()
  610. self.keep_ratio = keep_ratio
  611. self.interp = interp
  612. if not isinstance(target_size, (Integral, Sequence)):
  613. raise TypeError(
  614. "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
  615. format(type(target_size)))
  616. if isinstance(target_size, Integral):
  617. target_size = [target_size, target_size]
  618. self.target_size = target_size
  619. def apply_image(self, image, scale):
  620. im_scale_x, im_scale_y = scale
  621. return cv2.resize(
  622. image,
  623. None,
  624. None,
  625. fx=im_scale_x,
  626. fy=im_scale_y,
  627. interpolation=self.interp)
  628. def apply_bbox(self, bbox, scale, size):
  629. im_scale_x, im_scale_y = scale
  630. resize_w, resize_h = size
  631. bbox[:, 0::2] *= im_scale_x
  632. bbox[:, 1::2] *= im_scale_y
  633. bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
  634. bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
  635. return bbox
  636. def apply_segm(self, segms, im_size, scale):
  637. def _resize_poly(poly, im_scale_x, im_scale_y):
  638. resized_poly = np.array(poly).astype('float32')
  639. resized_poly[0::2] *= im_scale_x
  640. resized_poly[1::2] *= im_scale_y
  641. return resized_poly.tolist()
  642. def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
  643. if 'counts' in rle and type(rle['counts']) == list:
  644. rle = mask_util.frPyObjects(rle, im_h, im_w)
  645. mask = mask_util.decode(rle)
  646. mask = cv2.resize(
  647. mask,
  648. None,
  649. None,
  650. fx=im_scale_x,
  651. fy=im_scale_y,
  652. interpolation=self.interp)
  653. rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
  654. return rle
  655. im_h, im_w = im_size
  656. im_scale_x, im_scale_y = scale
  657. resized_segms = []
  658. for segm in segms:
  659. if is_poly(segm):
  660. # Polygon format
  661. resized_segms.append([
  662. _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
  663. ])
  664. else:
  665. # RLE format
  666. import pycocotools.mask as mask_util
  667. resized_segms.append(
  668. _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))
  669. return resized_segms
  670. def apply(self, sample, context=None):
  671. """ Resize the image numpy.
  672. """
  673. im = sample['image']
  674. if not isinstance(im, np.ndarray):
  675. raise TypeError("{}: image type is not numpy.".format(self))
  676. if len(im.shape) != 3:
  677. raise ImageError('{}: image is not 3-dimensional.'.format(self))
  678. # apply image
  679. im_shape = im.shape
  680. if self.keep_ratio:
  681. im_size_min = np.min(im_shape[0:2])
  682. im_size_max = np.max(im_shape[0:2])
  683. target_size_min = np.min(self.target_size)
  684. target_size_max = np.max(self.target_size)
  685. im_scale = min(target_size_min / im_size_min,
  686. target_size_max / im_size_max)
  687. resize_h = im_scale * float(im_shape[0])
  688. resize_w = im_scale * float(im_shape[1])
  689. im_scale_x = im_scale
  690. im_scale_y = im_scale
  691. else:
  692. resize_h, resize_w = self.target_size
  693. im_scale_y = resize_h / im_shape[0]
  694. im_scale_x = resize_w / im_shape[1]
  695. im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
  696. sample['image'] = im
  697. sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
  698. if 'scale_factor' in sample:
  699. scale_factor = sample['scale_factor']
  700. sample['scale_factor'] = np.asarray(
  701. [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
  702. dtype=np.float32)
  703. else:
  704. sample['scale_factor'] = np.asarray(
  705. [im_scale_y, im_scale_x], dtype=np.float32)
  706. # apply bbox
  707. if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
  708. sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
  709. [im_scale_x, im_scale_y],
  710. [resize_w, resize_h])
  711. # apply rbox
  712. if 'gt_rbox2poly' in sample:
  713. if np.array(sample['gt_rbox2poly']).shape[1] != 8:
  714. logger.warning(
  715. "gt_rbox2poly's length shoule be 8, but actually is {}".
  716. format(len(sample['gt_rbox2poly'])))
  717. sample['gt_rbox2poly'] = self.apply_bbox(sample['gt_rbox2poly'],
  718. [im_scale_x, im_scale_y],
  719. [resize_w, resize_h])
  720. # apply polygon
  721. if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
  722. sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],
  723. [im_scale_x, im_scale_y])
  724. # apply semantic
  725. if 'semantic' in sample and sample['semantic']:
  726. semantic = sample['semantic']
  727. semantic = cv2.resize(
  728. semantic.astype('float32'),
  729. None,
  730. None,
  731. fx=im_scale_x,
  732. fy=im_scale_y,
  733. interpolation=self.interp)
  734. semantic = np.asarray(semantic).astype('int32')
  735. semantic = np.expand_dims(semantic, 0)
  736. sample['semantic'] = semantic
  737. # apply gt_segm
  738. if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
  739. masks = [
  740. cv2.resize(
  741. gt_segm,
  742. None,
  743. None,
  744. fx=im_scale_x,
  745. fy=im_scale_y,
  746. interpolation=cv2.INTER_NEAREST)
  747. for gt_segm in sample['gt_segm']
  748. ]
  749. sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
  750. return sample
  751. @register_op
  752. class MultiscaleTestResize(BaseOperator):
  753. def __init__(self,
  754. origin_target_size=[800, 1333],
  755. target_size=[],
  756. interp=cv2.INTER_LINEAR,
  757. use_flip=True):
  758. """
  759. Rescale image to the each size in target size, and capped at max_size.
  760. Args:
  761. origin_target_size (list): origin target size of image
  762. target_size (list): A list of target sizes of image.
  763. interp (int): the interpolation method.
  764. use_flip (bool): whether use flip augmentation.
  765. """
  766. super(MultiscaleTestResize, self).__init__()
  767. self.interp = interp
  768. self.use_flip = use_flip
  769. if not isinstance(target_size, Sequence):
  770. raise TypeError(
  771. "Type of target_size is invalid. Must be List or Tuple, now is {}".
  772. format(type(target_size)))
  773. self.target_size = target_size
  774. if not isinstance(origin_target_size, Sequence):
  775. raise TypeError(
  776. "Type of origin_target_size is invalid. Must be List or Tuple, now is {}".
  777. format(type(origin_target_size)))
  778. self.origin_target_size = origin_target_size
  779. def apply(self, sample, context=None):
  780. """ Resize the image numpy for multi-scale test.
  781. """
  782. samples = []
  783. resizer = Resize(
  784. self.origin_target_size, keep_ratio=True, interp=self.interp)
  785. samples.append(resizer(sample.copy(), context))
  786. if self.use_flip:
  787. flipper = RandomFlip(1.1)
  788. samples.append(flipper(sample.copy(), context=context))
  789. for size in self.target_size:
  790. resizer = Resize(size, keep_ratio=True, interp=self.interp)
  791. samples.append(resizer(sample.copy(), context))
  792. return samples
  793. @register_op
  794. class RandomResize(BaseOperator):
  795. def __init__(self,
  796. target_size,
  797. keep_ratio=True,
  798. interp=cv2.INTER_LINEAR,
  799. random_size=True,
  800. random_interp=False):
  801. """
  802. Resize image to target size randomly. random target_size and interpolation method
  803. Args:
  804. target_size (int, list, tuple): image target size, if random size is True, must be list or tuple
  805. keep_ratio (bool): whether keep_raio or not, default true
  806. interp (int): the interpolation method
  807. random_size (bool): whether random select target size of image
  808. random_interp (bool): whether random select interpolation method
  809. """
  810. super(RandomResize, self).__init__()
  811. self.keep_ratio = keep_ratio
  812. self.interp = interp
  813. self.interps = [
  814. cv2.INTER_NEAREST,
  815. cv2.INTER_LINEAR,
  816. cv2.INTER_AREA,
  817. cv2.INTER_CUBIC,
  818. cv2.INTER_LANCZOS4,
  819. ]
  820. assert isinstance(target_size, (
  821. Integral, Sequence)), "target_size must be Integer, List or Tuple"
  822. if random_size and not isinstance(target_size, Sequence):
  823. raise TypeError(
  824. "Type of target_size is invalid when random_size is True. Must be List or Tuple, now is {}".
  825. format(type(target_size)))
  826. self.target_size = target_size
  827. self.random_size = random_size
  828. self.random_interp = random_interp
  829. def apply(self, sample, context=None):
  830. """ Resize the image numpy.
  831. """
  832. if self.random_size:
  833. target_size = random.choice(self.target_size)
  834. else:
  835. target_size = self.target_size
  836. if self.random_interp:
  837. interp = random.choice(self.interps)
  838. else:
  839. interp = self.interp
  840. resizer = Resize(target_size, self.keep_ratio, interp)
  841. return resizer(sample, context=context)
  842. @register_op
  843. class RandomExpand(BaseOperator):
  844. """Random expand the canvas.
  845. Args:
  846. ratio (float): maximum expansion ratio.
  847. prob (float): probability to expand.
  848. fill_value (list): color value used to fill the canvas. in RGB order.
  849. """
  850. def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):
  851. super(RandomExpand, self).__init__()
  852. assert ratio > 1.01, "expand ratio must be larger than 1.01"
  853. self.ratio = ratio
  854. self.prob = prob
  855. assert isinstance(fill_value, (Number, Sequence)), \
  856. "fill value must be either float or sequence"
  857. if isinstance(fill_value, Number):
  858. fill_value = (fill_value, ) * 3
  859. if not isinstance(fill_value, tuple):
  860. fill_value = tuple(fill_value)
  861. self.fill_value = fill_value
  862. def apply(self, sample, context=None):
  863. if np.random.uniform(0., 1.) < self.prob:
  864. return sample
  865. im = sample['image']
  866. height, width = im.shape[:2]
  867. ratio = np.random.uniform(1., self.ratio)
  868. h = int(height * ratio)
  869. w = int(width * ratio)
  870. if not h > height or not w > width:
  871. return sample
  872. y = np.random.randint(0, h - height)
  873. x = np.random.randint(0, w - width)
  874. offsets, size = [x, y], [h, w]
  875. pad = Pad(size,
  876. pad_mode=-1,
  877. offsets=offsets,
  878. fill_value=self.fill_value)
  879. return pad(sample, context=context)
  880. @register_op
  881. class CropWithSampling(BaseOperator):
  882. def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True):
  883. """
  884. Args:
  885. batch_sampler (list): Multiple sets of different
  886. parameters for cropping.
  887. satisfy_all (bool): whether all boxes must satisfy.
  888. e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
  889. [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],
  890. [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],
  891. [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],
  892. [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],
  893. [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],
  894. [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]
  895. [max sample, max trial, min scale, max scale,
  896. min aspect ratio, max aspect ratio,
  897. min overlap, max overlap]
  898. avoid_no_bbox (bool): whether to avoid the
  899. situation where the box does not appear.
  900. """
  901. super(CropWithSampling, self).__init__()
  902. self.batch_sampler = batch_sampler
  903. self.satisfy_all = satisfy_all
  904. self.avoid_no_bbox = avoid_no_bbox
  905. def apply(self, sample, context):
  906. """
  907. Crop the image and modify bounding box.
  908. Operators:
  909. 1. Scale the image width and height.
  910. 2. Crop the image according to a radom sample.
  911. 3. Rescale the bounding box.
  912. 4. Determine if the new bbox is satisfied in the new image.
  913. Returns:
  914. sample: the image, bounding box are replaced.
  915. """
  916. assert 'image' in sample, "image data not found"
  917. im = sample['image']
  918. gt_bbox = sample['gt_bbox']
  919. gt_class = sample['gt_class']
  920. im_height, im_width = im.shape[:2]
  921. gt_score = None
  922. if 'gt_score' in sample:
  923. gt_score = sample['gt_score']
  924. sampled_bbox = []
  925. gt_bbox = gt_bbox.tolist()
  926. for sampler in self.batch_sampler:
  927. found = 0
  928. for i in range(sampler[1]):
  929. if found >= sampler[0]:
  930. break
  931. sample_bbox = generate_sample_bbox(sampler)
  932. if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox,
  933. self.satisfy_all):
  934. sampled_bbox.append(sample_bbox)
  935. found = found + 1
  936. im = np.array(im)
  937. while sampled_bbox:
  938. idx = int(np.random.uniform(0, len(sampled_bbox)))
  939. sample_bbox = sampled_bbox.pop(idx)
  940. sample_bbox = clip_bbox(sample_bbox)
  941. crop_bbox, crop_class, crop_score = \
  942. filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score)
  943. if self.avoid_no_bbox:
  944. if len(crop_bbox) < 1:
  945. continue
  946. xmin = int(sample_bbox[0] * im_width)
  947. xmax = int(sample_bbox[2] * im_width)
  948. ymin = int(sample_bbox[1] * im_height)
  949. ymax = int(sample_bbox[3] * im_height)
  950. im = im[ymin:ymax, xmin:xmax]
  951. sample['image'] = im
  952. sample['gt_bbox'] = crop_bbox
  953. sample['gt_class'] = crop_class
  954. sample['gt_score'] = crop_score
  955. return sample
  956. return sample
  957. @register_op
  958. class CropWithDataAchorSampling(BaseOperator):
  959. def __init__(self,
  960. batch_sampler,
  961. anchor_sampler=None,
  962. target_size=None,
  963. das_anchor_scales=[16, 32, 64, 128],
  964. sampling_prob=0.5,
  965. min_size=8.,
  966. avoid_no_bbox=True):
  967. """
  968. Args:
  969. anchor_sampler (list): anchor_sampling sets of different
  970. parameters for cropping.
  971. batch_sampler (list): Multiple sets of different
  972. parameters for cropping.
  973. e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]]
  974. [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
  975. [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
  976. [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
  977. [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
  978. [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]]
  979. [max sample, max trial, min scale, max scale,
  980. min aspect ratio, max aspect ratio,
  981. min overlap, max overlap, min coverage, max coverage]
  982. target_size (int): target image size.
  983. das_anchor_scales (list[float]): a list of anchor scales in data
  984. anchor smapling.
  985. min_size (float): minimum size of sampled bbox.
  986. avoid_no_bbox (bool): whether to avoid the
  987. situation where the box does not appear.
  988. """
  989. super(CropWithDataAchorSampling, self).__init__()
  990. self.anchor_sampler = anchor_sampler
  991. self.batch_sampler = batch_sampler
  992. self.target_size = target_size
  993. self.sampling_prob = sampling_prob
  994. self.min_size = min_size
  995. self.avoid_no_bbox = avoid_no_bbox
  996. self.das_anchor_scales = np.array(das_anchor_scales)
  997. def apply(self, sample, context):
  998. """
  999. Crop the image and modify bounding box.
  1000. Operators:
  1001. 1. Scale the image width and height.
  1002. 2. Crop the image according to a radom sample.
  1003. 3. Rescale the bounding box.
  1004. 4. Determine if the new bbox is satisfied in the new image.
  1005. Returns:
  1006. sample: the image, bounding box are replaced.
  1007. """
  1008. assert 'image' in sample, "image data not found"
  1009. im = sample['image']
  1010. gt_bbox = sample['gt_bbox']
  1011. gt_class = sample['gt_class']
  1012. image_height, image_width = im.shape[:2]
  1013. gt_bbox[:, 0] /= image_width
  1014. gt_bbox[:, 1] /= image_height
  1015. gt_bbox[:, 2] /= image_width
  1016. gt_bbox[:, 3] /= image_height
  1017. gt_score = None
  1018. if 'gt_score' in sample:
  1019. gt_score = sample['gt_score']
  1020. sampled_bbox = []
  1021. gt_bbox = gt_bbox.tolist()
  1022. prob = np.random.uniform(0., 1.)
  1023. if prob > self.sampling_prob: # anchor sampling
  1024. assert self.anchor_sampler
  1025. for sampler in self.anchor_sampler:
  1026. found = 0
  1027. for i in range(sampler[1]):
  1028. if found >= sampler[0]:
  1029. break
  1030. sample_bbox = data_anchor_sampling(
  1031. gt_bbox, image_width, image_height,
  1032. self.das_anchor_scales, self.target_size)
  1033. if sample_bbox == 0:
  1034. break
  1035. if satisfy_sample_constraint_coverage(sampler, sample_bbox,
  1036. gt_bbox):
  1037. sampled_bbox.append(sample_bbox)
  1038. found = found + 1
  1039. im = np.array(im)
  1040. while sampled_bbox:
  1041. idx = int(np.random.uniform(0, len(sampled_bbox)))
  1042. sample_bbox = sampled_bbox.pop(idx)
  1043. if 'gt_keypoint' in sample.keys():
  1044. keypoints = (sample['gt_keypoint'],
  1045. sample['keypoint_ignore'])
  1046. crop_bbox, crop_class, crop_score, gt_keypoints = \
  1047. filter_and_process(sample_bbox, gt_bbox, gt_class,
  1048. scores=gt_score,
  1049. keypoints=keypoints)
  1050. else:
  1051. crop_bbox, crop_class, crop_score = filter_and_process(
  1052. sample_bbox, gt_bbox, gt_class, scores=gt_score)
  1053. crop_bbox, crop_class, crop_score = bbox_area_sampling(
  1054. crop_bbox, crop_class, crop_score, self.target_size,
  1055. self.min_size)
  1056. if self.avoid_no_bbox:
  1057. if len(crop_bbox) < 1:
  1058. continue
  1059. im = crop_image_sampling(im, sample_bbox, image_width,
  1060. image_height, self.target_size)
  1061. height, width = im.shape[:2]
  1062. crop_bbox[:, 0] *= width
  1063. crop_bbox[:, 1] *= height
  1064. crop_bbox[:, 2] *= width
  1065. crop_bbox[:, 3] *= height
  1066. sample['image'] = im
  1067. sample['gt_bbox'] = crop_bbox
  1068. sample['gt_class'] = crop_class
  1069. if 'gt_score' in sample:
  1070. sample['gt_score'] = crop_score
  1071. if 'gt_keypoint' in sample.keys():
  1072. sample['gt_keypoint'] = gt_keypoints[0]
  1073. sample['keypoint_ignore'] = gt_keypoints[1]
  1074. return sample
  1075. return sample
  1076. else:
  1077. for sampler in self.batch_sampler:
  1078. found = 0
  1079. for i in range(sampler[1]):
  1080. if found >= sampler[0]:
  1081. break
  1082. sample_bbox = generate_sample_bbox_square(
  1083. sampler, image_width, image_height)
  1084. if satisfy_sample_constraint_coverage(sampler, sample_bbox,
  1085. gt_bbox):
  1086. sampled_bbox.append(sample_bbox)
  1087. found = found + 1
  1088. im = np.array(im)
  1089. while sampled_bbox:
  1090. idx = int(np.random.uniform(0, len(sampled_bbox)))
  1091. sample_bbox = sampled_bbox.pop(idx)
  1092. sample_bbox = clip_bbox(sample_bbox)
  1093. if 'gt_keypoint' in sample.keys():
  1094. keypoints = (sample['gt_keypoint'],
  1095. sample['keypoint_ignore'])
  1096. crop_bbox, crop_class, crop_score, gt_keypoints = \
  1097. filter_and_process(sample_bbox, gt_bbox, gt_class,
  1098. scores=gt_score,
  1099. keypoints=keypoints)
  1100. else:
  1101. crop_bbox, crop_class, crop_score = filter_and_process(
  1102. sample_bbox, gt_bbox, gt_class, scores=gt_score)
  1103. # sampling bbox according the bbox area
  1104. crop_bbox, crop_class, crop_score = bbox_area_sampling(
  1105. crop_bbox, crop_class, crop_score, self.target_size,
  1106. self.min_size)
  1107. if self.avoid_no_bbox:
  1108. if len(crop_bbox) < 1:
  1109. continue
  1110. xmin = int(sample_bbox[0] * image_width)
  1111. xmax = int(sample_bbox[2] * image_width)
  1112. ymin = int(sample_bbox[1] * image_height)
  1113. ymax = int(sample_bbox[3] * image_height)
  1114. im = im[ymin:ymax, xmin:xmax]
  1115. height, width = im.shape[:2]
  1116. crop_bbox[:, 0] *= width
  1117. crop_bbox[:, 1] *= height
  1118. crop_bbox[:, 2] *= width
  1119. crop_bbox[:, 3] *= height
  1120. sample['image'] = im
  1121. sample['gt_bbox'] = crop_bbox
  1122. sample['gt_class'] = crop_class
  1123. if 'gt_score' in sample:
  1124. sample['gt_score'] = crop_score
  1125. if 'gt_keypoint' in sample.keys():
  1126. sample['gt_keypoint'] = gt_keypoints[0]
  1127. sample['keypoint_ignore'] = gt_keypoints[1]
  1128. return sample
  1129. return sample
  1130. @register_op
  1131. class RandomCrop(BaseOperator):
  1132. """Random crop image and bboxes.
  1133. Args:
  1134. aspect_ratio (list): aspect ratio of cropped region.
  1135. in [min, max] format.
  1136. thresholds (list): iou thresholds for decide a valid bbox crop.
  1137. scaling (list): ratio between a cropped region and the original image.
  1138. in [min, max] format.
  1139. num_attempts (int): number of tries before giving up.
  1140. allow_no_crop (bool): allow return without actually cropping them.
  1141. cover_all_box (bool): ensure all bboxes are covered in the final crop.
  1142. is_mask_crop(bool): whether crop the segmentation.
  1143. """
  1144. def __init__(self,
  1145. aspect_ratio=[.5, 2.],
  1146. thresholds=[.0, .1, .3, .5, .7, .9],
  1147. scaling=[.3, 1.],
  1148. num_attempts=50,
  1149. allow_no_crop=True,
  1150. cover_all_box=False,
  1151. is_mask_crop=False):
  1152. super(RandomCrop, self).__init__()
  1153. self.aspect_ratio = aspect_ratio
  1154. self.thresholds = thresholds
  1155. self.scaling = scaling
  1156. self.num_attempts = num_attempts
  1157. self.allow_no_crop = allow_no_crop
  1158. self.cover_all_box = cover_all_box
  1159. self.is_mask_crop = is_mask_crop
  1160. def crop_segms(self, segms, valid_ids, crop, height, width):
  1161. def _crop_poly(segm, crop):
  1162. xmin, ymin, xmax, ymax = crop
  1163. crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
  1164. crop_p = np.array(crop_coord).reshape(4, 2)
  1165. crop_p = Polygon(crop_p)
  1166. crop_segm = list()
  1167. for poly in segm:
  1168. poly = np.array(poly).reshape(len(poly) // 2, 2)
  1169. polygon = Polygon(poly)
  1170. if not polygon.is_valid:
  1171. exterior = polygon.exterior
  1172. multi_lines = exterior.intersection(exterior)
  1173. polygons = shapely.ops.polygonize(multi_lines)
  1174. polygon = MultiPolygon(polygons)
  1175. multi_polygon = list()
  1176. if isinstance(polygon, MultiPolygon):
  1177. multi_polygon = copy.deepcopy(polygon)
  1178. else:
  1179. multi_polygon.append(copy.deepcopy(polygon))
  1180. for per_polygon in multi_polygon:
  1181. inter = per_polygon.intersection(crop_p)
  1182. if not inter:
  1183. continue
  1184. if isinstance(inter, (MultiPolygon, GeometryCollection)):
  1185. for part in inter:
  1186. if not isinstance(part, Polygon):
  1187. continue
  1188. part = np.squeeze(
  1189. np.array(part.exterior.coords[:-1]).reshape(1,
  1190. -1))
  1191. part[0::2] -= xmin
  1192. part[1::2] -= ymin
  1193. crop_segm.append(part.tolist())
  1194. elif isinstance(inter, Polygon):
  1195. crop_poly = np.squeeze(
  1196. np.array(inter.exterior.coords[:-1]).reshape(1, -1))
  1197. crop_poly[0::2] -= xmin
  1198. crop_poly[1::2] -= ymin
  1199. crop_segm.append(crop_poly.tolist())
  1200. else:
  1201. continue
  1202. return crop_segm
  1203. def _crop_rle(rle, crop, height, width):
  1204. if 'counts' in rle and type(rle['counts']) == list:
  1205. rle = mask_util.frPyObjects(rle, height, width)
  1206. mask = mask_util.decode(rle)
  1207. mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
  1208. rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
  1209. return rle
  1210. crop_segms = []
  1211. for id in valid_ids:
  1212. segm = segms[id]
  1213. if is_poly(segm):
  1214. import copy
  1215. import shapely.ops
  1216. from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
  1217. logging.getLogger("shapely").setLevel(logging.WARNING)
  1218. # Polygon format
  1219. crop_segms.append(_crop_poly(segm, crop))
  1220. else:
  1221. # RLE format
  1222. import pycocotools.mask as mask_util
  1223. crop_segms.append(_crop_rle(segm, crop, height, width))
  1224. return crop_segms
  1225. def apply(self, sample, context=None):
  1226. if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
  1227. return sample
  1228. h, w = sample['image'].shape[:2]
  1229. gt_bbox = sample['gt_bbox']
  1230. # NOTE Original method attempts to generate one candidate for each
  1231. # threshold then randomly sample one from the resulting list.
  1232. # Here a short circuit approach is taken, i.e., randomly choose a
  1233. # threshold and attempt to find a valid crop, and simply return the
  1234. # first one found.
  1235. # The probability is not exactly the same, kinda resembling the
  1236. # "Monty Hall" problem. Actually carrying out the attempts will affect
  1237. # observability (just like opening doors in the "Monty Hall" game).
  1238. thresholds = list(self.thresholds)
  1239. if self.allow_no_crop:
  1240. thresholds.append('no_crop')
  1241. np.random.shuffle(thresholds)
  1242. for thresh in thresholds:
  1243. if thresh == 'no_crop':
  1244. return sample
  1245. found = False
  1246. for i in range(self.num_attempts):
  1247. scale = np.random.uniform(*self.scaling)
  1248. if self.aspect_ratio is not None:
  1249. min_ar, max_ar = self.aspect_ratio
  1250. aspect_ratio = np.random.uniform(
  1251. max(min_ar, scale**2), min(max_ar, scale**-2))
  1252. h_scale = scale / np.sqrt(aspect_ratio)
  1253. w_scale = scale * np.sqrt(aspect_ratio)
  1254. else:
  1255. h_scale = np.random.uniform(*self.scaling)
  1256. w_scale = np.random.uniform(*self.scaling)
  1257. crop_h = h * h_scale
  1258. crop_w = w * w_scale
  1259. if self.aspect_ratio is None:
  1260. if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0:
  1261. continue
  1262. crop_h = int(crop_h)
  1263. crop_w = int(crop_w)
  1264. crop_y = np.random.randint(0, h - crop_h)
  1265. crop_x = np.random.randint(0, w - crop_w)
  1266. crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
  1267. iou = self._iou_matrix(
  1268. gt_bbox, np.array(
  1269. [crop_box], dtype=np.float32))
  1270. if iou.max() < thresh:
  1271. continue
  1272. if self.cover_all_box and iou.min() < thresh:
  1273. continue
  1274. cropped_box, valid_ids = self._crop_box_with_center_constraint(
  1275. gt_bbox, np.array(
  1276. crop_box, dtype=np.float32))
  1277. if valid_ids.size > 0:
  1278. found = True
  1279. break
  1280. if found:
  1281. if self.is_mask_crop and 'gt_poly' in sample and len(sample[
  1282. 'gt_poly']) > 0:
  1283. crop_polys = self.crop_segms(
  1284. sample['gt_poly'],
  1285. valid_ids,
  1286. np.array(
  1287. crop_box, dtype=np.int64),
  1288. h,
  1289. w)
  1290. if [] in crop_polys:
  1291. delete_id = list()
  1292. valid_polys = list()
  1293. for id, crop_poly in enumerate(crop_polys):
  1294. if crop_poly == []:
  1295. delete_id.append(id)
  1296. else:
  1297. valid_polys.append(crop_poly)
  1298. valid_ids = np.delete(valid_ids, delete_id)
  1299. if len(valid_polys) == 0:
  1300. return sample
  1301. sample['gt_poly'] = valid_polys
  1302. else:
  1303. sample['gt_poly'] = crop_polys
  1304. if 'gt_segm' in sample:
  1305. sample['gt_segm'] = self._crop_segm(sample['gt_segm'],
  1306. crop_box)
  1307. sample['gt_segm'] = np.take(
  1308. sample['gt_segm'], valid_ids, axis=0)
  1309. sample['image'] = self._crop_image(sample['image'], crop_box)
  1310. sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
  1311. sample['gt_class'] = np.take(
  1312. sample['gt_class'], valid_ids, axis=0)
  1313. if 'gt_score' in sample:
  1314. sample['gt_score'] = np.take(
  1315. sample['gt_score'], valid_ids, axis=0)
  1316. if 'is_crowd' in sample:
  1317. sample['is_crowd'] = np.take(
  1318. sample['is_crowd'], valid_ids, axis=0)
  1319. if 'difficult' in sample:
  1320. sample['difficult'] = np.take(
  1321. sample['difficult'], valid_ids, axis=0)
  1322. return sample
  1323. return sample
  1324. def _iou_matrix(self, a, b):
  1325. tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
  1326. br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
  1327. area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
  1328. area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
  1329. area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
  1330. area_o = (area_a[:, np.newaxis] + area_b - area_i)
  1331. return area_i / (area_o + 1e-10)
  1332. def _crop_box_with_center_constraint(self, box, crop):
  1333. cropped_box = box.copy()
  1334. cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
  1335. cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
  1336. cropped_box[:, :2] -= crop[:2]
  1337. cropped_box[:, 2:] -= crop[:2]
  1338. centers = (box[:, :2] + box[:, 2:]) / 2
  1339. valid = np.logical_and(crop[:2] <= centers,
  1340. centers < crop[2:]).all(axis=1)
  1341. valid = np.logical_and(
  1342. valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
  1343. return cropped_box, np.where(valid)[0]
  1344. def _crop_image(self, img, crop):
  1345. x1, y1, x2, y2 = crop
  1346. return img[y1:y2, x1:x2, :]
  1347. def _crop_segm(self, segm, crop):
  1348. x1, y1, x2, y2 = crop
  1349. return segm[:, y1:y2, x1:x2]
  1350. @register_op
  1351. class RandomScaledCrop(BaseOperator):
  1352. """Resize image and bbox based on long side (with optional random scaling),
  1353. then crop or pad image to target size.
  1354. Args:
  1355. target_dim (int): target size.
  1356. scale_range (list): random scale range.
  1357. interp (int): interpolation method, default to `cv2.INTER_LINEAR`.
  1358. """
  1359. def __init__(self,
  1360. target_dim=512,
  1361. scale_range=[.1, 2.],
  1362. interp=cv2.INTER_LINEAR):
  1363. super(RandomScaledCrop, self).__init__()
  1364. self.target_dim = target_dim
  1365. self.scale_range = scale_range
  1366. self.interp = interp
  1367. def apply(self, sample, context=None):
  1368. img = sample['image']
  1369. h, w = img.shape[:2]
  1370. random_scale = np.random.uniform(*self.scale_range)
  1371. dim = self.target_dim
  1372. random_dim = int(dim * random_scale)
  1373. dim_max = max(h, w)
  1374. scale = random_dim / dim_max
  1375. resize_w = w * scale
  1376. resize_h = h * scale
  1377. offset_x = int(max(0, np.random.uniform(0., resize_w - dim)))
  1378. offset_y = int(max(0, np.random.uniform(0., resize_h - dim)))
  1379. img = cv2.resize(img, (resize_w, resize_h), interpolation=self.interp)
  1380. img = np.array(img)
  1381. canvas = np.zeros((dim, dim, 3), dtype=img.dtype)
  1382. canvas[:min(dim, resize_h), :min(dim, resize_w), :] = img[
  1383. offset_y:offset_y + dim, offset_x:offset_x + dim, :]
  1384. sample['image'] = canvas
  1385. sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
  1386. scale_factor = sample['sacle_factor']
  1387. sample['scale_factor'] = np.asarray(
  1388. [scale_factor[0] * scale, scale_factor[1] * scale],
  1389. dtype=np.float32)
  1390. if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
  1391. scale_array = np.array([scale, scale] * 2, dtype=np.float32)
  1392. shift_array = np.array([offset_x, offset_y] * 2, dtype=np.float32)
  1393. boxes = sample['gt_bbox'] * scale_array - shift_array
  1394. boxes = np.clip(boxes, 0, dim - 1)
  1395. # filter boxes with no area
  1396. area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1)
  1397. valid = (area > 1.).nonzero()[0]
  1398. sample['gt_bbox'] = boxes[valid]
  1399. sample['gt_class'] = sample['gt_class'][valid]
  1400. return sample
  1401. @register_op
  1402. class Cutmix(BaseOperator):
  1403. def __init__(self, alpha=1.5, beta=1.5):
  1404. """
  1405. CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899
  1406. Cutmix image and gt_bbbox/gt_score
  1407. Args:
  1408. alpha (float): alpha parameter of beta distribute
  1409. beta (float): beta parameter of beta distribute
  1410. """
  1411. super(Cutmix, self).__init__()
  1412. self.alpha = alpha
  1413. self.beta = beta
  1414. if self.alpha <= 0.0:
  1415. raise ValueError("alpha shold be positive in {}".format(self))
  1416. if self.beta <= 0.0:
  1417. raise ValueError("beta shold be positive in {}".format(self))
  1418. def apply_image(self, img1, img2, factor):
  1419. """ _rand_bbox """
  1420. h = max(img1.shape[0], img2.shape[0])
  1421. w = max(img1.shape[1], img2.shape[1])
  1422. cut_rat = np.sqrt(1. - factor)
  1423. cut_w = np.int32(w * cut_rat)
  1424. cut_h = np.int32(h * cut_rat)
  1425. # uniform
  1426. cx = np.random.randint(w)
  1427. cy = np.random.randint(h)
  1428. bbx1 = np.clip(cx - cut_w // 2, 0, w - 1)
  1429. bby1 = np.clip(cy - cut_h // 2, 0, h - 1)
  1430. bbx2 = np.clip(cx + cut_w // 2, 0, w - 1)
  1431. bby2 = np.clip(cy + cut_h // 2, 0, h - 1)
  1432. img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32')
  1433. img_1_pad[:img1.shape[0], :img1.shape[1], :] = \
  1434. img1.astype('float32')
  1435. img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32')
  1436. img_2_pad[:img2.shape[0], :img2.shape[1], :] = \
  1437. img2.astype('float32')
  1438. img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :]
  1439. return img_1_pad
  1440. def __call__(self, sample, context=None):
  1441. if not isinstance(sample, Sequence):
  1442. return sample
  1443. assert len(sample) == 2, 'cutmix need two samples'
  1444. factor = np.random.beta(self.alpha, self.beta)
  1445. factor = max(0.0, min(1.0, factor))
  1446. if factor >= 1.0:
  1447. return sample[0]
  1448. if factor <= 0.0:
  1449. return sample[1]
  1450. img1 = sample[0]['image']
  1451. img2 = sample[1]['image']
  1452. img = self.apply_image(img1, img2, factor)
  1453. gt_bbox1 = sample[0]['gt_bbox']
  1454. gt_bbox2 = sample[1]['gt_bbox']
  1455. gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)
  1456. gt_class1 = sample[0]['gt_class']
  1457. gt_class2 = sample[1]['gt_class']
  1458. gt_class = np.concatenate((gt_class1, gt_class2), axis=0)
  1459. gt_score1 = np.ones_like(sample[0]['gt_class'])
  1460. gt_score2 = np.ones_like(sample[1]['gt_class'])
  1461. gt_score = np.concatenate(
  1462. (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
  1463. result = copy.deepcopy(sample[0])
  1464. result['image'] = img
  1465. result['gt_bbox'] = gt_bbox
  1466. result['gt_score'] = gt_score
  1467. result['gt_class'] = gt_class
  1468. if 'is_crowd' in sample[0]:
  1469. is_crowd1 = sample[0]['is_crowd']
  1470. is_crowd2 = sample[1]['is_crowd']
  1471. is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)
  1472. result['is_crowd'] = is_crowd
  1473. if 'difficult' in sample[0]:
  1474. is_difficult1 = sample[0]['difficult']
  1475. is_difficult2 = sample[1]['difficult']
  1476. is_difficult = np.concatenate(
  1477. (is_difficult1, is_difficult2), axis=0)
  1478. result['difficult'] = is_difficult
  1479. return result
  1480. @register_op
  1481. class Mixup(BaseOperator):
  1482. def __init__(self, alpha=1.5, beta=1.5):
  1483. """ Mixup image and gt_bbbox/gt_score
  1484. Args:
  1485. alpha (float): alpha parameter of beta distribute
  1486. beta (float): beta parameter of beta distribute
  1487. """
  1488. super(Mixup, self).__init__()
  1489. self.alpha = alpha
  1490. self.beta = beta
  1491. if self.alpha <= 0.0:
  1492. raise ValueError("alpha shold be positive in {}".format(self))
  1493. if self.beta <= 0.0:
  1494. raise ValueError("beta shold be positive in {}".format(self))
  1495. def apply_image(self, img1, img2, factor):
  1496. h = max(img1.shape[0], img2.shape[0])
  1497. w = max(img1.shape[1], img2.shape[1])
  1498. img = np.zeros((h, w, img1.shape[2]), 'float32')
  1499. img[:img1.shape[0], :img1.shape[1], :] = \
  1500. img1.astype('float32') * factor
  1501. img[:img2.shape[0], :img2.shape[1], :] += \
  1502. img2.astype('float32') * (1.0 - factor)
  1503. return img.astype('uint8')
  1504. def __call__(self, sample, context=None):
  1505. if not isinstance(sample, Sequence):
  1506. return sample
  1507. assert len(sample) == 2, 'mixup need two samples'
  1508. factor = np.random.beta(self.alpha, self.beta)
  1509. factor = max(0.0, min(1.0, factor))
  1510. if factor >= 1.0:
  1511. return sample[0]
  1512. if factor <= 0.0:
  1513. return sample[1]
  1514. im = self.apply_image(sample[0]['image'], sample[1]['image'], factor)
  1515. result = copy.deepcopy(sample[0])
  1516. result['image'] = im
  1517. # apply bbox and score
  1518. if 'gt_bbox' in sample[0]:
  1519. gt_bbox1 = sample[0]['gt_bbox']
  1520. gt_bbox2 = sample[1]['gt_bbox']
  1521. gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)
  1522. result['gt_bbox'] = gt_bbox
  1523. if 'gt_class' in sample[0]:
  1524. gt_class1 = sample[0]['gt_class']
  1525. gt_class2 = sample[1]['gt_class']
  1526. gt_class = np.concatenate((gt_class1, gt_class2), axis=0)
  1527. result['gt_class'] = gt_class
  1528. gt_score1 = np.ones_like(sample[0]['gt_class'])
  1529. gt_score2 = np.ones_like(sample[1]['gt_class'])
  1530. gt_score = np.concatenate(
  1531. (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
  1532. result['gt_score'] = gt_score.astype('float32')
  1533. if 'is_crowd' in sample[0]:
  1534. is_crowd1 = sample[0]['is_crowd']
  1535. is_crowd2 = sample[1]['is_crowd']
  1536. is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)
  1537. result['is_crowd'] = is_crowd
  1538. if 'difficult' in sample[0]:
  1539. is_difficult1 = sample[0]['difficult']
  1540. is_difficult2 = sample[1]['difficult']
  1541. is_difficult = np.concatenate(
  1542. (is_difficult1, is_difficult2), axis=0)
  1543. result['difficult'] = is_difficult
  1544. if 'gt_ide' in sample[0]:
  1545. gt_ide1 = sample[0]['gt_ide']
  1546. gt_ide2 = sample[1]['gt_ide']
  1547. gt_ide = np.concatenate((gt_ide1, gt_ide2), axis=0)
  1548. result['gt_ide'] = gt_ide
  1549. return result
  1550. @register_op
  1551. class NormalizeBox(BaseOperator):
  1552. """Transform the bounding box's coornidates to [0,1]."""
  1553. def __init__(self):
  1554. super(NormalizeBox, self).__init__()
  1555. def apply(self, sample, context):
  1556. im = sample['image']
  1557. gt_bbox = sample['gt_bbox']
  1558. height, width, _ = im.shape
  1559. for i in range(gt_bbox.shape[0]):
  1560. gt_bbox[i][0] = gt_bbox[i][0] / width
  1561. gt_bbox[i][1] = gt_bbox[i][1] / height
  1562. gt_bbox[i][2] = gt_bbox[i][2] / width
  1563. gt_bbox[i][3] = gt_bbox[i][3] / height
  1564. sample['gt_bbox'] = gt_bbox
  1565. if 'gt_keypoint' in sample.keys():
  1566. gt_keypoint = sample['gt_keypoint']
  1567. for i in range(gt_keypoint.shape[1]):
  1568. if i % 2:
  1569. gt_keypoint[:, i] = gt_keypoint[:, i] / height
  1570. else:
  1571. gt_keypoint[:, i] = gt_keypoint[:, i] / width
  1572. sample['gt_keypoint'] = gt_keypoint
  1573. return sample
  1574. @register_op
  1575. class BboxXYXY2XYWH(BaseOperator):
  1576. """
  1577. Convert bbox XYXY format to XYWH format.
  1578. """
  1579. def __init__(self):
  1580. super(BboxXYXY2XYWH, self).__init__()
  1581. def apply(self, sample, context=None):
  1582. assert 'gt_bbox' in sample
  1583. bbox = sample['gt_bbox']
  1584. bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2]
  1585. bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2.
  1586. sample['gt_bbox'] = bbox
  1587. return sample
  @register_op
  class PadBox(BaseOperator):
      def __init__(self, num_max_boxes=50):
          """
          Bring the per-box annotations of a sample to a fixed length.

          Boxes beyond `num_max_boxes` are dropped and missing slots are
          filled with zeros.

          Args:
              num_max_boxes (int): number of box slots every sample has
                  after this op has been applied
          """
          self.num_max_boxes = num_max_boxes
          super(PadBox, self).__init__()

      def apply(self, sample, context=None):
          """
          Replace `gt_bbox` with a zero-padded (num_max_boxes, 4) float32
          array and every present companion field with a zero-padded 1-D
          array of length num_max_boxes.

          Args:
              sample (dict): must contain `gt_bbox`; `gt_class`,
                  `gt_score`, `difficult`, `is_crowd` and `gt_ide` are
                  handled when present.
              context: unused.
          Returns:
              dict: the same sample object, updated in place.
          """
          assert 'gt_bbox' in sample
          capacity = self.num_max_boxes
          boxes = sample['gt_bbox']
          kept = min(capacity, len(boxes))

          padded_boxes = np.zeros((capacity, 4), dtype=np.float32)
          if kept > 0:
              padded_boxes[:kept, :] = boxes[:kept, :]
          sample['gt_bbox'] = padded_boxes

          # Companion fields are indexed as 2-D arrays whose value sits in
          # column 0; each one is flattened to 1-D with the listed dtype.
          column_fields = (
              ('gt_class', np.int32),
              ('gt_score', np.float32),
              ('difficult', np.int32),
              ('is_crowd', np.int32),
              ('gt_ide', np.int32),
          )
          for key, dtype in column_fields:
              if key not in sample:
                  continue
              padded = np.zeros((capacity, ), dtype=dtype)
              if kept > 0:
                  padded[:kept] = sample[key][:kept, 0]
              sample[key] = padded
          return sample
  @register_op
  class DebugVisibleImage(BaseOperator):
      """
      In debug mode, visualize images according to `gt_box`.
      (Currently only supported when not cropping and flipping image.)

      The annotated image is written to `output_dir`; the sample itself is
      returned unchanged.

      Args:
          output_dir (str): directory the debug images are saved to, it is
              created when missing. Default: 'output/debug'.
          is_normalized (bool): whether box / keypoint coordinates are
              relative values that must be multiplied by the image width
              and height before drawing. Default: False.
      """

      def __init__(self, output_dir='output/debug', is_normalized=False):
          super(DebugVisibleImage, self).__init__()
          self.is_normalized = is_normalized
          self.output_dir = output_dir
          # Validate before touching the file system so that a bad argument
          # does not leave a freshly created directory behind.
          if not isinstance(self.is_normalized, bool):
              raise TypeError("{}: input type is invalid.".format(self))
          if not os.path.isdir(output_dir):
              os.makedirs(output_dir)

      @staticmethod
      def _text_size(draw, text):
          """Return (width, height) of `text` as rendered by `draw`."""
          # `ImageDraw.textsize` does not exist any more in Pillow >= 10;
          # fall back to the bounding box reported by `textbbox` there.
          if hasattr(draw, 'textsize'):
              return draw.textsize(text)
          left, top, right, bottom = draw.textbbox((0, 0), text)
          return right - left, bottom - top

      def apply(self, sample, context=None):
          """
          Draw ground-truth boxes, class labels and (optionally) keypoints
          on a copy of the image and save it as '<im_id zero-padded>.jpg'.

          Args:
              sample (dict): reads `image`, `im_id`, `w`, `h`, `gt_bbox`,
                  `gt_class` and, when present, `gt_keypoint`.
              context: unused.
          Returns:
              dict: the unmodified sample.
          """
          image = Image.fromarray(sample['image'].astype(np.uint8))
          out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
          width = sample['w']
          height = sample['h']
          gt_bbox = sample['gt_bbox']
          gt_class = sample['gt_class']
          draw = ImageDraw.Draw(image)
          for i in range(gt_bbox.shape[0]):
              xmin, ymin, xmax, ymax = gt_bbox[i]
              if self.is_normalized:
                  # Scale local values only: writing the pixel coordinates
                  # back into `gt_bbox` would silently change the sample
                  # seen by every later operator.
                  xmin, xmax = xmin * width, xmax * width
                  ymin, ymax = ymin * height, ymax * height
              draw.line(
                  [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
                   (xmin, ymin)],
                  width=2,
                  fill='green')
              # draw label
              text = str(gt_class[i][0])
              tw, th = self._text_size(draw, text)
              draw.rectangle(
                  [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
              draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))

          if 'gt_keypoint' in sample.keys():
              gt_keypoint = sample['gt_keypoint']
              if self.is_normalized:
                  # Same reasoning as for the boxes: scale a private copy.
                  gt_keypoint = gt_keypoint.copy()
                  # even columns are x coordinates, odd columns are y
                  gt_keypoint[:, 0::2] = gt_keypoint[:, 0::2] * width
                  gt_keypoint[:, 1::2] = gt_keypoint[:, 1::2] * height
              for keypoint in gt_keypoint:
                  for j in range(int(keypoint.shape[0] / 2)):
                      # `round` on a numpy scalar may return a plain int,
                      # which has no `.astype`; convert explicitly instead.
                      x1 = int(round(float(keypoint[2 * j])))
                      y1 = int(round(float(keypoint[2 * j + 1])))
                      draw.ellipse(
                          (x1, y1, x1 + 5, y1 + 5),
                          fill='green',
                          outline='green')
          save_path = os.path.join(self.output_dir, out_file_name)
          image.save(save_path, quality=95)
          return sample
  @register_op
  class Pad(BaseOperator):
      def __init__(self,
                   size=None,
                   size_divisor=32,
                   pad_mode=0,
                   offsets=None,
                   fill_value=(127.5, 127.5, 127.5)):
          """
          Pad image to a specified size or multiple of size_divisor.

          Args:
              size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
              size_divisor (int): size divisor, default 32
              pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
                  if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
              offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1
              fill_value (tuple): rgb value of pad area, default (127.5, 127.5, 127.5)
          Raises:
              TypeError: if size is neither None, an int nor a Sequence.
          """
          super(Pad, self).__init__()

          # None is the documented default ("pad to a multiple of
          # size_divisor") and is handled by `apply`, so it must be accepted.
          if size is not None and not isinstance(size, (int, Sequence)):
              raise TypeError(
                  "Type of size is invalid. Must be int, Sequence or None, "
                  "now is {}".format(type(size)))

          if isinstance(size, int):
              size = [size, size]

          assert pad_mode in [
              -1, 0, 1, 2
          ], 'currently only supports four modes [-1, 0, 1, 2]'
          if pad_mode == -1:
              assert offsets, 'if pad_mode is -1, offsets should not be None'

          self.size = size
          self.size_divisor = size_divisor
          self.pad_mode = pad_mode
          self.fill_value = fill_value
          self.offsets = offsets

      def apply_segm(self, segms, offsets, im_size, size):
          """
          Shift segmentations by the padding offsets.

          Args:
              segms (list): polygon lists or RLE dicts, one per object.
              offsets (Sequence): (x, y) shift of the image on the canvas.
              im_size (Sequence): (height, width) before padding.
              size (Sequence): (height, width) of the padded canvas.
          Returns:
              list: shifted segmentations in the same format as the input.
          """

          def _expand_poly(poly, x, y):
              # flat [x0, y0, x1, y1, ...] coordinate list
              expanded_poly = np.array(poly)
              expanded_poly[0::2] += x
              expanded_poly[1::2] += y
              return expanded_poly.tolist()

          def _expand_rle(rle, x, y, height, width, h, w):
              # imported lazily so polygon-only data needs no pycocotools
              import pycocotools.mask as mask_util
              if 'counts' in rle and isinstance(rle['counts'], list):
                  rle = mask_util.frPyObjects(rle, height, width)
              mask = mask_util.decode(rle)
              expanded_mask = np.zeros((h, w), dtype=mask.dtype)
              expanded_mask[y:y + height, x:x + width] = mask
              rle = mask_util.encode(
                  np.array(
                      expanded_mask, order='F', dtype=np.uint8))
              return rle

          x, y = offsets
          height, width = im_size
          h, w = size
          expanded_segms = []
          for segm in segms:
              if is_poly(segm):
                  # Polygon format
                  expanded_segms.append(
                      [_expand_poly(poly, x, y) for poly in segm])
              else:
                  # RLE format
                  expanded_segms.append(
                      _expand_rle(segm, x, y, height, width, h, w))
          return expanded_segms

      def apply_bbox(self, bbox, offsets):
          """Shift [x0, y0, x1, y1] boxes by the (x, y) offsets list."""
          # list repetition: [x, y] * 2 -> [x, y, x, y]
          return bbox + np.array(offsets * 2, dtype=np.float32)

      def apply_keypoint(self, keypoints, offsets):
          """Shift flat [x, y, x, y, ...] keypoint rows by the offsets list."""
          n = len(keypoints[0]) // 2
          return keypoints + np.array(offsets * n, dtype=np.float32)

      def apply_image(self, image, offsets, im_size, size):
          """
          Paste `image` at `offsets` onto a float32 three-channel canvas of
          shape `size` that is pre-filled with `fill_value`.
          """
          x, y = offsets
          im_h, im_w = im_size
          h, w = size
          canvas = np.ones((h, w, 3), dtype=np.float32)
          canvas *= np.array(self.fill_value, dtype=np.float32)
          canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
          return canvas

      def apply(self, sample, context=None):
          """
          Pad `sample['image']` and move the annotations accordingly.

          Args:
              sample (dict): reads `image`; `gt_bbox`, `gt_poly` and
                  `gt_keypoint` are shifted when present and non-empty.
              context: unused.
          Returns:
              dict: the same sample object, updated in place.
          """
          im = sample['image']
          im_h, im_w = im.shape[:2]
          if self.size:
              h, w = self.size
              assert (
                  im_h <= h and im_w <= w
              ), '(h, w) of target size should be greater than (im_h, im_w)'
          else:
              h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
              w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)

          # nothing to pad, only the float32 conversion is applied
          if h == im_h and w == im_w:
              sample['image'] = im.astype(np.float32)
              return sample

          if self.pad_mode == -1:
              offset_x, offset_y = self.offsets
          elif self.pad_mode == 0:
              offset_y, offset_x = 0, 0
          elif self.pad_mode == 1:
              offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
          else:
              offset_y, offset_x = h - im_h, w - im_w

          offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]

          sample['image'] = self.apply_image(im, offsets, im_size, size)

          # right/bottom padding leaves every annotation coordinate as is
          if self.pad_mode == 0:
              return sample
          if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
              sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets)

          if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
              sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets,
                                                  im_size, size)

          if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
              sample['gt_keypoint'] = self.apply_keypoint(
                  sample['gt_keypoint'], offsets)

          return sample
  @register_op
  class Poly2Mask(BaseOperator):
      """
      Decode the `gt_poly` annotations of a sample into masks that are
      stored under `gt_segm`.
      """

      def __init__(self):
          super(Poly2Mask, self).__init__()
          import pycocotools.mask as maskUtils
          self.maskutils = maskUtils

      def _poly2mask(self, mask_ann, img_h, img_w):
          """
          Decode one annotation.

          Args:
              mask_ann (list|dict): polygon list, uncompressed RLE (its
                  `counts` entry is a list) or an already encoded RLE.
              img_h: image height handed to pycocotools.
              img_w: image width handed to pycocotools.
          Returns:
              the result of `pycocotools.mask.decode` for the annotation.
          """
          coco_mask = self.maskutils
          if isinstance(mask_ann, list):
              # an object may consist of several polygon parts; all parts
              # are merged into one RLE before decoding
              encoded = coco_mask.merge(
                  coco_mask.frPyObjects(mask_ann, img_h, img_w))
          elif isinstance(mask_ann['counts'], list):
              encoded = coco_mask.frPyObjects(mask_ann, img_h, img_w)
          else:
              encoded = mask_ann
          return coco_mask.decode(encoded)

      def apply(self, sample, context=None):
          """
          Args:
              sample (dict): must contain `gt_poly`; `h` and `w` give the
                  size the annotations are decoded at.
              context: unused.
          Returns:
              dict: the sample with an added uint8 `gt_segm` array.
          """
          assert 'gt_poly' in sample
          height = sample['h']
          width = sample['w']
          decoded = []
          for annotation in sample['gt_poly']:
              decoded.append(self._poly2mask(annotation, height, width))
          sample['gt_segm'] = np.asarray(decoded).astype(np.uint8)
          return sample
  @register_op
  class Rbox2Poly(BaseOperator):
      """
      Derive `gt_bbox` and `gt_rbox2poly` from the `gt_rbox` annotation.
      """

      def __init__(self):
          super(Rbox2Poly, self).__init__()

      def apply(self, sample, context=None):
          """
          Args:
              sample (dict): must contain `gt_rbox` with five columns; the
                  first four are read as center x, center y, width, height.
                  The fifth is only consumed by `bbox_utils.rbox2poly_np`
                  (presumably the rotation angle -- confirm there).
              context: unused.
          Returns:
              dict: the sample with `gt_bbox` set to the axis-aligned
              [x1, y1, x2, y2] box built from the first four columns and
              `gt_rbox2poly` set to the output of `rbox2poly_np`.
          """
          assert 'gt_rbox' in sample
          assert sample['gt_rbox'].shape[1] == 5
          rboxes = sample['gt_rbox']
          center_x, center_y = rboxes[:, 0], rboxes[:, 1]
          half_w = rboxes[:, 2] / 2.0
          half_h = rboxes[:, 3] / 2.0
          corners = [
              center_x - half_w,
              center_y - half_h,
              center_x + half_w,
              center_y + half_h,
          ]
          sample['gt_bbox'] = np.stack(corners, axis=1)
          sample['gt_rbox2poly'] = bbox_utils.rbox2poly_np(rboxes)
          return sample
  @register_op
  class AugmentHSV(BaseOperator):
      """
      Randomly perturb an image in HSV space.

      Two modes exist. When `hgain` is given, a random additive offset per
      channel (bounded by the three gains) is applied to a random subset of
      H, S and V. Otherwise S and V are each multiplied by a random factor
      in [1 - fraction, 1 + fraction].

      Args:
          fraction (float): the fraction for augment. Default: 0.5.
          is_bgr (bool): whether the image is BGR mode. Default: True.
          hgain (float): H channel gains
          sgain (float): S channel gains
          vgain (float): V channel gains
      """

      def __init__(self,
                   fraction=0.50,
                   is_bgr=True,
                   hgain=None,
                   sgain=None,
                   vgain=None):
          super(AugmentHSV, self).__init__()
          self.fraction = fraction
          self.is_bgr = is_bgr
          self.hgain = hgain
          self.sgain = sgain
          self.vgain = vgain
          # the additive mode is selected solely by the presence of hgain
          self.use_hsvgain = hgain is not None

      def apply(self, sample, context=None):
          """
          Args:
              sample (dict): reads `image`; the converted result is written
                  back into that same array.
              context: unused.
          Returns:
              dict: the sample with the augmented `image`.
          """
          img = sample['image']
          if self.is_bgr:
              to_hsv, from_hsv = cv2.COLOR_BGR2HSV, cv2.COLOR_HSV2BGR
          else:
              to_hsv, from_hsv = cv2.COLOR_RGB2HSV, cv2.COLOR_HSV2RGB
          img_hsv = cv2.cvtColor(img, to_hsv)

          if self.use_hsvgain:
              gains = [self.hgain, self.sgain, self.vgain]
              hsv_augs = np.random.uniform(-1, 1, 3) * gains
              # random selection of h, s, v
              hsv_augs *= np.random.randint(0, 2, 3)
              hue_shift, sat_shift, val_shift = hsv_augs
              img_hsv[..., 0] = (img_hsv[..., 0] + hue_shift) % 180
              img_hsv[..., 1] = np.clip(img_hsv[..., 1] + sat_shift, 0, 255)
              img_hsv[..., 2] = np.clip(img_hsv[..., 2] + val_shift, 0, 255)
          else:
              # saturation first, then value: one random factor each
              for channel in (1, 2):
                  plane = img_hsv[:, :, channel].astype(np.float32)
                  factor = (random.random() * 2 - 1) * self.fraction + 1
                  plane *= factor
                  # only an amplification can leave the uint8 range
                  if factor > 1:
                      np.clip(plane, a_min=0, a_max=255, out=plane)
                  img_hsv[:, :, channel] = plane.astype(np.uint8)

          cv2.cvtColor(img_hsv, from_hsv, dst=img)
          sample['image'] = img
          return sample
  @register_op
  class Norm2PixelBbox(BaseOperator):
      """
      Scale `gt_bbox` by the image size: even columns are multiplied by
      the image width, odd columns by the image height.
      """

      def __init__(self):
          super(Norm2PixelBbox, self).__init__()

      def apply(self, sample, context=None):
          """
          Args:
              sample (dict): must contain `gt_bbox`; the size is taken from
                  the first two dimensions of `image`.
              context: unused.
          Returns:
              dict: the sample; the `gt_bbox` array is scaled in place.
          """
          assert 'gt_bbox' in sample
          boxes = sample['gt_bbox']
          img_h, img_w = sample['image'].shape[:2]
          x_cols = slice(0, None, 2)
          y_cols = slice(1, None, 2)
          boxes[:, x_cols] = boxes[:, x_cols] * img_w
          boxes[:, y_cols] = boxes[:, y_cols] * img_h
          sample['gt_bbox'] = boxes
          return sample
  @register_op
  class BboxCXCYWH2XYXY(BaseOperator):
      """
      Turn center-size boxes into corner boxes.
      [center_x, center_y, width, height] -> [x0, y0, x1, y1]
      """

      def __init__(self):
          super(BboxCXCYWH2XYXY, self).__init__()

      def apply(self, sample, context=None):
          """
          Args:
              sample (dict): must contain `gt_bbox`.
              context: unused.
          Returns:
              dict: the sample with `gt_bbox` replaced by a converted copy;
              the input array itself is left untouched.
          """
          assert 'gt_bbox' in sample
          source = sample['gt_bbox']
          centers = source[:, :2]
          half_sizes = source[:, 2:4] / 2.
          converted = source.copy()
          converted[:, :2] = centers - half_sizes
          converted[:, 2:4] = centers + half_sizes
          sample['gt_bbox'] = converted
          return sample
  @register_op
  class RandomResizeCrop(BaseOperator):
      """Random resize and crop image and bboxes.

      With probability `prob` the sample is first resized to one randomly
      chosen entry of `resizes` and then cropped with one randomly chosen
      entry of `cropsizes`; otherwise it is returned untouched.

      Args:
          resizes (list): resize image to one of resizes. if keep_ratio is True and mode is
              'long', resize the image's long side to the maximum of target_size, if keep_ratio is
              True and mode is 'short', resize the image's short side to the minimum of target_size.
          cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...]
          mode (str): resize mode, `long` or `short`. Details see resizes.
          prob (float): probability of this op.
          keep_ratio (bool): whether keep_ratio or not, default true
          interp (int): the interpolation method
          thresholds (list): iou thresholds for decide a valid bbox crop.
          num_attempts (int): number of tries before giving up.
          allow_no_crop (bool): allow return without actually cropping them.
          cover_all_box (bool): ensure all bboxes are covered in the final crop.
          is_mask_crop(bool): whether crop the segmentation.
      """

      def __init__(
              self,
              resizes,
              cropsizes,
              prob=0.5,
              mode='short',
              keep_ratio=True,
              interp=cv2.INTER_LINEAR,
              num_attempts=3,
              cover_all_box=False,
              allow_no_crop=False,
              thresholds=[0.3, 0.5, 0.7],
              is_mask_crop=False, ):
          super(RandomResizeCrop, self).__init__()

          self.resizes = resizes
          self.cropsizes = cropsizes
          self.prob = prob
          self.mode = mode

          self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp)
          # Hand over a private copy: the default list object is shared by
          # every call of this constructor.
          self.croper = RandomCrop(
              num_attempts=num_attempts,
              cover_all_box=cover_all_box,
              thresholds=list(thresholds),
              allow_no_crop=allow_no_crop,
              is_mask_crop=is_mask_crop)

      def _format_size(self, size):
          """Expand a single integer to a (size, size) pair."""
          if isinstance(size, Integral):
              size = (size, size)
          return size

      @staticmethod
      def _has_content(value):
          """Truth test that is also safe for multi-element ndarrays.

          `bool(ndarray)` raises ValueError for arrays with more than one
          element, so arrays are judged by their size instead.
          """
          if isinstance(value, np.ndarray):
              return value.size > 0
          return bool(value)

      def apply(self, sample, context=None):
          """Resize then crop `sample` with probability `prob`."""
          if random.random() < self.prob:
              _resize = self._format_size(random.choice(self.resizes))
              _cropsize = self._format_size(random.choice(self.cropsizes))
              sample = self._resize(
                  self.resizer,
                  sample,
                  size=_resize,
                  mode=self.mode,
                  context=context)

              sample = self._random_crop(
                  self.croper, sample, size=_cropsize, context=context)
          return sample

      @staticmethod
      def _random_crop(croper, sample, size, context=None):
          """
          Crop `sample` to a random window whose side lengths are drawn
          between min(size) and max(size) (capped by the image size).

          The IoU thresholds of `croper` are tried in random order, each
          for at most `croper.num_attempts` windows. The first window that
          satisfies the threshold and keeps at least one box is applied.

          Args:
              croper: the RandomCrop instance whose settings and helper
                  methods are used.
              sample (dict): reads `image`, `gt_bbox`, `gt_class` and the
                  optional `gt_poly`, `gt_segm`, `gt_score`, `is_crowd`.
              size (Sequence): crop size range.
              context: unused.
          Returns:
              dict: the cropped sample, or the unchanged sample when no
              valid window was found.
          """
          if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
              return sample

          h, w = sample['image'].shape[:2]
          gt_bbox = sample['gt_bbox']
          cropsize = size
          min_crop = min(cropsize)
          max_crop = max(cropsize)

          thresholds = list(croper.thresholds)
          np.random.shuffle(thresholds)

          for thresh in thresholds:
              found = False
              for _ in range(croper.num_attempts):

                  crop_h = random.randint(min_crop, min(h, max_crop))
                  crop_w = random.randint(min_crop, min(w, max_crop))

                  crop_y = random.randint(0, h - crop_h)
                  crop_x = random.randint(0, w - crop_w)

                  crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
                  iou = croper._iou_matrix(
                      gt_bbox, np.array(
                          [crop_box], dtype=np.float32))
                  if iou.max() < thresh:
                      continue

                  if croper.cover_all_box and iou.min() < thresh:
                      continue

                  cropped_box, valid_ids = croper._crop_box_with_center_constraint(
                      gt_bbox, np.array(
                          crop_box, dtype=np.float32))
                  if valid_ids.size > 0:
                      found = True
                      break

              if found:
                  if croper.is_mask_crop and 'gt_poly' in sample and len(
                          sample['gt_poly']) > 0:
                      crop_polys = croper.crop_segms(
                          sample['gt_poly'],
                          valid_ids,
                          np.array(
                              crop_box, dtype=np.int64),
                          h,
                          w)
                      if [] in crop_polys:
                          # objects whose polygon vanished inside the crop
                          # window are removed from the kept ids as well
                          delete_id = list()
                          valid_polys = list()
                          for idx, crop_poly in enumerate(crop_polys):
                              if crop_poly == []:
                                  delete_id.append(idx)
                              else:
                                  valid_polys.append(crop_poly)
                          valid_ids = np.delete(valid_ids, delete_id)
                          if len(valid_polys) == 0:
                              return sample
                          sample['gt_poly'] = valid_polys
                      else:
                          sample['gt_poly'] = crop_polys

                  if 'gt_segm' in sample:
                      sample['gt_segm'] = croper._crop_segm(sample['gt_segm'],
                                                            crop_box)
                      sample['gt_segm'] = np.take(
                          sample['gt_segm'], valid_ids, axis=0)

                  sample['image'] = croper._crop_image(sample['image'],
                                                       crop_box)
                  sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
                  sample['gt_class'] = np.take(
                      sample['gt_class'], valid_ids, axis=0)
                  if 'gt_score' in sample:
                      sample['gt_score'] = np.take(
                          sample['gt_score'], valid_ids, axis=0)

                  if 'is_crowd' in sample:
                      sample['is_crowd'] = np.take(
                          sample['is_crowd'], valid_ids, axis=0)
                  return sample

          return sample

      @staticmethod
      def _resize(resizer, sample, size, mode='short', context=None):
          """
          Resize the image of `sample` and all annotations that depend on
          the image geometry.

          Args:
              resizer: the Resize instance whose settings (`keep_ratio`,
                  `interp`) and `apply_*` helpers are used.
              sample (dict): reads `image` and the optional `scale_factor`,
                  `gt_bbox`, `gt_rbox2poly`, `gt_poly`, `semantic`,
                  `gt_segm`; also writes `im_shape` and `scale_factor`.
              size (Sequence): target size; read as (height, width) when
                  the ratio is not kept.
              mode (str): 'long' picks the smaller of the two candidate
                  scales, any other value the larger one.
              context: unused.
          Returns:
              dict: the same sample object, updated in place.
          Raises:
              TypeError: if the image is not a numpy array.
              ImageError: if the image is not 3-dimensional.
          """
          im = sample['image']
          target_size = size

          if not isinstance(im, np.ndarray):
              raise TypeError("{}: image type is not numpy.".format(resizer))
          if len(im.shape) != 3:
              raise ImageError('{}: image is not 3-dimensional.'.format(
                  resizer))

          # apply image
          im_shape = im.shape
          if resizer.keep_ratio:

              im_size_min = np.min(im_shape[0:2])
              im_size_max = np.max(im_shape[0:2])

              target_size_min = np.min(target_size)
              target_size_max = np.max(target_size)

              if mode == 'long':
                  im_scale = min(target_size_min / im_size_min,
                                 target_size_max / im_size_max)
              else:
                  im_scale = max(target_size_min / im_size_min,
                                 target_size_max / im_size_max)

              resize_h = im_scale * float(im_shape[0])
              resize_w = im_scale * float(im_shape[1])

              im_scale_x = im_scale
              im_scale_y = im_scale
          else:
              resize_h, resize_w = target_size
              im_scale_y = resize_h / im_shape[0]
              im_scale_x = resize_w / im_shape[1]

          im = resizer.apply_image(sample['image'], [im_scale_x, im_scale_y])
          sample['image'] = im
          sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
          if 'scale_factor' in sample:
              # accumulate on top of a scale set by an earlier operator
              scale_factor = sample['scale_factor']
              sample['scale_factor'] = np.asarray(
                  [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
                  dtype=np.float32)
          else:
              sample['scale_factor'] = np.asarray(
                  [im_scale_y, im_scale_x], dtype=np.float32)

          # apply bbox
          if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
              sample['gt_bbox'] = resizer.apply_bbox(sample['gt_bbox'],
                                                     [im_scale_x, im_scale_y],
                                                     [resize_w, resize_h])

          # apply rbox
          if 'gt_rbox2poly' in sample:
              poly_len = np.array(sample['gt_rbox2poly']).shape[1]
              if poly_len != 8:
                  # `Logger.warn` is a deprecated alias of `warning`; the
                  # reported value is the row length that was checked.
                  logger.warning(
                      "gt_rbox2poly's length should be 8, but actually is {}".
                      format(poly_len))
              sample['gt_rbox2poly'] = resizer.apply_bbox(
                  sample['gt_rbox2poly'], [im_scale_x, im_scale_y],
                  [resize_w, resize_h])

          # apply polygon
          if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
              sample['gt_poly'] = resizer.apply_segm(
                  sample['gt_poly'], im_shape[:2], [im_scale_x, im_scale_y])

          # apply semantic
          if 'semantic' in sample and RandomResizeCrop._has_content(sample[
                  'semantic']):
              semantic = sample['semantic']
              semantic = cv2.resize(
                  semantic.astype('float32'),
                  None,
                  None,
                  fx=im_scale_x,
                  fy=im_scale_y,
                  interpolation=resizer.interp)
              semantic = np.asarray(semantic).astype('int32')
              semantic = np.expand_dims(semantic, 0)
              sample['semantic'] = semantic

          # apply gt_segm
          if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
              masks = [
                  cv2.resize(
                      gt_segm,
                      None,
                      None,
                      fx=im_scale_x,
                      fy=im_scale_y,
                      interpolation=cv2.INTER_NEAREST)
                  for gt_segm in sample['gt_segm']
              ]
              sample['gt_segm'] = np.asarray(masks).astype(np.uint8)

          return sample
  @register_op
  class RandomSelect(BaseOperator):
      """
      Apply one of two transform pipelines, picked at random per sample:
      transforms1 with probability p, transforms2 otherwise.

      The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
      """

      def __init__(self, transforms1, transforms2, p=0.5):
          super(RandomSelect, self).__init__()
          # both candidate lists are wrapped into `Compose` pipelines
          self.transforms1 = Compose(transforms1)
          self.transforms2 = Compose(transforms2)
          self.p = p

      def apply(self, sample, context=None):
          """Return `sample` processed by the randomly chosen pipeline."""
          use_first = random.random() < self.p
          pipeline = self.transforms1 if use_first else self.transforms2
          return pipeline(sample)
  @register_op
  class RandomShortSideResize(BaseOperator):
      def __init__(self,
                   short_side_sizes,
                   max_size=None,
                   interp=cv2.INTER_LINEAR,
                   random_interp=False):
          """
          Resize the image randomly according to the short side. If max_size is not None,
          the long side is scaled according to max_size. The whole process will be keep ratio.

          Args:
              short_side_sizes (list|tuple): Image target short side size.
              max_size (int): The size of the longest side of image after resize.
              interp (int): The interpolation method.
              random_interp (bool): Whether random select interpolation method.
          """
          super(RandomShortSideResize, self).__init__()

          assert isinstance(short_side_sizes,
                            Sequence), "short_side_sizes must be List or Tuple"

          self.short_side_sizes = short_side_sizes
          self.max_size = max_size
          self.interp = interp
          self.random_interp = random_interp
          # candidates used when random_interp is enabled
          self.interps = [
              cv2.INTER_NEAREST,
              cv2.INTER_LINEAR,
              cv2.INTER_AREA,
              cv2.INTER_CUBIC,
              cv2.INTER_LANCZOS4,
          ]

      @staticmethod
      def _has_content(value):
          """Truth test that is also safe for multi-element ndarrays.

          `bool(ndarray)` raises ValueError for arrays with more than one
          element, so arrays are judged by their size instead.
          """
          if isinstance(value, np.ndarray):
              return value.size > 0
          return bool(value)

      def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
          """
          Compute the output size for a short side of `size`.

          Args:
              image_shape (Sequence): (height, width) of the input image.
              size (int): requested length of the short side.
              max_size (int|None): when given, `size` is reduced so that
                  the long side does not exceed it.
          Returns:
              tuple: (width, height) of the resized image.
          """
          h, w = image_shape
          if max_size is not None:
              min_original_size = float(min((w, h)))
              max_original_size = float(max((w, h)))
              if max_original_size / min_original_size * size > max_size:
                  size = int(
                      round(max_size * min_original_size / max_original_size))

          # the short side already has the requested length
          if (w <= h and w == size) or (h <= w and h == size):
              return (w, h)

          if w < h:
              ow = size
              oh = int(size * h / w)
          else:
              oh = size
              ow = int(size * w / h)

          return (ow, oh)

      def resize(self,
                 sample,
                 target_size,
                 max_size=None,
                 interp=cv2.INTER_LINEAR):
          """
          Resize the image of `sample` and all annotations that depend on
          the image geometry.

          Args:
              sample (dict): reads `image` and the optional `scale_factor`,
                  `gt_bbox`, `gt_poly`, `semantic`, `gt_segm`; also writes
                  `im_shape` and `scale_factor`.
              target_size (int): requested length of the short side.
              max_size (int|None): upper bound for the long side.
              interp (int): interpolation used for the image.
          Returns:
              dict: the same sample object, updated in place.
          Raises:
              TypeError: if the image is not a numpy array.
              ImageError: if the image is not 3-dimensional.
          """
          im = sample['image']
          if not isinstance(im, np.ndarray):
              raise TypeError("{}: image type is not numpy.".format(self))
          if len(im.shape) != 3:
              raise ImageError('{}: image is not 3-dimensional.'.format(self))

          # from here on target_size is the (width, height) pair cv2 expects
          target_size = self.get_size_with_aspect_ratio(im.shape[:2],
                                                        target_size, max_size)
          im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[
              0] / im.shape[1]

          sample['image'] = cv2.resize(im, target_size, interpolation=interp)
          sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)
          if 'scale_factor' in sample:
              # accumulate on top of a scale set by an earlier operator
              scale_factor = sample['scale_factor']
              sample['scale_factor'] = np.asarray(
                  [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
                  dtype=np.float32)
          else:
              sample['scale_factor'] = np.asarray(
                  [im_scale_y, im_scale_x], dtype=np.float32)

          # apply bbox
          if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
              sample['gt_bbox'] = self.apply_bbox(
                  sample['gt_bbox'], [im_scale_x, im_scale_y], target_size)
          # apply polygon
          if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
              sample['gt_poly'] = self.apply_segm(
                  sample['gt_poly'], im.shape[:2], [im_scale_x, im_scale_y])
          # apply semantic
          if 'semantic' in sample and self._has_content(sample['semantic']):
              semantic = sample['semantic']
              semantic = cv2.resize(
                  semantic.astype('float32'),
                  target_size,
                  interpolation=self.interp)
              semantic = np.asarray(semantic).astype('int32')
              semantic = np.expand_dims(semantic, 0)
              sample['semantic'] = semantic
          # apply gt_segm
          if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
              masks = [
                  cv2.resize(
                      gt_segm, target_size, interpolation=cv2.INTER_NEAREST)
                  for gt_segm in sample['gt_segm']
              ]
              sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
          return sample

      def apply_bbox(self, bbox, scale, size):
          """
          Scale boxes and clip them to the resized image.

          Args:
              bbox (np.ndarray): boxes, scaled in place; even columns are
                  treated as x and odd columns as y.
              scale (Sequence): (scale_x, scale_y).
              size (Sequence): (width, height) used as clipping bounds.
          Returns:
              np.ndarray: float32 array of the scaled and clipped boxes.
          """
          im_scale_x, im_scale_y = scale
          resize_w, resize_h = size
          bbox[:, 0::2] *= im_scale_x
          bbox[:, 1::2] *= im_scale_y
          bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
          bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
          return bbox.astype('float32')

      def apply_segm(self, segms, im_size, scale):
          """
          Scale segmentations.

          Args:
              segms (list): polygon lists or RLE dicts, one per object.
              im_size (Sequence): (height, width) before resizing.
              scale (Sequence): (scale_x, scale_y).
          Returns:
              list: resized segmentations in the same format as the input.
          """

          def _resize_poly(poly, im_scale_x, im_scale_y):
              # flat [x0, y0, x1, y1, ...] coordinate list
              resized_poly = np.array(poly).astype('float32')
              resized_poly[0::2] *= im_scale_x
              resized_poly[1::2] *= im_scale_y
              return resized_poly.tolist()

          def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
              # imported lazily so polygon-only data needs no pycocotools
              import pycocotools.mask as mask_util
              if 'counts' in rle and isinstance(rle['counts'], list):
                  rle = mask_util.frPyObjects(rle, im_h, im_w)

              mask = mask_util.decode(rle)
              mask = cv2.resize(
                  mask,
                  None,
                  None,
                  fx=im_scale_x,
                  fy=im_scale_y,
                  interpolation=self.interp)
              rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
              return rle

          im_h, im_w = im_size
          im_scale_x, im_scale_y = scale
          resized_segms = []
          for segm in segms:
              if is_poly(segm):
                  # Polygon format
                  resized_segms.append([
                      _resize_poly(poly, im_scale_x, im_scale_y)
                      for poly in segm
                  ])
              else:
                  # RLE format
                  resized_segms.append(
                      _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))

          return resized_segms

      def apply(self, sample, context=None):
          """Resize `sample` to a randomly chosen short side size."""
          target_size = random.choice(self.short_side_sizes)
          interp = random.choice(
              self.interps) if self.random_interp else self.interp

          return self.resize(sample, target_size, self.max_size, interp)
  2334. @register_op
  2335. class RandomSizeCrop(BaseOperator):
  2336. """
  2337. Cut the image randomly according to `min_size` and `max_size`
  2338. """
  2339. def __init__(self, min_size, max_size):
  2340. super(RandomSizeCrop, self).__init__()
  2341. self.min_size = min_size
  2342. self.max_size = max_size
  2343. from paddle.vision.transforms.functional import crop as paddle_crop
  2344. self.paddle_crop = paddle_crop
  2345. @staticmethod
  2346. def get_crop_params(img_shape, output_size):
  2347. """Get parameters for ``crop`` for a random crop.
  2348. Args:
  2349. img_shape (list|tuple): Image's height and width.
  2350. output_size (list|tuple): Expected output size of the crop.
  2351. Returns:
  2352. tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
  2353. """
  2354. h, w = img_shape
  2355. th, tw = output_size
  2356. if h + 1 < th or w + 1 < tw:
  2357. raise ValueError(
  2358. "Required crop size {} is larger then input image size {}".
  2359. format((th, tw), (h, w)))
  2360. if w == tw and h == th:
  2361. return 0, 0, h, w
  2362. i = random.randint(0, h - th + 1)
  2363. j = random.randint(0, w - tw + 1)
  2364. return i, j, th, tw
  2365. def crop(self, sample, region):
  2366. image_shape = sample['image'].shape[:2]
  2367. sample['image'] = self.paddle_crop(sample['image'], *region)
  2368. keep_index = None
  2369. # apply bbox
  2370. if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
  2371. sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], region)
  2372. bbox = sample['gt_bbox'].reshape([-1, 2, 2])
  2373. area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
  2374. keep_index = np.where(area > 0)[0]
  2375. sample['gt_bbox'] = sample['gt_bbox'][keep_index] if len(
  2376. keep_index) > 0 else np.zeros(
  2377. [0, 4], dtype=np.float32)
  2378. sample['gt_class'] = sample['gt_class'][keep_index] if len(
  2379. keep_index) > 0 else np.zeros(
  2380. [0, 1], dtype=np.float32)
  2381. if 'gt_score' in sample:
  2382. sample['gt_score'] = sample['gt_score'][keep_index] if len(
  2383. keep_index) > 0 else np.zeros(
  2384. [0, 1], dtype=np.float32)
  2385. if 'is_crowd' in sample:
  2386. sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
  2387. keep_index) > 0 else np.zeros(
  2388. [0, 1], dtype=np.float32)
  2389. # apply polygon
  2390. if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
  2391. sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
  2392. image_shape)
  2393. if keep_index is not None:
  2394. sample['gt_poly'] = sample['gt_poly'][keep_index]
  2395. # apply gt_segm
  2396. if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
  2397. i, j, h, w = region
  2398. sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
  2399. if keep_index is not None:
  2400. sample['gt_segm'] = sample['gt_segm'][keep_index]
  2401. return sample
  2402. def apply_bbox(self, bbox, region):
  2403. i, j, h, w = region
  2404. region_size = np.asarray([w, h])
  2405. crop_bbox = bbox - np.asarray([j, i, j, i])
  2406. crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
  2407. crop_bbox = crop_bbox.clip(min=0)
  2408. return crop_bbox.reshape([-1, 4]).astype('float32')
  def apply_segm(self, segms, region, image_shape):
      """Crop per-instance segmentations to the crop window.

      Args:
          segms (list): one entry per instance. An entry is handled as a list
              of flat polygons ``[x0, y0, x1, y1, ...]`` when ``is_poly``
              accepts it, otherwise as a COCO RLE dict.
          region (tuple): crop window ``(i, j, h, w)`` where ``i``/``j`` are
              the top/left offsets and ``h``/``w`` the window size.
          image_shape (tuple): ``(height, width)`` of the uncropped image,
              used to convert uncompressed RLE (``counts`` given as a list).

      Returns:
          list: the cropped segmentation of every instance, in input order.
          Polygon instances become a (possibly empty) list of flat polygons
          expressed in crop coordinates; RLE instances become an RLE encoding
          of the cropped mask.
      """

      def _crop_poly(segm, crop):
          # shapely is imported lazily so that it is only required when
          # polygon annotations are actually cropped.
          import shapely.ops
          from shapely.geometry import (Polygon, MultiPolygon,
                                        GeometryCollection)

          xmin, ymin, xmax, ymax = crop
          crop_p = Polygon([(xmin, ymin), (xmin, ymax), (xmax, ymax),
                            (xmax, ymin)])
          crop_segm = []
          for poly in segm:
              poly = np.array(poly).reshape(len(poly) // 2, 2)
              polygon = Polygon(poly)
              if not polygon.is_valid:
                  # Rebuild an invalid (e.g. self-intersecting) ring as a set
                  # of valid polygons by noding its exterior with itself.
                  exterior = polygon.exterior
                  multi_lines = exterior.intersection(exterior)
                  polygons = shapely.ops.polygonize(multi_lines)
                  polygon = MultiPolygon(list(polygons))

              # Multi-part geometries are walked through ``.geoms``: iterating
              # them directly is not supported by Shapely 2.x.
              if isinstance(polygon, MultiPolygon):
                  pieces = list(polygon.geoms)
              else:
                  pieces = [polygon]

              for piece in pieces:
                  inter = piece.intersection(crop_p)
                  if inter.is_empty:
                      continue
                  if isinstance(inter, (MultiPolygon, GeometryCollection)):
                      parts = list(inter.geoms)
                  elif isinstance(inter, Polygon):
                      parts = [inter]
                  else:
                      # Points / lines carry no area and are dropped.
                      continue
                  for part in parts:
                      if not isinstance(part, Polygon):
                          continue
                      # Drop the closing vertex (a repeat of the first one)
                      # and flatten to [x0, y0, x1, y1, ...].
                      coords = np.array(part.exterior.coords[:-1]).reshape(-1)
                      coords[0::2] -= xmin
                      coords[1::2] -= ymin
                      crop_segm.append(coords.tolist())
          return crop_segm

      def _crop_rle(rle, crop, height, width):
          # pycocotools is imported lazily so that it is only required when
          # RLE annotations are actually cropped.
          import pycocotools.mask as mask_util

          if 'counts' in rle and isinstance(rle['counts'], list):
              rle = mask_util.frPyObjects(rle, height, width)
          mask = mask_util.decode(rle)
          mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
          # The encoder is fed a Fortran-ordered uint8 array.
          return mask_util.encode(np.array(mask, order='F', dtype=np.uint8))

      i, j, h, w = region
      # Crop window as [xmin, ymin, xmax, ymax].
      crop = [j, i, j + w, i + h]
      height, width = image_shape
      crop_segms = []
      for segm in segms:
          if is_poly(segm):
              # Polygon format
              crop_segms.append(_crop_poly(segm, crop))
          else:
              # RLE format
              crop_segms.append(_crop_rle(segm, crop, height, width))
      return crop_segms
  def apply(self, sample, context=None):
      """Crop ``sample`` to a window of random height and width.

      Height and width are drawn independently with ``random.randint``
      between ``self.min_size`` and the smaller of the image extent and
      ``self.max_size`` (both ends inclusive). The window is then located by
      ``self.get_crop_params`` and applied by ``self.crop``.

      Args:
          sample (dict): sample whose ``'image'`` entry exposes ``.shape``.
          context: unused here.

      Returns:
          Whatever ``self.crop`` returns for the chosen region.
      """
      shape = sample['image'].shape
      crop_h = random.randint(self.min_size, min(shape[0], self.max_size))
      crop_w = random.randint(self.min_size, min(shape[1], self.max_size))
      region = self.get_crop_params(shape[:2], [crop_h, crop_w])
      return self.crop(sample, region)
  @register_op
  class WarpAffine(BaseOperator):
      def __init__(self,
                   keep_res=False,
                   pad=31,
                   input_h=512,
                   input_w=512,
                   scale=0.4,
                   shift=0.1):
          """WarpAffine
          Warp the image onto the network input canvas with an affine
          transform.
          The code is based on https://github.com/xingyizhou/CenterNet/blob/master/src/lib/datasets/sample/ctdet.py

          Args:
              keep_res (bool): derive the output size from the image size
                  (see ``pad``) instead of using ``input_h``/``input_w``.
              pad (int): bit mask used when ``keep_res`` is set; the output
                  extent is ``(size | pad) + 1``.
              input_h (int): output height when ``keep_res`` is False.
              input_w (int): output width when ``keep_res`` is False.
              scale (float): stored on the instance; not read by ``apply``.
              shift (float): stored on the instance; not read by ``apply``.
          """
          super(WarpAffine, self).__init__()
          self.keep_res = keep_res
          self.pad = pad
          self.input_h = input_h
          self.input_w = input_w
          self.scale = scale
          self.shift = shift

      def apply(self, sample, context=None):
          """Replace ``sample['image']`` by its affine-warped version.

          A sample that carries an empty ``'gt_bbox'`` is returned untouched.
          """
          image = cv2.cvtColor(sample['image'], cv2.COLOR_RGB2BGR)

          # Samples without any ground-truth box are passed through as-is.
          if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
              return sample

          height, width = image.shape[:2]
          if self.keep_res:
              # With pad = 2**k - 1 this is the next multiple of 2**k that is
              # strictly larger than the image extent.
              out_h = (height | self.pad) + 1
              out_w = (width | self.pad) + 1
              scale = np.array([out_w, out_h], dtype=np.float32)
              center = np.array([width // 2, height // 2], dtype=np.float32)
          else:
              scale = max(height, width) * 1.0
              out_h, out_w = self.input_h, self.input_w
              center = np.array([width / 2., height / 2.], dtype=np.float32)

          matrix = get_affine_transform(center, scale, 0, [out_w, out_h])
          image = cv2.resize(image, (width, height))
          sample['image'] = cv2.warpAffine(
              image, matrix, (out_w, out_h), flags=cv2.INTER_LINEAR)
          return sample
  @register_op
  class FlipWarpAffine(BaseOperator):
      def __init__(self,
                   keep_res=False,
                   pad=31,
                   input_h=512,
                   input_w=512,
                   not_rand_crop=False,
                   scale=0.4,
                   shift=0.1,
                   flip=0.5,
                   is_scale=True,
                   use_random=True):
          """FlipWarpAffine
          1. Random Crop
          2. Flip the image horizontal
          3. Warp affine the image

          Args:
              keep_res (bool): derive the output size from the image size
                  as ``(size | pad) + 1`` instead of ``input_h``/``input_w``.
              pad (int): bit mask used when ``keep_res`` is set.
              input_h (int): output height when ``keep_res`` is False.
              input_w (int): output width when ``keep_res`` is False.
              not_rand_crop (bool): if True, jitter center and scale with
                  clipped Gaussian noise (``shift``/``scale``) instead of
                  drawing a random center and a scale factor in [0.6, 1.4).
              scale (float): scale jitter strength for ``not_rand_crop``.
              shift (float): center jitter strength for ``not_rand_crop``.
              flip (float): probability of a horizontal flip.
              is_scale (bool): divide the warped image by 255 as float32.
              use_random (bool): enable the random crop/jitter and flip.
          """
          super(FlipWarpAffine, self).__init__()
          self.keep_res = keep_res
          self.pad = pad
          self.input_h = input_h
          self.input_w = input_w
          self.not_rand_crop = not_rand_crop
          self.scale = scale
          self.shift = shift
          self.flip = flip
          self.is_scale = is_scale
          self.use_random = use_random

      def apply(self, sample, context=None):
          """Warp ``sample['image']`` and record the transform parameters.

          Writes ``'image'``, ``'center'`` and ``'scale'`` into the sample,
          and ``'gt_bbox'`` when ``use_random`` is set. A sample that carries
          an empty ``'gt_bbox'`` is returned untouched.
          """
          image = cv2.cvtColor(sample['image'], cv2.COLOR_RGB2BGR)
          if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
              return sample

          height, width = image.shape[:2]
          if self.keep_res:
              out_h = (height | self.pad) + 1
              out_w = (width | self.pad) + 1
              scale = np.array([out_w, out_h], dtype=np.float32)
              center = np.array([width // 2, height // 2], dtype=np.float32)
          else:
              scale = max(height, width) * 1.0
              out_h, out_w = self.input_h, self.input_w
              center = np.array([width / 2., height / 2.], dtype=np.float32)

          if self.use_random:
              boxes = sample['gt_bbox']
              if self.not_rand_crop:
                  # Clipped Gaussian jitter of the center, then of the scale.
                  scale_lim = self.scale
                  shift_lim = self.shift
                  center[0] += scale * np.clip(np.random.randn() * shift_lim,
                                               -2 * shift_lim, 2 * shift_lim)
                  center[1] += scale * np.clip(np.random.randn() * shift_lim,
                                               -2 * shift_lim, 2 * shift_lim)
                  scale = scale * np.clip(np.random.randn() * scale_lim + 1,
                                          1 - scale_lim, 1 + scale_lim)
              else:
                  # Random scale factor, then a random center that stays
                  # ``get_border`` pixels away from the image edges.
                  scale = scale * np.random.choice(np.arange(0.6, 1.4, 0.1))
                  border_x = get_border(128, width)
                  border_y = get_border(128, height)
                  center[0] = np.random.randint(
                      low=border_x, high=width - border_x)
                  center[1] = np.random.randint(
                      low=border_y, high=height - border_y)

              if np.random.random() < self.flip:
                  image = image[:, ::-1, :]
                  center[0] = width - center[0] - 1
                  # Mirror x1/x2 and swap them so that x1 <= x2 still holds;
                  # the fancy-indexed right-hand side is a copy of the old
                  # columns.
                  boxes[:, [0, 2]] = width - boxes[:, [2, 0]] - 1
              sample['gt_bbox'] = boxes

          matrix = get_affine_transform(center, scale, 0, [out_w, out_h])
          if not self.use_random:
              image = cv2.resize(image, (width, height))
          warped = cv2.warpAffine(
              image, matrix, (out_w, out_h), flags=cv2.INTER_LINEAR)
          if self.is_scale:
              warped = warped.astype(np.float32) / 255.

          sample['image'] = warped
          sample['center'] = center
          sample['scale'] = scale
          return sample
  @register_op
  class CenterRandColor(BaseOperator):
      """Random color for CenterNet series models.

      Brightness, contrast and saturation jitter are applied in a random
      order. Each one draws ``alpha`` uniformly from ``1 +/- setting``.

      Args:
          saturation (float): saturation settings.
          contrast (float): contrast settings.
          brightness (float): brightness settings.
      """

      def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4):
          super(CenterRandColor, self).__init__()
          self.saturation = saturation
          self.contrast = contrast
          self.brightness = brightness

      def apply_saturation(self, img, img_gray):
          """Blend ``img`` (in place) with its per-pixel grayscale image."""
          alpha = 1. + np.random.uniform(
              low=-self.saturation, high=self.saturation)
          self._blend(alpha, img, img_gray[:, :, None])
          return img

      def apply_contrast(self, img, img_gray):
          """Blend ``img`` (in place) with the mean of the grayscale image."""
          alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast)
          img_mean = img_gray.mean()
          self._blend(alpha, img, img_mean)
          return img

      def apply_brightness(self, img, img_gray):
          """Scale ``img`` (in place) by a random factor; ``img_gray`` is unused."""
          alpha = 1 + np.random.uniform(
              low=-self.brightness, high=self.brightness)
          img *= alpha
          return img

      def _blend(self, alpha, img, img_mean):
          """Set ``img`` to ``alpha * img + (1 - alpha) * img_mean`` in place.

          Only ``img`` is modified. ``img_mean`` may be a view of the shared
          grayscale buffer (see ``apply_saturation``); scaling it in place
          would corrupt the mean that a later ``apply_contrast`` computes
          from the same buffer.
          """
          img *= alpha
          img += img_mean * (1 - alpha)

      def __call__(self, sample, context=None):
          """Apply the three color distortions to ``sample['image']``.

          NOTE(review): the distortions use in-place ``*=``/``+=`` with float
          factors, so the image is presumably already a floating point
          array at this stage - confirm against the pipeline.
          """
          img = sample['image']
          img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
          functions = [
              self.apply_brightness,
              self.apply_contrast,
              self.apply_saturation,
          ]
          # Random order of the three distortions for every sample.
          distortions = np.random.permutation(functions)
          for func in distortions:
              img = func(img, img_gray)
          sample['image'] = img
          return sample
  @register_op
  class Mosaic(BaseOperator):
      """ Mosaic operator for image and gt_bboxes
      The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py

      1. get mosaic coords
      2. clip bbox and get mosaic_labels
      3. random_affine augment
      4. Mixup augment as copypaste (optional), not used in tiny/nano

      Args:
          prob (float): probability of using Mosaic, 1.0 as default
          input_dim (list[int]): input shape as [height, width]; a single
              integer is expanded to a square shape
          degrees (list[2]): the rotate range to apply, transform range is [min, max]
          translate (list[2]): the translate range to apply, transform range is [min, max]
          scale (list[2]): the scale range to apply, transform range is [min, max]
          shear (list[2]): the shear range to apply, transform range is [min, max]
          enable_mixup (bool): whether to enable Mixup or not
          mixup_prob (float): probability of using Mixup, 1.0 as default
          mixup_scale (list[int]): scale range of Mixup
          remove_outside_box (bool): whether remove outside boxes, False as
              default in COCO dataset, True in MOT dataset
      """

      def __init__(self,
                   prob=1.0,
                   input_dim=[640, 640],
                   degrees=[-10, 10],
                   translate=[-0.1, 0.1],
                   scale=[0.1, 2],
                   shear=[-2, 2],
                   enable_mixup=True,
                   mixup_prob=1.0,
                   mixup_scale=[0.5, 1.5],
                   remove_outside_box=False):
          super(Mosaic, self).__init__()
          self.prob = prob
          if isinstance(input_dim, Integral):
              input_dim = [input_dim, input_dim]
          self.input_dim = input_dim
          self.degrees = degrees
          self.translate = translate
          self.scale = scale
          self.shear = shear
          self.enable_mixup = enable_mixup
          self.mixup_prob = mixup_prob
          self.mixup_scale = mixup_scale
          self.remove_outside_box = remove_outside_box

      def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w):
          """Locate one of the four tiles around the mosaic center.

          Args:
              mosaic_idx (int): 0 top left, 1 top right, 2 bottom left,
                  3 bottom right.
              xc, yc (int): mosaic center on the large canvas.
              w, h (int): size of the (resized) small image.
              input_h, input_w (int): input shape; the large canvas measures
                  ``2 * input_h`` by ``2 * input_w``.

          Returns:
              tuple: ``(x1, y1, x2, y2)`` on the large canvas and the
              matching ``(x1, y1, x2, y2)`` inside the small image.
          """
          # (x1, y1, x2, y2) means coords in large image,
          # small_coords means coords in small image in mosaic aug.
          if mosaic_idx == 0:
              # top left
              x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
              small_coords = w - (x2 - x1), h - (y2 - y1), w, h
          elif mosaic_idx == 1:
              # top right
              x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
              small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h
          elif mosaic_idx == 2:
              # bottom left
              x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
              small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h)
          elif mosaic_idx == 3:
              # bottom right
              x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(
                  input_h * 2, yc + h)
              small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h)

          return (x1, y1, x2, y2), small_coords

      def random_affine_augment(self,
                                img,
                                labels=[],
                                input_dim=[640, 640],
                                degrees=[-10, 10],
                                scales=[0.1, 2],
                                shears=[-2, 2],
                                translates=[-0.1, 0.1]):
          """Apply one random rotate/scale/shear/translate to image and boxes.

          Args:
              img (np.ndarray): image to warp.
              labels (np.ndarray): rows starting with ``x1, y1, x2, y2``;
                  the box columns are overwritten in place.
              input_dim (list[int]): output shape as [height, width], the
                  same convention as ``self.input_dim``.
              degrees, scales, shears, translates (list[2]): ``[min, max]``
                  sampling ranges.

          Returns:
              tuple: the warped image (``input_dim`` sized) and ``labels``.
          """
          input_h, input_w = input_dim[0], input_dim[1]

          # random rotation and scale
          degree = random.uniform(degrees[0], degrees[1])
          scale = random.uniform(scales[0], scales[1])
          assert scale > 0, "Argument scale should be positive."
          R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale)

          M = np.ones([2, 3])
          # random shear (the same angle is used on both axes)
          shear = random.uniform(shears[0], shears[1])
          shear_x = math.tan(shear * math.pi / 180)
          shear_y = math.tan(shear * math.pi / 180)
          M[0] = R[0] + shear_y * R[1]
          M[1] = R[1] + shear_x * R[0]

          # random translation: one ratio, scaled by the width for x and by
          # the height for y
          translate = random.uniform(translates[0], translates[1])
          M[0, 2] = translate * input_w
          M[1, 2] = translate * input_h

          # warpAffine expects dsize as (width, height)
          img = cv2.warpAffine(
              img, M, dsize=(input_w, input_h), borderValue=(114, 114, 114))

          num_gts = len(labels)
          if num_gts > 0:
              # warp corner points
              corner_points = np.ones((4 * num_gts, 3))
              corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
                  4 * num_gts, 2)  # x1y1, x2y2, x1y2, x2y1
              # apply affine transform
              corner_points = corner_points @ M.T
              corner_points = corner_points.reshape(num_gts, 8)

              # create new boxes: axis-aligned hull of the warped corners
              corner_xs = corner_points[:, 0::2]
              corner_ys = corner_points[:, 1::2]
              new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1),
                                           corner_xs.max(1), corner_ys.max(1)))
              new_bboxes = new_bboxes.reshape(4, num_gts).T

              # clip boxes: x against the width, y against the height
              new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_w)
              new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_h)
              labels[:, :4] = new_bboxes

          return img, labels

      def __call__(self, sample, context=None):
          """Build a mosaic sample out of five samples.

          Args:
              sample: a sequence of 5 sample dicts (4 mosaic tiles followed
                  by 1 mixup image); anything that is not a ``Sequence`` is
                  returned unchanged.
              context: unused here.

          Returns:
              dict: ``sample[0]``, either untouched (when the mosaic is
              skipped according to ``prob``) or with ``image``, ``h``, ``w``,
              ``im_shape``, ``gt_bbox``, ``gt_class`` and ``is_crowd``
              replaced by the mosaic result.
          """
          if not isinstance(sample, Sequence):
              return sample

          assert len(
              sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup."
          if np.random.uniform(0., 1.) > self.prob:
              return sample[0]

          mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd = [], [], []
          input_h, input_w = self.input_dim
          yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))
          xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))
          mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8)

          # 1. get mosaic coords
          for mosaic_idx, sp in enumerate(sample[:4]):
              img = sp['image']
              gt_bbox = sp['gt_bbox']
              h0, w0 = img.shape[:2]
              scale = min(1. * input_h / h0, 1. * input_w / w0)
              img = cv2.resize(
                  img, (int(w0 * scale), int(h0 * scale)),
                  interpolation=cv2.INTER_LINEAR)
              h, w = img.shape[:2]

              # suffix l means large image, while s means small image in mosaic aug.
              (l_x1, l_y1, l_x2, l_y2), (
                  s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords(
                      mosaic_idx, xc, yc, w, h, input_h, input_w)

              mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
              padw, padh = l_x1 - s_x1, l_y1 - s_y1

              # Rescale the xyxy boxes like the image and shift them to the
              # position of this tile on the large canvas.
              _gt_bbox = gt_bbox.copy()
              if len(gt_bbox) > 0:
                  _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw
                  _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh
                  _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw
                  _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh

              mosaic_gt_bbox.append(_gt_bbox)
              mosaic_gt_class.append(sp['gt_class'])
              mosaic_is_crowd.append(sp['is_crowd'])

          # 2. clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd])
          mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0)
          mosaic_gt_class = np.concatenate(mosaic_gt_class, 0)
          mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0)
          mosaic_labels = np.concatenate([
              mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
              mosaic_is_crowd.astype(mosaic_gt_bbox.dtype)
          ], 1)
          if self.remove_outside_box:
              # for MOT dataset: keep boxes overlapping the large canvas
              flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w
              flag2 = mosaic_gt_bbox[:, 2] > 0
              flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h
              flag4 = mosaic_gt_bbox[:, 3] > 0
              flag_all = flag1 * flag2 * flag3 * flag4
              mosaic_labels = mosaic_labels[flag_all]
          else:
              mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0,
                                            2 * input_w)
              mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0,
                                            2 * input_h)
              mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0,
                                            2 * input_w)
              mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0,
                                            2 * input_h)

          # 3. random_affine augment
          mosaic_img, mosaic_labels = self.random_affine_augment(
              mosaic_img,
              mosaic_labels,
              input_dim=self.input_dim,
              degrees=self.degrees,
              translates=self.translate,
              scales=self.scale,
              shears=self.shear)

          # 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177
          # optional, not used(enable_mixup=False) in tiny/nano
          if (self.enable_mixup and not len(mosaic_labels) == 0 and
                  random.random() < self.mixup_prob):
              sample_mixup = sample[4]
              mixup_img = sample_mixup['image']
              cp_labels = np.concatenate([
                  sample_mixup['gt_bbox'],
                  sample_mixup['gt_class'].astype(mosaic_labels.dtype),
                  sample_mixup['is_crowd'].astype(mosaic_labels.dtype)
              ], 1)
              mosaic_img, mosaic_labels = self.mixup_augment(
                  mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img)

          sample0 = sample[0]
          sample0['image'] = mosaic_img.astype(np.uint8)  # can not be float32
          sample0['h'] = float(mosaic_img.shape[0])
          sample0['w'] = float(mosaic_img.shape[1])
          sample0['im_shape'][0] = sample0['h']
          sample0['im_shape'][1] = sample0['w']
          sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32)
          sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32)
          sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32)
          return sample0

      def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels,
                        img):
          """Blend a second image (and its boxes) half-and-half into the mosaic.

          Args:
              origin_img (np.ndarray): mosaic image.
              origin_labels (np.ndarray): mosaic labels, one row per box as
                  ``[x1, y1, x2, y2, class, is_crowd]``.
              input_dim (list[int]): input shape as [height, width].
              cp_labels (np.ndarray): labels of the mixup image, same layout.
              img (np.ndarray): mixup image.

          Returns:
              tuple: the blended uint8 image and ``origin_labels`` with the
              transformed mixup labels stacked below.
          """
          jit_factor = random.uniform(*self.mixup_scale)
          FLIP = random.uniform(0, 1) > 0.5

          # Letterbox the mixup image on a gray canvas of the input shape.
          if len(img.shape) == 3:
              cp_img = np.ones(
                  (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114
          else:
              cp_img = np.ones(input_dim, dtype=np.uint8) * 114

          cp_scale_ratio = min(input_dim[0] / img.shape[0],
                               input_dim[1] / img.shape[1])
          resized_img = cv2.resize(
              img, (int(img.shape[1] * cp_scale_ratio),
                    int(img.shape[0] * cp_scale_ratio)),
              interpolation=cv2.INTER_LINEAR)

          cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[
              1] * cp_scale_ratio)] = resized_img

          # Random rescale of the whole canvas, followed by an optional flip.
          cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor),
                                       int(cp_img.shape[0] * jit_factor)))
          cp_scale_ratio *= jit_factor

          if FLIP:
              cp_img = cp_img[:, ::-1, :]

          # Zero-pad up to the mosaic size if needed, then take a random crop
          # of exactly the mosaic size.
          origin_h, origin_w = cp_img.shape[:2]
          target_h, target_w = origin_img.shape[:2]
          padded_img = np.zeros(
              (max(origin_h, target_h), max(origin_w, target_w), 3),
              dtype=np.uint8)
          padded_img[:origin_h, :origin_w] = cp_img

          x_offset, y_offset = 0, 0
          if padded_img.shape[0] > target_h:
              y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
          if padded_img.shape[1] > target_w:
              x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
          padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset:
                                          x_offset + target_w]

          # adjust boxes
          cp_bboxes_origin_np = cp_labels[:, :4].copy()
          cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] *
                                                 cp_scale_ratio, 0, origin_w)
          cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] *
                                                 cp_scale_ratio, 0, origin_h)

          if FLIP:
              # Mirror x1/x2 and swap them so that x1 <= x2 still holds.
              cp_bboxes_origin_np[:, 0::2] = (
                  origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1])
          cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
          if self.remove_outside_box:
              # for MOT dataset
              cp_bboxes_transformed_np[:, 0::2] -= x_offset
              cp_bboxes_transformed_np[:, 1::2] -= y_offset
          else:
              cp_bboxes_transformed_np[:, 0::2] = np.clip(
                  cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w)
              cp_bboxes_transformed_np[:, 1::2] = np.clip(
                  cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h)

          cls_labels = cp_labels[:, 4:5].copy()
          crd_labels = cp_labels[:, 5:6].copy()
          box_labels = cp_bboxes_transformed_np
          labels = np.hstack((box_labels, cls_labels, crd_labels))
          if self.remove_outside_box:
              labels = labels[labels[:, 0] < target_w]
              labels = labels[labels[:, 2] > 0]
              labels = labels[labels[:, 1] < target_h]
              labels = labels[labels[:, 3] > 0]

          origin_labels = np.vstack((origin_labels, labels))
          origin_img = origin_img.astype(np.float32)
          origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(
              np.float32)

          return origin_img.astype(np.uint8), origin_labels
  @register_op
  class PadResize(BaseOperator):
      """ PadResize for image and gt_bbox

      The image is resized with its aspect ratio kept so that it fits into
      ``target_size`` and is then padded at the bottom/right.

      Args:
          target_size (list[int]): input shape as [height, width]; a single
              integer is expanded to a square shape
          fill_value (float): pixel value of padded image
      """

      def __init__(self, target_size, fill_value=114):
          super(PadResize, self).__init__()
          if isinstance(target_size, Integral):
              target_size = [target_size, target_size]
          self.target_size = target_size
          self.fill_value = fill_value

      def _resize(self, img, bboxes, labels):
          """Resize the image and rescale the boxes with the same ratio.

          ``bboxes`` is scaled in place. Boxes whose shorter side is not
          larger than 1 pixel after scaling are dropped together with their
          labels.
          """
          src_h, src_w = img.shape[0], img.shape[1]
          ratio = min(self.target_size[0] / src_h, self.target_size[1] / src_w)
          new_w, new_h = int(src_w * ratio), int(src_h * ratio)
          resized_img = cv2.resize(
              img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

          if len(bboxes) > 0:
              bboxes *= ratio
              box_w = bboxes[:, 2] - bboxes[:, 0]
              box_h = bboxes[:, 3] - bboxes[:, 1]
              keep = np.minimum(box_w, box_h) > 1
              bboxes, labels = bboxes[keep], labels[keep]
          return resized_img, bboxes, labels

      def _pad(self, img):
          """Pad the image to ``target_size`` with ``fill_value`` (uint8).

          An image that already has the target size is returned as is.
          """
          h, w, _ = img.shape
          dst_h, dst_w = self.target_size[0], self.target_size[1]
          if h == dst_h and w == dst_w:
              return img
          canvas = np.full((dst_h, dst_w, 3), self.fill_value, dtype=np.uint8)
          # The image keeps the top-left corner; the rest stays filled.
          canvas[:h, :w] = img
          return canvas

      def apply(self, sample, context=None):
          """Resize and pad ``sample['image']``; update ``gt_bbox``/``gt_class``.

          The stored image is converted to float32.
          """
          resized, boxes, classes = self._resize(
              sample['image'], sample['gt_bbox'], sample['gt_class'])
          sample['image'] = self._pad(resized).astype(np.float32)
          sample['gt_bbox'] = boxes
          sample['gt_class'] = classes
          return sample