transforms.py 145 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
7377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855
  1. # Copyright (c) OpenMMLab. All rights reserved.
  2. import copy
  3. import inspect
  4. import math
  5. import warnings
  6. from typing import List, Optional, Sequence, Tuple, Union
  7. import cv2
  8. import mmcv
  9. import numpy
  10. import numpy as np
  11. from mmcv.image import imresize
  12. from mmcv.image.geometric import _scale_size
  13. from mmcv.transforms import BaseTransform
  14. from mmcv.transforms import Pad as MMCV_Pad
  15. from mmcv.transforms import RandomFlip as MMCV_RandomFlip
  16. from mmcv.transforms import Resize as MMCV_Resize
  17. from mmcv.transforms.utils import avoid_cache_randomness, cache_randomness
  18. from mmengine.dataset import BaseDataset
  19. from mmengine.utils import is_str
  20. from numpy import random
  21. from mmdet.registry import TRANSFORMS
  22. from mmdet.structures.bbox import HorizontalBoxes, autocast_box_type
  23. from mmdet.structures.mask import BitmapMasks, PolygonMasks
  24. from mmdet.utils import log_img_scale
  25. try:
  26. from imagecorruptions import corrupt
  27. except ImportError:
  28. corrupt = None
  29. try:
  30. import albumentations
  31. from albumentations import Compose
  32. except ImportError:
  33. albumentations = None
  34. Compose = None
  35. Number = Union[int, float]
  36. def _fixed_scale_size(
  37. size: Tuple[int, int],
  38. scale: Union[float, int, tuple],
  39. ) -> Tuple[int, int]:
  40. """Rescale a size by a ratio.
  41. Args:
  42. size (tuple[int]): (w, h).
  43. scale (float | tuple(float)): Scaling factor.
  44. Returns:
  45. tuple[int]: scaled size.
  46. """
  47. if isinstance(scale, (float, int)):
  48. scale = (scale, scale)
  49. w, h = size
  50. # don’t need o.5 offset
  51. return int(w * float(scale[0])), int(h * float(scale[1]))
  52. def rescale_size(old_size: tuple,
  53. scale: Union[float, int, tuple],
  54. return_scale: bool = False) -> tuple:
  55. """Calculate the new size to be rescaled to.
  56. Args:
  57. old_size (tuple[int]): The old size (w, h) of image.
  58. scale (float | tuple[int]): The scaling factor or maximum size.
  59. If it is a float number, then the image will be rescaled by this
  60. factor, else if it is a tuple of 2 integers, then the image will
  61. be rescaled as large as possible within the scale.
  62. return_scale (bool): Whether to return the scaling factor besides the
  63. rescaled image size.
  64. Returns:
  65. tuple[int]: The new rescaled image size.
  66. """
  67. w, h = old_size
  68. if isinstance(scale, (float, int)):
  69. if scale <= 0:
  70. raise ValueError(f'Invalid scale {scale}, must be positive.')
  71. scale_factor = scale
  72. elif isinstance(scale, tuple):
  73. max_long_edge = max(scale)
  74. max_short_edge = min(scale)
  75. scale_factor = min(max_long_edge / max(h, w),
  76. max_short_edge / min(h, w))
  77. else:
  78. raise TypeError(
  79. f'Scale must be a number or tuple of int, but got {type(scale)}')
  80. # only change this
  81. new_size = _fixed_scale_size((w, h), scale_factor)
  82. if return_scale:
  83. return new_size, scale_factor
  84. else:
  85. return new_size
  86. def imrescale(
  87. img: np.ndarray,
  88. scale: Union[float, Tuple[int, int]],
  89. return_scale: bool = False,
  90. interpolation: str = 'bilinear',
  91. backend: Optional[str] = None
  92. ) -> Union[np.ndarray, Tuple[np.ndarray, float]]:
  93. """Resize image while keeping the aspect ratio.
  94. Args:
  95. img (ndarray): The input image.
  96. scale (float | tuple[int]): The scaling factor or maximum size.
  97. If it is a float number, then the image will be rescaled by this
  98. factor, else if it is a tuple of 2 integers, then the image will
  99. be rescaled as large as possible within the scale.
  100. return_scale (bool): Whether to return the scaling factor besides the
  101. rescaled image.
  102. interpolation (str): Same as :func:`resize`.
  103. backend (str | None): Same as :func:`resize`.
  104. Returns:
  105. ndarray: The rescaled image.
  106. """
  107. h, w = img.shape[:2]
  108. new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
  109. rescaled_img = imresize(
  110. img, new_size, interpolation=interpolation, backend=backend)
  111. if return_scale:
  112. return rescaled_img, scale_factor
  113. else:
  114. return rescaled_img
  115. @TRANSFORMS.register_module()
  116. class Resize(MMCV_Resize):
  117. """Resize images & bbox & seg.
  118. This transform resizes the input image according to ``scale`` or
  119. ``scale_factor``. Bboxes, masks, and seg map are then resized
  120. with the same scale factor.
  121. if ``scale`` and ``scale_factor`` are both set, it will use ``scale`` to
  122. resize.
  123. Required Keys:
  124. - img
  125. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  126. - gt_masks (BitmapMasks | PolygonMasks) (optional)
  127. - gt_seg_map (np.uint8) (optional)
  128. Modified Keys:
  129. - img
  130. - img_shape
  131. - gt_bboxes
  132. - gt_masks
  133. - gt_seg_map
  134. Added Keys:
  135. - scale
  136. - scale_factor
  137. - keep_ratio
  138. - homography_matrix
  139. Args:
  140. scale (int or tuple): Images scales for resizing. Defaults to None
  141. scale_factor (float or tuple[float]): Scale factors for resizing.
  142. Defaults to None.
  143. keep_ratio (bool): Whether to keep the aspect ratio when resizing the
  144. image. Defaults to False.
  145. clip_object_border (bool): Whether to clip the objects
  146. outside the border of the image. In some dataset like MOT17, the gt
  147. bboxes are allowed to cross the border of images. Therefore, we
  148. don't need to clip the gt bboxes in these cases. Defaults to True.
  149. backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
  150. These two backends generates slightly different results. Defaults
  151. to 'cv2'.
  152. interpolation (str): Interpolation method, accepted values are
  153. "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
  154. backend, "nearest", "bilinear" for 'pillow' backend. Defaults
  155. to 'bilinear'.
  156. """
  157. def _resize_masks(self, results: dict) -> None:
  158. """Resize masks with ``results['scale']``"""
  159. if results.get('gt_masks', None) is not None:
  160. if self.keep_ratio:
  161. results['gt_masks'] = results['gt_masks'].rescale(
  162. results['scale'])
  163. else:
  164. results['gt_masks'] = results['gt_masks'].resize(
  165. results['img_shape'])
  166. def _resize_bboxes(self, results: dict) -> None:
  167. """Resize bounding boxes with ``results['scale_factor']``."""
  168. if results.get('gt_bboxes', None) is not None:
  169. results['gt_bboxes'].rescale_(results['scale_factor'])
  170. if self.clip_object_border:
  171. results['gt_bboxes'].clip_(results['img_shape'])
  172. def _record_homography_matrix(self, results: dict) -> None:
  173. """Record the homography matrix for the Resize."""
  174. w_scale, h_scale = results['scale_factor']
  175. homography_matrix = np.array(
  176. [[w_scale, 0, 0], [0, h_scale, 0], [0, 0, 1]], dtype=np.float32)
  177. if results.get('homography_matrix', None) is None:
  178. results['homography_matrix'] = homography_matrix
  179. else:
  180. results['homography_matrix'] = homography_matrix @ results[
  181. 'homography_matrix']
  182. @autocast_box_type()
  183. def transform(self, results: dict) -> dict:
  184. """Transform function to resize images, bounding boxes and semantic
  185. segmentation map.
  186. Args:
  187. results (dict): Result dict from loading pipeline.
  188. Returns:
  189. dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
  190. 'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys
  191. are updated in result dict.
  192. """
  193. if self.scale:
  194. results['scale'] = self.scale
  195. else:
  196. img_shape = results['img'].shape[:2]
  197. results['scale'] = _scale_size(img_shape[::-1], self.scale_factor)
  198. self._resize_img(results)
  199. self._resize_bboxes(results)
  200. self._resize_masks(results)
  201. self._resize_seg(results)
  202. self._record_homography_matrix(results)
  203. return results
  204. def __repr__(self) -> str:
  205. repr_str = self.__class__.__name__
  206. repr_str += f'(scale={self.scale}, '
  207. repr_str += f'scale_factor={self.scale_factor}, '
  208. repr_str += f'keep_ratio={self.keep_ratio}, '
  209. repr_str += f'clip_object_border={self.clip_object_border}), '
  210. repr_str += f'backend={self.backend}), '
  211. repr_str += f'interpolation={self.interpolation})'
  212. return repr_str
  213. @TRANSFORMS.register_module()
  214. class FixScaleResize(Resize):
  215. """Compared to Resize, FixScaleResize fixes the scaling issue when
  216. `keep_ratio=true`."""
  217. def _resize_img(self, results):
  218. """Resize images with ``results['scale']``."""
  219. if results.get('img', None) is not None:
  220. if self.keep_ratio:
  221. img, scale_factor = imrescale(
  222. results['img'],
  223. results['scale'],
  224. interpolation=self.interpolation,
  225. return_scale=True,
  226. backend=self.backend)
  227. new_h, new_w = img.shape[:2]
  228. h, w = results['img'].shape[:2]
  229. w_scale = new_w / w
  230. h_scale = new_h / h
  231. else:
  232. img, w_scale, h_scale = mmcv.imresize(
  233. results['img'],
  234. results['scale'],
  235. interpolation=self.interpolation,
  236. return_scale=True,
  237. backend=self.backend)
  238. results['img'] = img
  239. results['img_shape'] = img.shape[:2]
  240. results['scale_factor'] = (w_scale, h_scale)
  241. results['keep_ratio'] = self.keep_ratio
  242. @TRANSFORMS.register_module()
  243. class ResizeShortestEdge(BaseTransform):
  244. """Resize the image and mask while keeping the aspect ratio unchanged.
  245. Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130 # noqa:E501
  246. This transform attempts to scale the shorter edge to the given
  247. `scale`, as long as the longer edge does not exceed `max_size`.
  248. If `max_size` is reached, then downscale so that the longer
  249. edge does not exceed `max_size`.
  250. Required Keys:
  251. - img
  252. - gt_seg_map (optional)
  253. Modified Keys:
  254. - img
  255. - img_shape
  256. - gt_seg_map (optional))
  257. Added Keys:
  258. - scale
  259. - scale_factor
  260. - keep_ratio
  261. Args:
  262. scale (Union[int, Tuple[int, int]]): The target short edge length.
  263. If it's tuple, will select the min value as the short edge length.
  264. max_size (int): The maximum allowed longest edge length.
  265. """
  266. def __init__(self,
  267. scale: Union[int, Tuple[int, int]],
  268. max_size: Optional[int] = None,
  269. resize_type: str = 'Resize',
  270. **resize_kwargs) -> None:
  271. super().__init__()
  272. self.scale = scale
  273. self.max_size = max_size
  274. self.resize_cfg = dict(type=resize_type, **resize_kwargs)
  275. self.resize = TRANSFORMS.build({'scale': 0, **self.resize_cfg})
  276. def _get_output_shape(
  277. self, img: np.ndarray,
  278. short_edge_length: Union[int, Tuple[int, int]]) -> Tuple[int, int]:
  279. """Compute the target image shape with the given `short_edge_length`.
  280. Args:
  281. img (np.ndarray): The input image.
  282. short_edge_length (Union[int, Tuple[int, int]]): The target short
  283. edge length. If it's tuple, will select the min value as the
  284. short edge length.
  285. """
  286. h, w = img.shape[:2]
  287. if isinstance(short_edge_length, int):
  288. size = short_edge_length * 1.0
  289. elif isinstance(short_edge_length, tuple):
  290. size = min(short_edge_length) * 1.0
  291. scale = size / min(h, w)
  292. if h < w:
  293. new_h, new_w = size, scale * w
  294. else:
  295. new_h, new_w = scale * h, size
  296. if self.max_size and max(new_h, new_w) > self.max_size:
  297. scale = self.max_size * 1.0 / max(new_h, new_w)
  298. new_h *= scale
  299. new_w *= scale
  300. new_h = int(new_h + 0.5)
  301. new_w = int(new_w + 0.5)
  302. return new_w, new_h
  303. def transform(self, results: dict) -> dict:
  304. self.resize.scale = self._get_output_shape(results['img'], self.scale)
  305. return self.resize(results)
  306. @TRANSFORMS.register_module()
  307. class FixShapeResize(Resize):
  308. """Resize images & bbox & seg to the specified size.
  309. This transform resizes the input image according to ``width`` and
  310. ``height``. Bboxes, masks, and seg map are then resized
  311. with the same parameters.
  312. Required Keys:
  313. - img
  314. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  315. - gt_masks (BitmapMasks | PolygonMasks) (optional)
  316. - gt_seg_map (np.uint8) (optional)
  317. Modified Keys:
  318. - img
  319. - img_shape
  320. - gt_bboxes
  321. - gt_masks
  322. - gt_seg_map
  323. Added Keys:
  324. - scale
  325. - scale_factor
  326. - keep_ratio
  327. - homography_matrix
  328. Args:
  329. width (int): width for resizing.
  330. height (int): height for resizing.
  331. Defaults to None.
  332. pad_val (Number | dict[str, Number], optional): Padding value for if
  333. the pad_mode is "constant". If it is a single number, the value
  334. to pad the image is the number and to pad the semantic
  335. segmentation map is 255. If it is a dict, it should have the
  336. following keys:
  337. - img: The value to pad the image.
  338. - seg: The value to pad the semantic segmentation map.
  339. Defaults to dict(img=0, seg=255).
  340. keep_ratio (bool): Whether to keep the aspect ratio when resizing the
  341. image. Defaults to False.
  342. clip_object_border (bool): Whether to clip the objects
  343. outside the border of the image. In some dataset like MOT17, the gt
  344. bboxes are allowed to cross the border of images. Therefore, we
  345. don't need to clip the gt bboxes in these cases. Defaults to True.
  346. backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
  347. These two backends generates slightly different results. Defaults
  348. to 'cv2'.
  349. interpolation (str): Interpolation method, accepted values are
  350. "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
  351. backend, "nearest", "bilinear" for 'pillow' backend. Defaults
  352. to 'bilinear'.
  353. """
  354. def __init__(self,
  355. width: int,
  356. height: int,
  357. pad_val: Union[Number, dict] = dict(img=0, seg=255),
  358. keep_ratio: bool = False,
  359. clip_object_border: bool = True,
  360. backend: str = 'cv2',
  361. interpolation: str = 'bilinear') -> None:
  362. assert width is not None and height is not None, (
  363. '`width` and'
  364. '`height` can not be `None`')
  365. self.width = width
  366. self.height = height
  367. self.scale = (width, height)
  368. self.backend = backend
  369. self.interpolation = interpolation
  370. self.keep_ratio = keep_ratio
  371. self.clip_object_border = clip_object_border
  372. if keep_ratio is True:
  373. # padding to the fixed size when keep_ratio=True
  374. self.pad_transform = Pad(size=self.scale, pad_val=pad_val)
  375. @autocast_box_type()
  376. def transform(self, results: dict) -> dict:
  377. """Transform function to resize images, bounding boxes and semantic
  378. segmentation map.
  379. Args:
  380. results (dict): Result dict from loading pipeline.
  381. Returns:
  382. dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
  383. 'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys
  384. are updated in result dict.
  385. """
  386. img = results['img']
  387. h, w = img.shape[:2]
  388. if self.keep_ratio:
  389. scale_factor = min(self.width / w, self.height / h)
  390. results['scale_factor'] = (scale_factor, scale_factor)
  391. real_w, real_h = int(w * float(scale_factor) +
  392. 0.5), int(h * float(scale_factor) + 0.5)
  393. img, scale_factor = mmcv.imrescale(
  394. results['img'], (real_w, real_h),
  395. interpolation=self.interpolation,
  396. return_scale=True,
  397. backend=self.backend)
  398. # the w_scale and h_scale has minor difference
  399. # a real fix should be done in the mmcv.imrescale in the future
  400. results['img'] = img
  401. results['img_shape'] = img.shape[:2]
  402. results['keep_ratio'] = self.keep_ratio
  403. results['scale'] = (real_w, real_h)
  404. else:
  405. results['scale'] = (self.width, self.height)
  406. results['scale_factor'] = (self.width / w, self.height / h)
  407. super()._resize_img(results)
  408. self._resize_bboxes(results)
  409. self._resize_masks(results)
  410. self._resize_seg(results)
  411. self._record_homography_matrix(results)
  412. if self.keep_ratio:
  413. self.pad_transform(results)
  414. return results
  415. def __repr__(self) -> str:
  416. repr_str = self.__class__.__name__
  417. repr_str += f'(width={self.width}, height={self.height}, '
  418. repr_str += f'keep_ratio={self.keep_ratio}, '
  419. repr_str += f'clip_object_border={self.clip_object_border}), '
  420. repr_str += f'backend={self.backend}), '
  421. repr_str += f'interpolation={self.interpolation})'
  422. return repr_str
  423. @TRANSFORMS.register_module()
  424. class RandomFlip(MMCV_RandomFlip):
  425. """Flip the image & bbox & mask & segmentation map. Added or Updated keys:
  426. flip, flip_direction, img, gt_bboxes, and gt_seg_map. There are 3 flip
  427. modes:
  428. - ``prob`` is float, ``direction`` is string: the image will be
  429. ``direction``ly flipped with probability of ``prob`` .
  430. E.g., ``prob=0.5``, ``direction='horizontal'``,
  431. then image will be horizontally flipped with probability of 0.5.
  432. - ``prob`` is float, ``direction`` is list of string: the image will
  433. be ``direction[i]``ly flipped with probability of
  434. ``prob/len(direction)``.
  435. E.g., ``prob=0.5``, ``direction=['horizontal', 'vertical']``,
  436. then image will be horizontally flipped with probability of 0.25,
  437. vertically with probability of 0.25.
  438. - ``prob`` is list of float, ``direction`` is list of string:
  439. given ``len(prob) == len(direction)``, the image will
  440. be ``direction[i]``ly flipped with probability of ``prob[i]``.
  441. E.g., ``prob=[0.3, 0.5]``, ``direction=['horizontal',
  442. 'vertical']``, then image will be horizontally flipped with
  443. probability of 0.3, vertically with probability of 0.5.
  444. Required Keys:
  445. - img
  446. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  447. - gt_masks (BitmapMasks | PolygonMasks) (optional)
  448. - gt_seg_map (np.uint8) (optional)
  449. Modified Keys:
  450. - img
  451. - gt_bboxes
  452. - gt_masks
  453. - gt_seg_map
  454. Added Keys:
  455. - flip
  456. - flip_direction
  457. - homography_matrix
  458. Args:
  459. prob (float | list[float], optional): The flipping probability.
  460. Defaults to None.
  461. direction(str | list[str]): The flipping direction. Options
  462. If input is a list, the length must equal ``prob``. Each
  463. element in ``prob`` indicates the flip probability of
  464. corresponding direction. Defaults to 'horizontal'.
  465. """
  466. def _record_homography_matrix(self, results: dict) -> None:
  467. """Record the homography matrix for the RandomFlip."""
  468. cur_dir = results['flip_direction']
  469. h, w = results['img'].shape[:2]
  470. if cur_dir == 'horizontal':
  471. homography_matrix = np.array([[-1, 0, w], [0, 1, 0], [0, 0, 1]],
  472. dtype=np.float32)
  473. elif cur_dir == 'vertical':
  474. homography_matrix = np.array([[1, 0, 0], [0, -1, h], [0, 0, 1]],
  475. dtype=np.float32)
  476. elif cur_dir == 'diagonal':
  477. homography_matrix = np.array([[-1, 0, w], [0, -1, h], [0, 0, 1]],
  478. dtype=np.float32)
  479. else:
  480. homography_matrix = np.eye(3, dtype=np.float32)
  481. if results.get('homography_matrix', None) is None:
  482. results['homography_matrix'] = homography_matrix
  483. else:
  484. results['homography_matrix'] = homography_matrix @ results[
  485. 'homography_matrix']
  486. @autocast_box_type()
  487. def _flip(self, results: dict) -> None:
  488. """Flip images, bounding boxes, and semantic segmentation map."""
  489. # flip image
  490. results['img'] = mmcv.imflip(
  491. results['img'], direction=results['flip_direction'])
  492. img_shape = results['img'].shape[:2]
  493. # flip bboxes
  494. if results.get('gt_bboxes', None) is not None:
  495. results['gt_bboxes'].flip_(img_shape, results['flip_direction'])
  496. # flip masks
  497. if results.get('gt_masks', None) is not None:
  498. results['gt_masks'] = results['gt_masks'].flip(
  499. results['flip_direction'])
  500. # flip segs
  501. if results.get('gt_seg_map', None) is not None:
  502. results['gt_seg_map'] = mmcv.imflip(
  503. results['gt_seg_map'], direction=results['flip_direction'])
  504. # record homography matrix for flip
  505. self._record_homography_matrix(results)
  506. @TRANSFORMS.register_module()
  507. class RandomShift(BaseTransform):
  508. """Shift the image and box given shift pixels and probability.
  509. Required Keys:
  510. - img
  511. - gt_bboxes (BaseBoxes[torch.float32])
  512. - gt_bboxes_labels (np.int64)
  513. - gt_ignore_flags (bool) (optional)
  514. Modified Keys:
  515. - img
  516. - gt_bboxes
  517. - gt_bboxes_labels
  518. - gt_ignore_flags (bool) (optional)
  519. Args:
  520. prob (float): Probability of shifts. Defaults to 0.5.
  521. max_shift_px (int): The max pixels for shifting. Defaults to 32.
  522. filter_thr_px (int): The width and height threshold for filtering.
  523. The bbox and the rest of the targets below the width and
  524. height threshold will be filtered. Defaults to 1.
  525. """
  526. def __init__(self,
  527. prob: float = 0.5,
  528. max_shift_px: int = 32,
  529. filter_thr_px: int = 1) -> None:
  530. assert 0 <= prob <= 1
  531. assert max_shift_px >= 0
  532. self.prob = prob
  533. self.max_shift_px = max_shift_px
  534. self.filter_thr_px = int(filter_thr_px)
  535. @cache_randomness
  536. def _random_prob(self) -> float:
  537. return random.uniform(0, 1)
  538. @autocast_box_type()
  539. def transform(self, results: dict) -> dict:
  540. """Transform function to random shift images, bounding boxes.
  541. Args:
  542. results (dict): Result dict from loading pipeline.
  543. Returns:
  544. dict: Shift results.
  545. """
  546. if self._random_prob() < self.prob:
  547. img_shape = results['img'].shape[:2]
  548. random_shift_x = random.randint(-self.max_shift_px,
  549. self.max_shift_px)
  550. random_shift_y = random.randint(-self.max_shift_px,
  551. self.max_shift_px)
  552. new_x = max(0, random_shift_x)
  553. ori_x = max(0, -random_shift_x)
  554. new_y = max(0, random_shift_y)
  555. ori_y = max(0, -random_shift_y)
  556. # TODO: support mask and semantic segmentation maps.
  557. bboxes = results['gt_bboxes'].clone()
  558. bboxes.translate_([random_shift_x, random_shift_y])
  559. # clip border
  560. bboxes.clip_(img_shape)
  561. # remove invalid bboxes
  562. valid_inds = (bboxes.widths > self.filter_thr_px).numpy() & (
  563. bboxes.heights > self.filter_thr_px).numpy()
  564. # If the shift does not contain any gt-bbox area, skip this
  565. # image.
  566. if not valid_inds.any():
  567. return results
  568. bboxes = bboxes[valid_inds]
  569. results['gt_bboxes'] = bboxes
  570. results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
  571. valid_inds]
  572. if results.get('gt_ignore_flags', None) is not None:
  573. results['gt_ignore_flags'] = \
  574. results['gt_ignore_flags'][valid_inds]
  575. # shift img
  576. img = results['img']
  577. new_img = np.zeros_like(img)
  578. img_h, img_w = img.shape[:2]
  579. new_h = img_h - np.abs(random_shift_y)
  580. new_w = img_w - np.abs(random_shift_x)
  581. new_img[new_y:new_y + new_h, new_x:new_x + new_w] \
  582. = img[ori_y:ori_y + new_h, ori_x:ori_x + new_w]
  583. results['img'] = new_img
  584. return results
  585. def __repr__(self):
  586. repr_str = self.__class__.__name__
  587. repr_str += f'(prob={self.prob}, '
  588. repr_str += f'max_shift_px={self.max_shift_px}, '
  589. repr_str += f'filter_thr_px={self.filter_thr_px})'
  590. return repr_str
  591. @TRANSFORMS.register_module()
  592. class Pad(MMCV_Pad):
  593. """Pad the image & segmentation map.
  594. There are three padding modes: (1) pad to a fixed size and (2) pad to the
  595. minimum size that is divisible by some number. and (3)pad to square. Also,
  596. pad to square and pad to the minimum size can be used as the same time.
  597. Required Keys:
  598. - img
  599. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  600. - gt_masks (BitmapMasks | PolygonMasks) (optional)
  601. - gt_seg_map (np.uint8) (optional)
  602. Modified Keys:
  603. - img
  604. - img_shape
  605. - gt_masks
  606. - gt_seg_map
  607. Added Keys:
  608. - pad_shape
  609. - pad_fixed_size
  610. - pad_size_divisor
  611. Args:
  612. size (tuple, optional): Fixed padding size.
  613. Expected padding shape (width, height). Defaults to None.
  614. size_divisor (int, optional): The divisor of padded size. Defaults to
  615. None.
  616. pad_to_square (bool): Whether to pad the image into a square.
  617. Currently only used for YOLOX. Defaults to False.
  618. pad_val (Number | dict[str, Number], optional) - Padding value for if
  619. the pad_mode is "constant". If it is a single number, the value
  620. to pad the image is the number and to pad the semantic
  621. segmentation map is 255. If it is a dict, it should have the
  622. following keys:
  623. - img: The value to pad the image.
  624. - seg: The value to pad the semantic segmentation map.
  625. Defaults to dict(img=0, seg=255).
  626. padding_mode (str): Type of padding. Should be: constant, edge,
  627. reflect or symmetric. Defaults to 'constant'.
  628. - constant: pads with a constant value, this value is specified
  629. with pad_val.
  630. - edge: pads with the last value at the edge of the image.
  631. - reflect: pads with reflection of image without repeating the last
  632. value on the edge. For example, padding [1, 2, 3, 4] with 2
  633. elements on both sides in reflect mode will result in
  634. [3, 2, 1, 2, 3, 4, 3, 2].
  635. - symmetric: pads with reflection of image repeating the last value
  636. on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
  637. both sides in symmetric mode will result in
  638. [2, 1, 1, 2, 3, 4, 4, 3]
  639. """
  640. def _pad_masks(self, results: dict) -> None:
  641. """Pad masks according to ``results['pad_shape']``."""
  642. if results.get('gt_masks', None) is not None:
  643. pad_val = self.pad_val.get('masks', 0)
  644. pad_shape = results['pad_shape'][:2]
  645. results['gt_masks'] = results['gt_masks'].pad(
  646. pad_shape, pad_val=pad_val)
  647. def transform(self, results: dict) -> dict:
  648. """Call function to pad images, masks, semantic segmentation maps.
  649. Args:
  650. results (dict): Result dict from loading pipeline.
  651. Returns:
  652. dict: Updated result dict.
  653. """
  654. self._pad_img(results)
  655. self._pad_seg(results)
  656. self._pad_masks(results)
  657. return results
@TRANSFORMS.register_module()
class RandomCrop(BaseTransform):
    """Random crop the image & bboxes & masks.

    The absolute ``crop_size`` is sampled based on ``crop_type`` and
    ``image_size``, then the cropped results are generated.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_ignore_flags (bool) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_masks (optional)
    - gt_ignore_flags (optional)
    - gt_seg_map (optional)
    - gt_instances_ids (options, only used in MOT/VIS)

    Added Keys:

    - homography_matrix

    Args:
        crop_size (tuple): The relative ratio or absolute pixels of
            (width, height).
        crop_type (str, optional): One of "relative_range", "relative",
            "absolute", "absolute_range". "relative" randomly crops
            (h * crop_size[0], w * crop_size[1]) part from an input of size
            (h, w). "relative_range" uniformly samples relative crop size from
            range [crop_size[0], 1] and [crop_size[1], 1] for height and width
            respectively. "absolute" crops from an input with absolute size
            (crop_size[0], crop_size[1]). "absolute_range" uniformly samples
            crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w
            in range [crop_size[0], min(w, crop_size[1])].
            Defaults to "absolute".
        allow_negative_crop (bool, optional): Whether to allow a crop that does
            not contain any bbox area. Defaults to False.
        recompute_bbox (bool, optional): Whether to re-compute the boxes based
            on cropped instance masks. Defaults to False.
        bbox_clip_border (bool, optional): Whether clip the objects outside
            the border of the image. Defaults to True.

    Note:
        - If the image is smaller than the absolute crop size, return the
          original image.
        - The keys for bboxes, labels and masks must be aligned. That is,
          ``gt_bboxes`` corresponds to ``gt_labels`` and ``gt_masks``, and
          ``gt_bboxes_ignore`` corresponds to ``gt_labels_ignore`` and
          ``gt_masks_ignore``.
        - If the crop does not contain any gt-bbox region and
          ``allow_negative_crop`` is set to False, skip this image.
    """

    def __init__(self,
                 crop_size: tuple,
                 crop_type: str = 'absolute',
                 allow_negative_crop: bool = False,
                 recompute_bbox: bool = False,
                 bbox_clip_border: bool = True) -> None:
        # Validate crop_type/crop_size combinations up front so that bad
        # configs fail at construction time rather than mid-pipeline.
        if crop_type not in [
                'relative_range', 'relative', 'absolute', 'absolute_range'
        ]:
            raise ValueError(f'Invalid crop_type {crop_type}.')
        if crop_type in ['absolute', 'absolute_range']:
            # Absolute modes need positive integer pixel sizes.
            assert crop_size[0] > 0 and crop_size[1] > 0
            assert isinstance(crop_size[0], int) and isinstance(
                crop_size[1], int)
            if crop_type == 'absolute_range':
                assert crop_size[0] <= crop_size[1]
        else:
            # Relative modes need ratios in (0, 1].
            assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1
        self.crop_size = crop_size
        self.crop_type = crop_type
        self.allow_negative_crop = allow_negative_crop
        self.bbox_clip_border = bbox_clip_border
        self.recompute_bbox = recompute_bbox

    def _crop_data(self, results: dict, crop_size: Tuple[int, int],
                   allow_negative_crop: bool) -> Union[dict, None]:
        """Function to randomly crop images, bounding boxes, masks, semantic
        segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.
            crop_size (Tuple[int, int]): Expected absolute size after
                cropping, (h, w).
            allow_negative_crop (bool): Whether to allow a crop that does not
                contain any bbox area.

        Returns:
            results (Union[dict, None]): Randomly cropped results, 'img_shape'
                key in result dict is updated according to crop size. None will
                be returned when there is no valid bbox after cropping.
        """
        assert crop_size[0] > 0 and crop_size[1] > 0
        img = results['img']
        # Sample the crop's top-left corner inside the feasible margin; a
        # non-positive margin (image smaller than crop) degrades to offset 0.
        margin_h = max(img.shape[0] - crop_size[0], 0)
        margin_w = max(img.shape[1] - crop_size[1], 0)
        offset_h, offset_w = self._rand_offset((margin_h, margin_w))
        crop_y1, crop_y2 = offset_h, offset_h + crop_size[0]
        crop_x1, crop_x2 = offset_w, offset_w + crop_size[1]

        # Record the homography matrix for the RandomCrop (a pure
        # translation), composing with any earlier geometric transforms.
        homography_matrix = np.array(
            [[1, 0, -offset_w], [0, 1, -offset_h], [0, 0, 1]],
            dtype=np.float32)
        if results.get('homography_matrix', None) is None:
            results['homography_matrix'] = homography_matrix
        else:
            results['homography_matrix'] = homography_matrix @ results[
                'homography_matrix']

        # crop the image
        img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
        img_shape = img.shape
        results['img'] = img
        results['img_shape'] = img_shape[:2]

        # crop bboxes accordingly and clip to the image boundary
        if results.get('gt_bboxes', None) is not None:
            bboxes = results['gt_bboxes']
            # Note: translate_ mutates the boxes in place.
            bboxes.translate_([-offset_w, -offset_h])
            if self.bbox_clip_border:
                bboxes.clip_(img_shape[:2])
            valid_inds = bboxes.is_inside(img_shape[:2]).numpy()
            # If the crop does not contain any gt-bbox area and
            # allow_negative_crop is False, skip this image.
            if (not valid_inds.any() and not allow_negative_crop):
                return None
            results['gt_bboxes'] = bboxes[valid_inds]

            if results.get('gt_ignore_flags', None) is not None:
                results['gt_ignore_flags'] = \
                    results['gt_ignore_flags'][valid_inds]

            if results.get('gt_bboxes_labels', None) is not None:
                results['gt_bboxes_labels'] = \
                    results['gt_bboxes_labels'][valid_inds]

            if results.get('gt_masks', None) is not None:
                # Masks are cropped with the (x1, y1, x2, y2) patch; kept
                # indices must match the surviving boxes above.
                results['gt_masks'] = results['gt_masks'][
                    valid_inds.nonzero()[0]].crop(
                        np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
                if self.recompute_bbox:
                    # Tighten boxes to the cropped masks' extents.
                    results['gt_bboxes'] = results['gt_masks'].get_bboxes(
                        type(results['gt_bboxes']))

            # We should remove the instance ids corresponding to invalid boxes.
            if results.get('gt_instances_ids', None) is not None:
                results['gt_instances_ids'] = \
                    results['gt_instances_ids'][valid_inds]

        # crop semantic seg
        if results.get('gt_seg_map', None) is not None:
            results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2,
                                                          crop_x1:crop_x2]

        return results

    @cache_randomness
    def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]:
        """Randomly generate crop offset.

        Args:
            margin (Tuple[int, int]): The upper bound for the offset generated
                randomly.

        Returns:
            Tuple[int, int]: The random offset for the crop.
        """
        margin_h, margin_w = margin
        # randint upper bound is exclusive, hence the +1 to include margin.
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)

        return offset_h, offset_w

    @cache_randomness
    def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]:
        """Randomly generates the absolute crop size based on `crop_type` and
        `image_size`.

        Args:
            image_size (Tuple[int, int]): (h, w).

        Returns:
            crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels.
        """
        h, w = image_size
        if self.crop_type == 'absolute':
            # self.crop_size is (w, h); the return order is (h, w).
            return min(self.crop_size[1], h), min(self.crop_size[0], w)
        elif self.crop_type == 'absolute_range':
            crop_h = np.random.randint(
                min(h, self.crop_size[0]),
                min(h, self.crop_size[1]) + 1)
            crop_w = np.random.randint(
                min(w, self.crop_size[0]),
                min(w, self.crop_size[1]) + 1)
            return crop_h, crop_w
        elif self.crop_type == 'relative':
            crop_w, crop_h = self.crop_size
            # +0.5 rounds to nearest pixel.
            return int(h * crop_h + 0.5), int(w * crop_w + 0.5)
        else:
            # 'relative_range': sample each ratio uniformly in [ratio, 1].
            crop_size = np.asarray(self.crop_size, dtype=np.float32)
            crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size)
            return int(h * crop_h + 0.5), int(w * crop_w + 0.5)

    @autocast_box_type()
    def transform(self, results: dict) -> Union[dict, None]:
        """Transform function to randomly crop images, bounding boxes, masks,
        semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            results (Union[dict, None]): Randomly cropped results, 'img_shape'
                key in result dict is updated according to crop size. None will
                be returned when there is no valid bbox after cropping.
        """
        image_size = results['img'].shape[:2]
        crop_size = self._get_crop_size(image_size)
        results = self._crop_data(results, crop_size, self.allow_negative_crop)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(crop_size={self.crop_size}, '
        repr_str += f'crop_type={self.crop_type}, '
        repr_str += f'allow_negative_crop={self.allow_negative_crop}, '
        repr_str += f'recompute_bbox={self.recompute_bbox}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str
  867. @TRANSFORMS.register_module()
  868. class SegRescale(BaseTransform):
  869. """Rescale semantic segmentation maps.
  870. This transform rescale the ``gt_seg_map`` according to ``scale_factor``.
  871. Required Keys:
  872. - gt_seg_map
  873. Modified Keys:
  874. - gt_seg_map
  875. Args:
  876. scale_factor (float): The scale factor of the final output. Defaults
  877. to 1.
  878. backend (str): Image rescale backend, choices are 'cv2' and 'pillow'.
  879. These two backends generates slightly different results. Defaults
  880. to 'cv2'.
  881. """
  882. def __init__(self, scale_factor: float = 1, backend: str = 'cv2') -> None:
  883. self.scale_factor = scale_factor
  884. self.backend = backend
  885. def transform(self, results: dict) -> dict:
  886. """Transform function to scale the semantic segmentation map.
  887. Args:
  888. results (dict): Result dict from loading pipeline.
  889. Returns:
  890. dict: Result dict with semantic segmentation map scaled.
  891. """
  892. if self.scale_factor != 1:
  893. results['gt_seg_map'] = mmcv.imrescale(
  894. results['gt_seg_map'],
  895. self.scale_factor,
  896. interpolation='nearest',
  897. backend=self.backend)
  898. return results
  899. def __repr__(self) -> str:
  900. repr_str = self.__class__.__name__
  901. repr_str += f'(scale_factor={self.scale_factor}, '
  902. repr_str += f'backend={self.backend})'
  903. return repr_str
  904. @TRANSFORMS.register_module()
  905. class PhotoMetricDistortion(BaseTransform):
  906. """Apply photometric distortion to image sequentially, every transformation
  907. is applied with a probability of 0.5. The position of random contrast is in
  908. second or second to last.
  909. 1. random brightness
  910. 2. random contrast (mode 0)
  911. 3. convert color from BGR to HSV
  912. 4. random saturation
  913. 5. random hue
  914. 6. convert color from HSV to BGR
  915. 7. random contrast (mode 1)
  916. 8. randomly swap channels
  917. Required Keys:
  918. - img (np.uint8)
  919. Modified Keys:
  920. - img (np.float32)
  921. Args:
  922. brightness_delta (int): delta of brightness.
  923. contrast_range (sequence): range of contrast.
  924. saturation_range (sequence): range of saturation.
  925. hue_delta (int): delta of hue.
  926. """
  927. def __init__(self,
  928. brightness_delta: int = 32,
  929. contrast_range: Sequence[Number] = (0.5, 1.5),
  930. saturation_range: Sequence[Number] = (0.5, 1.5),
  931. hue_delta: int = 18) -> None:
  932. self.brightness_delta = brightness_delta
  933. self.contrast_lower, self.contrast_upper = contrast_range
  934. self.saturation_lower, self.saturation_upper = saturation_range
  935. self.hue_delta = hue_delta
  936. @cache_randomness
  937. def _random_flags(self) -> Sequence[Number]:
  938. mode = random.randint(2)
  939. brightness_flag = random.randint(2)
  940. contrast_flag = random.randint(2)
  941. saturation_flag = random.randint(2)
  942. hue_flag = random.randint(2)
  943. swap_flag = random.randint(2)
  944. delta_value = random.uniform(-self.brightness_delta,
  945. self.brightness_delta)
  946. alpha_value = random.uniform(self.contrast_lower, self.contrast_upper)
  947. saturation_value = random.uniform(self.saturation_lower,
  948. self.saturation_upper)
  949. hue_value = random.uniform(-self.hue_delta, self.hue_delta)
  950. swap_value = random.permutation(3)
  951. return (mode, brightness_flag, contrast_flag, saturation_flag,
  952. hue_flag, swap_flag, delta_value, alpha_value,
  953. saturation_value, hue_value, swap_value)
  954. def transform(self, results: dict) -> dict:
  955. """Transform function to perform photometric distortion on images.
  956. Args:
  957. results (dict): Result dict from loading pipeline.
  958. Returns:
  959. dict: Result dict with images distorted.
  960. """
  961. assert 'img' in results, '`img` is not found in results'
  962. img = results['img']
  963. img = img.astype(np.float32)
  964. (mode, brightness_flag, contrast_flag, saturation_flag, hue_flag,
  965. swap_flag, delta_value, alpha_value, saturation_value, hue_value,
  966. swap_value) = self._random_flags()
  967. # random brightness
  968. if brightness_flag:
  969. img += delta_value
  970. # mode == 0 --> do random contrast first
  971. # mode == 1 --> do random contrast last
  972. if mode == 1:
  973. if contrast_flag:
  974. img *= alpha_value
  975. # convert color from BGR to HSV
  976. img = mmcv.bgr2hsv(img)
  977. # random saturation
  978. if saturation_flag:
  979. img[..., 1] *= saturation_value
  980. # For image(type=float32), after convert bgr to hsv by opencv,
  981. # valid saturation value range is [0, 1]
  982. if saturation_value > 1:
  983. img[..., 1] = img[..., 1].clip(0, 1)
  984. # random hue
  985. if hue_flag:
  986. img[..., 0] += hue_value
  987. img[..., 0][img[..., 0] > 360] -= 360
  988. img[..., 0][img[..., 0] < 0] += 360
  989. # convert color from HSV to BGR
  990. img = mmcv.hsv2bgr(img)
  991. # random contrast
  992. if mode == 0:
  993. if contrast_flag:
  994. img *= alpha_value
  995. # randomly swap channels
  996. if swap_flag:
  997. img = img[..., swap_value]
  998. results['img'] = img
  999. return results
  1000. def __repr__(self) -> str:
  1001. repr_str = self.__class__.__name__
  1002. repr_str += f'(brightness_delta={self.brightness_delta}, '
  1003. repr_str += 'contrast_range='
  1004. repr_str += f'{(self.contrast_lower, self.contrast_upper)}, '
  1005. repr_str += 'saturation_range='
  1006. repr_str += f'{(self.saturation_lower, self.saturation_upper)}, '
  1007. repr_str += f'hue_delta={self.hue_delta})'
  1008. return repr_str
  1009. @TRANSFORMS.register_module()
  1010. class Expand(BaseTransform):
  1011. """Random expand the image & bboxes & masks & segmentation map.
  1012. Randomly place the original image on a canvas of ``ratio`` x original image
  1013. size filled with mean values. The ratio is in the range of ratio_range.
  1014. Required Keys:
  1015. - img
  1016. - img_shape
  1017. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  1018. - gt_masks (BitmapMasks | PolygonMasks) (optional)
  1019. - gt_seg_map (np.uint8) (optional)
  1020. Modified Keys:
  1021. - img
  1022. - img_shape
  1023. - gt_bboxes
  1024. - gt_masks
  1025. - gt_seg_map
  1026. Args:
  1027. mean (sequence): mean value of dataset.
  1028. to_rgb (bool): if need to convert the order of mean to align with RGB.
  1029. ratio_range (sequence)): range of expand ratio.
  1030. seg_ignore_label (int): label of ignore segmentation map.
  1031. prob (float): probability of applying this transformation
  1032. """
  1033. def __init__(self,
  1034. mean: Sequence[Number] = (0, 0, 0),
  1035. to_rgb: bool = True,
  1036. ratio_range: Sequence[Number] = (1, 4),
  1037. seg_ignore_label: int = None,
  1038. prob: float = 0.5) -> None:
  1039. self.to_rgb = to_rgb
  1040. self.ratio_range = ratio_range
  1041. if to_rgb:
  1042. self.mean = mean[::-1]
  1043. else:
  1044. self.mean = mean
  1045. self.min_ratio, self.max_ratio = ratio_range
  1046. self.seg_ignore_label = seg_ignore_label
  1047. self.prob = prob
  1048. @cache_randomness
  1049. def _random_prob(self) -> float:
  1050. return random.uniform(0, 1)
  1051. @cache_randomness
  1052. def _random_ratio(self) -> float:
  1053. return random.uniform(self.min_ratio, self.max_ratio)
  1054. @cache_randomness
  1055. def _random_left_top(self, ratio: float, h: int,
  1056. w: int) -> Tuple[int, int]:
  1057. left = int(random.uniform(0, w * ratio - w))
  1058. top = int(random.uniform(0, h * ratio - h))
  1059. return left, top
  1060. @autocast_box_type()
  1061. def transform(self, results: dict) -> dict:
  1062. """Transform function to expand images, bounding boxes, masks,
  1063. segmentation map.
  1064. Args:
  1065. results (dict): Result dict from loading pipeline.
  1066. Returns:
  1067. dict: Result dict with images, bounding boxes, masks, segmentation
  1068. map expanded.
  1069. """
  1070. if self._random_prob() > self.prob:
  1071. return results
  1072. assert 'img' in results, '`img` is not found in results'
  1073. img = results['img']
  1074. h, w, c = img.shape
  1075. ratio = self._random_ratio()
  1076. # speedup expand when meets large image
  1077. if np.all(self.mean == self.mean[0]):
  1078. expand_img = np.empty((int(h * ratio), int(w * ratio), c),
  1079. img.dtype)
  1080. expand_img.fill(self.mean[0])
  1081. else:
  1082. expand_img = np.full((int(h * ratio), int(w * ratio), c),
  1083. self.mean,
  1084. dtype=img.dtype)
  1085. left, top = self._random_left_top(ratio, h, w)
  1086. expand_img[top:top + h, left:left + w] = img
  1087. results['img'] = expand_img
  1088. results['img_shape'] = expand_img.shape[:2]
  1089. # expand bboxes
  1090. if results.get('gt_bboxes', None) is not None:
  1091. results['gt_bboxes'].translate_([left, top])
  1092. # expand masks
  1093. if results.get('gt_masks', None) is not None:
  1094. results['gt_masks'] = results['gt_masks'].expand(
  1095. int(h * ratio), int(w * ratio), top, left)
  1096. # expand segmentation map
  1097. if results.get('gt_seg_map', None) is not None:
  1098. gt_seg = results['gt_seg_map']
  1099. expand_gt_seg = np.full((int(h * ratio), int(w * ratio)),
  1100. self.seg_ignore_label,
  1101. dtype=gt_seg.dtype)
  1102. expand_gt_seg[top:top + h, left:left + w] = gt_seg
  1103. results['gt_seg_map'] = expand_gt_seg
  1104. return results
  1105. def __repr__(self) -> str:
  1106. repr_str = self.__class__.__name__
  1107. repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, '
  1108. repr_str += f'ratio_range={self.ratio_range}, '
  1109. repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
  1110. repr_str += f'prob={self.prob})'
  1111. return repr_str
  1112. @TRANSFORMS.register_module()
  1113. class MinIoURandomCrop(BaseTransform):
  1114. """Random crop the image & bboxes & masks & segmentation map, the cropped
  1115. patches have minimum IoU requirement with original image & bboxes & masks.
  1116. & segmentation map, the IoU threshold is randomly selected from min_ious.
  1117. Required Keys:
  1118. - img
  1119. - img_shape
  1120. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  1121. - gt_bboxes_labels (np.int64) (optional)
  1122. - gt_masks (BitmapMasks | PolygonMasks) (optional)
  1123. - gt_ignore_flags (bool) (optional)
  1124. - gt_seg_map (np.uint8) (optional)
  1125. Modified Keys:
  1126. - img
  1127. - img_shape
  1128. - gt_bboxes
  1129. - gt_bboxes_labels
  1130. - gt_masks
  1131. - gt_ignore_flags
  1132. - gt_seg_map
  1133. Args:
  1134. min_ious (Sequence[float]): minimum IoU threshold for all intersections
  1135. with bounding boxes.
  1136. min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
  1137. where a >= min_crop_size).
  1138. bbox_clip_border (bool, optional): Whether clip the objects outside
  1139. the border of the image. Defaults to True.
  1140. """
  1141. def __init__(self,
  1142. min_ious: Sequence[float] = (0.1, 0.3, 0.5, 0.7, 0.9),
  1143. min_crop_size: float = 0.3,
  1144. bbox_clip_border: bool = True) -> None:
  1145. self.min_ious = min_ious
  1146. self.sample_mode = (1, *min_ious, 0)
  1147. self.min_crop_size = min_crop_size
  1148. self.bbox_clip_border = bbox_clip_border
  1149. @cache_randomness
  1150. def _random_mode(self) -> Number:
  1151. return random.choice(self.sample_mode)
  1152. @autocast_box_type()
  1153. def transform(self, results: dict) -> dict:
  1154. """Transform function to crop images and bounding boxes with minimum
  1155. IoU constraint.
  1156. Args:
  1157. results (dict): Result dict from loading pipeline.
  1158. Returns:
  1159. dict: Result dict with images and bounding boxes cropped, \
  1160. 'img_shape' key is updated.
  1161. """
  1162. assert 'img' in results, '`img` is not found in results'
  1163. assert 'gt_bboxes' in results, '`gt_bboxes` is not found in results'
  1164. img = results['img']
  1165. boxes = results['gt_bboxes']
  1166. h, w = img.shape
  1167. while True:
  1168. mode = self._random_mode()
  1169. self.mode = mode
  1170. if mode == 1:
  1171. return results
  1172. min_iou = self.mode
  1173. for i in range(50):
  1174. new_w = random.uniform(self.min_crop_size * w, w)
  1175. new_h = random.uniform(self.min_crop_size * h, h)
  1176. # h / w in [0.5, 2]
  1177. if new_h / new_w < 0.5 or new_h / new_w > 2:
  1178. continue
  1179. left = random.uniform(w - new_w)
  1180. top = random.uniform(h - new_h)
  1181. patch = np.array(
  1182. (int(left), int(top), int(left + new_w), int(top + new_h)))
  1183. # Line or point crop is not allowed
  1184. if patch[2] == patch[0] or patch[3] == patch[1]:
  1185. continue
  1186. overlaps = boxes.overlaps(
  1187. HorizontalBoxes(patch.reshape(-1, 4).astype(np.float32)),
  1188. boxes).numpy().reshape(-1)
  1189. if len(overlaps) > 0 and overlaps.min() < min_iou:
  1190. continue
  1191. # center of boxes should inside the crop img
  1192. # only adjust boxes and instance masks when the gt is not empty
  1193. if len(overlaps) > 0:
  1194. # adjust boxes
  1195. def is_center_of_bboxes_in_patch(boxes, patch):
  1196. centers = boxes.centers.numpy()
  1197. mask = ((centers[:, 0] > patch[0]) *
  1198. (centers[:, 1] > patch[1]) *
  1199. (centers[:, 0] < patch[2]) *
  1200. (centers[:, 1] < patch[3]))
  1201. return mask
  1202. mask = is_center_of_bboxes_in_patch(boxes, patch)
  1203. if not mask.any():
  1204. continue
  1205. if results.get('gt_bboxes', None) is not None:
  1206. boxes = results['gt_bboxes']
  1207. mask = is_center_of_bboxes_in_patch(boxes, patch)
  1208. boxes = boxes[mask]
  1209. boxes.translate_([-patch[0], -patch[1]])
  1210. if self.bbox_clip_border:
  1211. boxes.clip_(
  1212. [patch[3] - patch[1], patch[2] - patch[0]])
  1213. results['gt_bboxes'] = boxes
  1214. # ignore_flags
  1215. if results.get('gt_ignore_flags', None) is not None:
  1216. results['gt_ignore_flags'] = \
  1217. results['gt_ignore_flags'][mask]
  1218. # labels
  1219. if results.get('gt_bboxes_labels', None) is not None:
  1220. results['gt_bboxes_labels'] = results[
  1221. 'gt_bboxes_labels'][mask]
  1222. # mask fields
  1223. if results.get('gt_masks', None) is not None:
  1224. results['gt_masks'] = results['gt_masks'][
  1225. mask.nonzero()[0]].crop(patch)
  1226. # adjust the img no matter whether the gt is empty before crop
  1227. img = img[patch[1]:patch[3], patch[0]:patch[2]]
  1228. results['img'] = img
  1229. results['img_shape'] = img.shape[:2]
  1230. # seg fields
  1231. if results.get('gt_seg_map', None) is not None:
  1232. results['gt_seg_map'] = results['gt_seg_map'][
  1233. patch[1]:patch[3], patch[0]:patch[2]]
  1234. return results
  1235. def __repr__(self) -> str:
  1236. repr_str = self.__class__.__name__
  1237. repr_str += f'(min_ious={self.min_ious}, '
  1238. repr_str += f'min_crop_size={self.min_crop_size}, '
  1239. repr_str += f'bbox_clip_border={self.bbox_clip_border})'
  1240. return repr_str
  1241. @TRANSFORMS.register_module()
  1242. class Corrupt(BaseTransform):
  1243. """Corruption augmentation.
  1244. Corruption transforms implemented based on
  1245. `imagecorruptions <https://github.com/bethgelab/imagecorruptions>`_.
  1246. Required Keys:
  1247. - img (np.uint8)
  1248. Modified Keys:
  1249. - img (np.uint8)
  1250. Args:
  1251. corruption (str): Corruption name.
  1252. severity (int): The severity of corruption. Defaults to 1.
  1253. """
  1254. def __init__(self, corruption: str, severity: int = 1) -> None:
  1255. self.corruption = corruption
  1256. self.severity = severity
  1257. def transform(self, results: dict) -> dict:
  1258. """Call function to corrupt image.
  1259. Args:
  1260. results (dict): Result dict from loading pipeline.
  1261. Returns:
  1262. dict: Result dict with images corrupted.
  1263. """
  1264. if corrupt is None:
  1265. raise RuntimeError('imagecorruptions is not installed')
  1266. results['img'] = corrupt(
  1267. results['img'].astype(np.uint8),
  1268. corruption_name=self.corruption,
  1269. severity=self.severity)
  1270. return results
  1271. def __repr__(self) -> str:
  1272. repr_str = self.__class__.__name__
  1273. repr_str += f'(corruption={self.corruption}, '
  1274. repr_str += f'severity={self.severity})'
  1275. return repr_str
@TRANSFORMS.register_module()
@avoid_cache_randomness
class Albu(BaseTransform):
    """Albumentation augmentation.

    Adds custom transformations from Albumentations library.
    Please, visit `https://albumentations.readthedocs.io`
    to get more information.

    Required Keys:

    - img (np.uint8)
    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)

    Modified Keys:

    - img (np.uint8)
    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - img_shape (tuple)

    An example of ``transforms`` is as followed:

    .. code-block::

        [
            dict(
                type='ShiftScaleRotate',
                shift_limit=0.0625,
                scale_limit=0.0,
                rotate_limit=0,
                interpolation=1,
                p=0.5),
            dict(
                type='RandomBrightnessContrast',
                brightness_limit=[0.1, 0.3],
                contrast_limit=[0.1, 0.3],
                p=0.2),
            dict(type='ChannelShuffle', p=0.1),
            dict(
                type='OneOf',
                transforms=[
                    dict(type='Blur', blur_limit=3, p=1.0),
                    dict(type='MedianBlur', blur_limit=3, p=1.0)
                ],
                p=0.1),
        ]

    Args:
        transforms (list[dict]): A list of albu transformations
        bbox_params (dict, optional): Bbox_params for albumentation `Compose`
        keymap (dict, optional): Contains
            {'input key':'albumentation-style key'}
        skip_img_without_anno (bool): Whether to skip the image if no ann left
            after aug. Defaults to False.
    """

    def __init__(self,
                 transforms: List[dict],
                 bbox_params: Optional[dict] = None,
                 keymap: Optional[dict] = None,
                 skip_img_without_anno: bool = False) -> None:
        if Compose is None:
            raise RuntimeError('albumentations is not installed')

        # Args will be modified later, copying it will be safer
        transforms = copy.deepcopy(transforms)
        if bbox_params is not None:
            bbox_params = copy.deepcopy(bbox_params)
        if keymap is not None:
            keymap = copy.deepcopy(keymap)
        self.transforms = transforms
        self.filter_lost_elements = False
        self.skip_img_without_anno = skip_img_without_anno

        # A simple workaround to remove masks without boxes:
        # route label filtration through an internal 'idx_mapper' field so
        # elements dropped by albumentations can be mirrored onto the
        # original label fields in ``_postprocess_results``.
        if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params
                and 'filter_lost_elements' in bbox_params):
            self.filter_lost_elements = True
            self.origin_label_fields = bbox_params['label_fields']
            bbox_params['label_fields'] = ['idx_mapper']
            del bbox_params['filter_lost_elements']

        self.bbox_params = (
            self.albu_builder(bbox_params) if bbox_params else None)
        self.aug = Compose([self.albu_builder(t) for t in self.transforms],
                           bbox_params=self.bbox_params)

        if not keymap:
            # Default mapping between mmdet result keys and albumentations'
            # expected input keys.
            self.keymap_to_albu = {
                'img': 'image',
                'gt_masks': 'masks',
                'gt_bboxes': 'bboxes'
            }
        else:
            self.keymap_to_albu = keymap
        # Inverse mapping used to restore the original keys after augment.
        self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()}

    def albu_builder(self, cfg: dict) -> albumentations:
        """Import a module from albumentations.

        It inherits some of :func:`build_from_cfg` logic.

        Args:
            cfg (dict): Config dict. It should at least contain the key "type".

        Returns:
            obj: The constructed object.
        """
        assert isinstance(cfg, dict) and 'type' in cfg
        args = cfg.copy()

        obj_type = args.pop('type')
        if is_str(obj_type):
            # Resolve the transform class by name from albumentations.
            if albumentations is None:
                raise RuntimeError('albumentations is not installed')
            obj_cls = getattr(albumentations, obj_type)
        elif inspect.isclass(obj_type):
            obj_cls = obj_type
        else:
            raise TypeError(
                f'type must be a str or valid type, but got {type(obj_type)}')

        # Recursively build nested transforms (e.g. for 'OneOf').
        if 'transforms' in args:
            args['transforms'] = [
                self.albu_builder(transform)
                for transform in args['transforms']
            ]

        return obj_cls(**args)

    @staticmethod
    def mapper(d: dict, keymap: dict) -> dict:
        """Dictionary mapper. Renames keys according to keymap provided.

        Args:
            d (dict): old dict
            keymap (dict): {'old_key':'new_key'}

        Returns:
            dict: new dict.
        """
        updated_dict = {}
        for k, v in zip(d.keys(), d.values()):
            # Keys absent from the keymap are passed through unchanged.
            new_k = keymap.get(k, k)
            updated_dict[new_k] = d[k]
        return updated_dict

    @autocast_box_type()
    def transform(self, results: dict) -> Union[dict, None]:
        """Transform function of Albu."""
        # TODO: gt_seg_map is not currently supported
        # dict to albumentations format
        results = self.mapper(results, self.keymap_to_albu)
        results, ori_masks = self._preprocess_results(results)
        results = self.aug(**results)
        results = self._postprocess_results(results, ori_masks)
        # ``_postprocess_results`` returns None when all annotations were
        # lost and ``skip_img_without_anno`` is set.
        if results is None:
            return None
        # back to the original format
        results = self.mapper(results, self.keymap_back)
        results['img_shape'] = results['img'].shape[:2]
        return results

    def _preprocess_results(self, results: dict) -> tuple:
        """Pre-processing results to facilitate the use of Albu."""
        if 'bboxes' in results:
            # to list of boxes
            if not isinstance(results['bboxes'], HorizontalBoxes):
                raise NotImplementedError(
                    'Albu only supports horizontal boxes now')
            bboxes = results['bboxes'].numpy()
            results['bboxes'] = [x for x in bboxes]
            # add pseudo-field for filtration
            if self.filter_lost_elements:
                results['idx_mapper'] = np.arange(len(results['bboxes']))

        # TODO: Support mask structure in albu
        ori_masks = None
        if 'masks' in results:
            if isinstance(results['masks'], PolygonMasks):
                raise NotImplementedError(
                    'Albu only supports BitMap masks now')
            ori_masks = results['masks']
            # albumentations changed its expected mask input between
            # versions: pass the raw ndarray stack for <0.5, a list of
            # per-instance arrays otherwise.
            if albumentations.__version__ < '0.5':
                results['masks'] = results['masks'].masks
            else:
                results['masks'] = [mask for mask in results['masks'].masks]

        return results, ori_masks

    def _postprocess_results(
            self,
            results: dict,
            ori_masks: Optional[Union[BitmapMasks,
                                      PolygonMasks]] = None) -> dict:
        """Post-processing Albu output."""
        # albumentations may return np.array or list on different versions
        if 'gt_bboxes_labels' in results and isinstance(
                results['gt_bboxes_labels'], list):
            results['gt_bboxes_labels'] = np.array(
                results['gt_bboxes_labels'], dtype=np.int64)
        if 'gt_ignore_flags' in results and isinstance(
                results['gt_ignore_flags'], list):
            results['gt_ignore_flags'] = np.array(
                results['gt_ignore_flags'], dtype=bool)

        if 'bboxes' in results:
            if isinstance(results['bboxes'], list):
                results['bboxes'] = np.array(
                    results['bboxes'], dtype=np.float32)
            results['bboxes'] = results['bboxes'].reshape(-1, 4)
            results['bboxes'] = HorizontalBoxes(results['bboxes'])

            # filter label_fields
            if self.filter_lost_elements:
                # 'idx_mapper' holds the surviving original indices; mirror
                # that selection onto every original label field and masks.
                for label in self.origin_label_fields:
                    results[label] = np.array(
                        [results[label][i] for i in results['idx_mapper']])
                if 'masks' in results:
                    assert ori_masks is not None
                    results['masks'] = np.array(
                        [results['masks'][i] for i in results['idx_mapper']])
                    results['masks'] = ori_masks.__class__(
                        results['masks'], ori_masks.height, ori_masks.width)

                # Every annotation was dropped by the augmentation.
                if (not len(results['idx_mapper'])
                        and self.skip_img_without_anno):
                    return None
        elif 'masks' in results:
            # No bboxes in the pipeline: restore the original mask container
            # type around the augmented masks.
            results['masks'] = ori_masks.__class__(results['masks'],
                                                   ori_masks.height,
                                                   ori_masks.width)

        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__ + f'(transforms={self.transforms})'
        return repr_str
@TRANSFORMS.register_module()
@avoid_cache_randomness
class RandomCenterCropPad(BaseTransform):
    """Random center crop and random around padding for CornerNet.

    This operation generates randomly cropped image from the original image and
    pads it simultaneously. Different from :class:`RandomCrop`, the output
    shape may not equal to ``crop_size`` strictly. We choose a random value
    from ``ratios`` and the output shape could be larger or smaller than
    ``crop_size``. The padding operation is also different from :class:`Pad`,
    here we use around padding instead of right-bottom padding.

    The relation between output image (padding image) and original image:

    .. code:: text

                        output image

               +----------------------------+
               |          padded area       |
        +------|----------------------------|----------+
        |      |         cropped area       |          |
        |      |         +---------------+  |          |
        |      |         |    .   center |  |          | original image
        |      |         |        range  |  |          |
        |      |         +---------------+  |          |
        +------|----------------------------|----------+
               |          padded area       |
               +----------------------------+

    There are 5 main areas in the figure:

    - output image: output image of this operation, also called padding
      image in following instruction.
    - original image: input image of this operation.
    - padded area: non-intersect area of output image and original image.
    - cropped area: the overlap of output image and original image.
    - center range: a smaller area where random center chosen from.
      center range is computed by ``border`` and original image's shape
      to avoid our random center is too close to original image's border.

    Also this operation act differently in train and test mode, the summary
    pipeline is listed below.

    Train pipeline:

    1. Choose a ``random_ratio`` from ``ratios``, the shape of padding image
       will be ``random_ratio * crop_size``.
    2. Choose a ``random_center`` in center range.
    3. Generate padding image with center matches the ``random_center``.
    4. Initialize the padding image with pixel value equals to ``mean``.
    5. Copy the cropped area to padding image.
    6. Refine annotations.

    Test pipeline:

    1. Compute output shape according to ``test_pad_mode``.
    2. Generate padding image with center matches the original image
       center.
    3. Initialize the padding image with pixel value equals to ``mean``.
    4. Copy the ``cropped area`` to padding image.

    Required Keys:

    - img (np.float32)
    - img_shape (tuple)
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)

    Modified Keys:

    - img (np.float32)
    - img_shape (tuple)
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)

    Args:
        crop_size (tuple, optional): expected size after crop, final size will
            computed according to ratio. Requires (width, height)
            in train mode, and None in test mode.
        ratios (tuple, optional): random select a ratio from tuple and crop
            image to (crop_size[0] * ratio) * (crop_size[1] * ratio).
            Only available in train mode. Defaults to (0.9, 1.0, 1.1).
        border (int, optional): max distance from center select area to image
            border. Only available in train mode. Defaults to 128.
        mean (sequence, optional): Mean values of 3 channels.
        std (sequence, optional): Std values of 3 channels.
        to_rgb (bool, optional): Whether to convert the image from BGR to RGB.
        test_mode (bool): whether involve random variables in transform.
            In train mode, crop_size is fixed, center coords and ratio is
            random selected from predefined lists. In test mode, crop_size
            is image's original shape, center coords and ratio is fixed.
            Defaults to False.
        test_pad_mode (tuple, optional): padding method and padding shape
            value, only available in test mode. Default is using
            'logical_or' with 127 as padding shape value.

            - 'logical_or': final_shape = input_shape | padding_shape_value
            - 'size_divisor': final_shape = int(
              ceil(input_shape / padding_shape_value) * padding_shape_value)

            Defaults to ('logical_or', 127).
        test_pad_add_pix (int): Extra padding pixel in test mode.
            Defaults to 0.
        bbox_clip_border (bool): Whether clip the objects outside
            the border of the image. Defaults to True.
    """

    def __init__(self,
                 crop_size: Optional[tuple] = None,
                 ratios: Optional[tuple] = (0.9, 1.0, 1.1),
                 border: Optional[int] = 128,
                 mean: Optional[Sequence] = None,
                 std: Optional[Sequence] = None,
                 to_rgb: Optional[bool] = None,
                 test_mode: bool = False,
                 test_pad_mode: Optional[tuple] = ('logical_or', 127),
                 test_pad_add_pix: int = 0,
                 bbox_clip_border: bool = True) -> None:
        # Train mode and test mode take mutually exclusive argument sets;
        # validate them up front so misconfiguration fails early.
        if test_mode:
            assert crop_size is None, 'crop_size must be None in test mode'
            assert ratios is None, 'ratios must be None in test mode'
            assert border is None, 'border must be None in test mode'
            assert isinstance(test_pad_mode, (list, tuple))
            assert test_pad_mode[0] in ['logical_or', 'size_divisor']
        else:
            assert isinstance(crop_size, (list, tuple))
            assert crop_size[0] > 0 and crop_size[1] > 0, (
                'crop_size must > 0 in train mode')
            assert isinstance(ratios, (list, tuple))
            assert test_pad_mode is None, (
                'test_pad_mode must be None in train mode')

        self.crop_size = crop_size
        self.ratios = ratios
        self.border = border
        # We do not set default value to mean, std and to_rgb because these
        # hyper-parameters are easy to forget but could affect the performance.
        # Please use the same setting as Normalize for performance assurance.
        assert mean is not None and std is not None and to_rgb is not None
        self.to_rgb = to_rgb
        self.input_mean = mean
        self.input_std = std
        if to_rgb:
            # Reverse the channel order so padded pixel values match the
            # normalization statistics after BGR -> RGB conversion.
            self.mean = mean[::-1]
            self.std = std[::-1]
        else:
            self.mean = mean
            self.std = std
        self.test_mode = test_mode
        self.test_pad_mode = test_pad_mode
        self.test_pad_add_pix = test_pad_add_pix
        self.bbox_clip_border = bbox_clip_border

    def _get_border(self, border, size):
        """Get final border for the target size.

        This function generates a ``final_border`` according to image's shape.
        The area between ``final_border`` and ``size - final_border`` is the
        ``center range``. We randomly choose center from the ``center range``
        to avoid our random center is too close to original image's border.
        Also ``center range`` should be larger than 0.

        Args:
            border (int): The initial border, default is 128.
            size (int): The width or height of original image.

        Returns:
            int: The final border.
        """
        # Shrink the border by a power of two when the image is too small,
        # so that the center range (border, size - border) stays non-empty.
        k = 2 * border / size
        i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k)))
        return border // i

    def _filter_boxes(self, patch, boxes):
        """Check whether the center of each box is in the patch.

        Args:
            patch (list[int]): The cropped area, [left, top, right, bottom].
            boxes (numpy array, (N x 4)): Ground truth boxes.

        Returns:
            mask (numpy array, (N,)): Each box is inside or outside the patch.
        """
        center = boxes.centers.numpy()
        # Element-wise multiplication of booleans acts as logical AND.
        mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * (
            center[:, 0] < patch[2]) * (
                center[:, 1] < patch[3])
        return mask

    def _crop_image_and_paste(self, image, center, size):
        """Crop image with a given center and size, then paste the cropped
        image to a blank image with two centers align.

        This function is equivalent to generating a blank image with ``size``
        as its shape. Then cover it on the original image with two centers (
        the center of blank image and the random center of original image)
        aligned. The overlap area is paste from the original image and the
        outside area is filled with ``mean pixel``.

        Args:
            image (np array, H x W x C): Original image.
            center (list[int]): Target crop center coord.
            size (list[int]): Target crop size. [target_h, target_w]

        Returns:
            cropped_img (np array, target_h x target_w x C): Cropped image.
            border (np array, 4): The distance of four border of
                ``cropped_img`` to the original image area, [top, bottom,
                left, right]
            patch (list[int]): The cropped area, [left, top, right, bottom].
        """
        center_y, center_x = center
        target_h, target_w = size
        img_h, img_w, img_c = image.shape

        # Overlap rectangle between the desired crop and the actual image.
        x0 = max(0, center_x - target_w // 2)
        x1 = min(center_x + target_w // 2, img_w)
        y0 = max(0, center_y - target_h // 2)
        y1 = min(center_y + target_h // 2, img_h)
        patch = np.array((int(x0), int(y0), int(x1), int(y1)))

        # Distances from the crop center to each edge of the overlap area.
        left, right = center_x - x0, x1 - center_x
        top, bottom = center_y - y0, y1 - center_y

        # Fill the output canvas with the per-channel mean, then paste the
        # overlap so both centers align.
        cropped_center_y, cropped_center_x = target_h // 2, target_w // 2
        cropped_img = np.zeros((target_h, target_w, img_c), dtype=image.dtype)
        for i in range(img_c):
            cropped_img[:, :, i] += self.mean[i]
        y_slice = slice(cropped_center_y - top, cropped_center_y + bottom)
        x_slice = slice(cropped_center_x - left, cropped_center_x + right)
        cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :]

        border = np.array([
            cropped_center_y - top, cropped_center_y + bottom,
            cropped_center_x - left, cropped_center_x + right
        ],
                          dtype=np.float32)

        return cropped_img, border, patch

    def _train_aug(self, results):
        """Random crop and around padding the original image.

        Args:
            results (dict): Image infomations in the augment pipeline.

        Returns:
            results (dict): The updated dict.
        """
        img = results['img']
        h, w, c = img.shape
        gt_bboxes = results['gt_bboxes']
        while True:
            scale = random.choice(self.ratios)
            new_h = int(self.crop_size[1] * scale)
            new_w = int(self.crop_size[0] * scale)
            h_border = self._get_border(self.border, h)
            w_border = self._get_border(self.border, w)

            # Retry up to 50 random centers for this scale before resampling.
            for i in range(50):
                center_x = random.randint(low=w_border, high=w - w_border)
                center_y = random.randint(low=h_border, high=h - h_border)

                cropped_img, border, patch = self._crop_image_and_paste(
                    img, [center_y, center_x], [new_h, new_w])

                if len(gt_bboxes) == 0:
                    results['img'] = cropped_img
                    results['img_shape'] = cropped_img.shape[:2]
                    return results

                # if image do not have valid bbox, any crop patch is valid.
                mask = self._filter_boxes(patch, gt_bboxes)
                if not mask.any():
                    continue

                results['img'] = cropped_img
                results['img_shape'] = cropped_img.shape[:2]

                x0, y0, x1, y1 = patch

                left_w, top_h = center_x - x0, center_y - y0
                cropped_center_x, cropped_center_y = new_w // 2, new_h // 2

                # crop bboxes accordingly and clip to the image boundary
                gt_bboxes = gt_bboxes[mask]
                gt_bboxes.translate_([
                    cropped_center_x - left_w - x0,
                    cropped_center_y - top_h - y0
                ])
                if self.bbox_clip_border:
                    gt_bboxes.clip_([new_h, new_w])
                keep = gt_bboxes.is_inside([new_h, new_w]).numpy()
                gt_bboxes = gt_bboxes[keep]

                results['gt_bboxes'] = gt_bboxes

                # ignore_flags
                if results.get('gt_ignore_flags', None) is not None:
                    gt_ignore_flags = results['gt_ignore_flags'][mask]
                    results['gt_ignore_flags'] = \
                        gt_ignore_flags[keep]

                # labels
                if results.get('gt_bboxes_labels', None) is not None:
                    gt_labels = results['gt_bboxes_labels'][mask]
                    results['gt_bboxes_labels'] = gt_labels[keep]

                if 'gt_masks' in results or 'gt_seg_map' in results:
                    raise NotImplementedError(
                        'RandomCenterCropPad only supports bbox.')

                return results

    def _test_aug(self, results):
        """Around padding the original image without cropping.

        The padding mode and value are from ``test_pad_mode``.

        Args:
            results (dict): Image infomations in the augment pipeline.

        Returns:
            results (dict): The updated dict.
        """
        img = results['img']
        h, w, c = img.shape
        if self.test_pad_mode[0] in ['logical_or']:
            # self.test_pad_add_pix is only used for centernet
            target_h = (h | self.test_pad_mode[1]) + self.test_pad_add_pix
            target_w = (w | self.test_pad_mode[1]) + self.test_pad_add_pix
        elif self.test_pad_mode[0] in ['size_divisor']:
            divisor = self.test_pad_mode[1]
            target_h = int(np.ceil(h / divisor)) * divisor
            target_w = int(np.ceil(w / divisor)) * divisor
        else:
            raise NotImplementedError(
                'RandomCenterCropPad only support two testing pad mode:'
                'logical-or and size_divisor.')

        cropped_img, border, _ = self._crop_image_and_paste(
            img, [h // 2, w // 2], [target_h, target_w])
        results['img'] = cropped_img
        results['img_shape'] = cropped_img.shape[:2]
        results['border'] = border
        return results

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        # Dispatch to the train or test pipeline after sanity checks.
        img = results['img']
        assert img.dtype == np.float32, (
            'RandomCenterCropPad needs the input image of dtype np.float32,'
            ' please set "to_float32=True" in "LoadImageFromFile" pipeline')
        h, w, c = img.shape
        assert c == len(self.mean)
        if self.test_mode:
            return self._test_aug(results)
        else:
            return self._train_aug(results)

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(crop_size={self.crop_size}, '
        repr_str += f'ratios={self.ratios}, '
        repr_str += f'border={self.border}, '
        repr_str += f'mean={self.input_mean}, '
        repr_str += f'std={self.input_std}, '
        repr_str += f'to_rgb={self.to_rgb}, '
        repr_str += f'test_mode={self.test_mode}, '
        repr_str += f'test_pad_mode={self.test_pad_mode}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str
  1797. @TRANSFORMS.register_module()
  1798. class CutOut(BaseTransform):
  1799. """CutOut operation.
  1800. Randomly drop some regions of image used in
  1801. `Cutout <https://arxiv.org/abs/1708.04552>`_.
  1802. Required Keys:
  1803. - img
  1804. Modified Keys:
  1805. - img
  1806. Args:
  1807. n_holes (int or tuple[int, int]): Number of regions to be dropped.
  1808. If it is given as a list, number of holes will be randomly
  1809. selected from the closed interval [``n_holes[0]``, ``n_holes[1]``].
  1810. cutout_shape (tuple[int, int] or list[tuple[int, int]], optional):
  1811. The candidate shape of dropped regions. It can be
  1812. ``tuple[int, int]`` to use a fixed cutout shape, or
  1813. ``list[tuple[int, int]]`` to randomly choose shape
  1814. from the list. Defaults to None.
  1815. cutout_ratio (tuple[float, float] or list[tuple[float, float]],
  1816. optional): The candidate ratio of dropped regions. It can be
  1817. ``tuple[float, float]`` to use a fixed ratio or
  1818. ``list[tuple[float, float]]`` to randomly choose ratio
  1819. from the list. Please note that ``cutout_shape`` and
  1820. ``cutout_ratio`` cannot be both given at the same time.
  1821. Defaults to None.
  1822. fill_in (tuple[float, float, float] or tuple[int, int, int]): The value
  1823. of pixel to fill in the dropped regions. Defaults to (0, 0, 0).
  1824. """
  1825. def __init__(
  1826. self,
  1827. n_holes: Union[int, Tuple[int, int]],
  1828. cutout_shape: Optional[Union[Tuple[int, int],
  1829. List[Tuple[int, int]]]] = None,
  1830. cutout_ratio: Optional[Union[Tuple[float, float],
  1831. List[Tuple[float, float]]]] = None,
  1832. fill_in: Union[Tuple[float, float, float], Tuple[int, int,
  1833. int]] = (0, 0, 0)
  1834. ) -> None:
  1835. assert (cutout_shape is None) ^ (cutout_ratio is None), \
  1836. 'Either cutout_shape or cutout_ratio should be specified.'
  1837. assert (isinstance(cutout_shape, (list, tuple))
  1838. or isinstance(cutout_ratio, (list, tuple)))
  1839. if isinstance(n_holes, tuple):
  1840. assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1]
  1841. else:
  1842. n_holes = (n_holes, n_holes)
  1843. self.n_holes = n_holes
  1844. self.fill_in = fill_in
  1845. self.with_ratio = cutout_ratio is not None
  1846. self.candidates = cutout_ratio if self.with_ratio else cutout_shape
  1847. if not isinstance(self.candidates, list):
  1848. self.candidates = [self.candidates]
  1849. @autocast_box_type()
  1850. def transform(self, results: dict) -> dict:
  1851. """Call function to drop some regions of image."""
  1852. h, w, c = results['img'].shape
  1853. n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1)
  1854. for _ in range(n_holes):
  1855. x1 = np.random.randint(0, w)
  1856. y1 = np.random.randint(0, h)
  1857. index = np.random.randint(0, len(self.candidates))
  1858. if not self.with_ratio:
  1859. cutout_w, cutout_h = self.candidates[index]
  1860. else:
  1861. cutout_w = int(self.candidates[index][0] * w)
  1862. cutout_h = int(self.candidates[index][1] * h)
  1863. x2 = np.clip(x1 + cutout_w, 0, w)
  1864. y2 = np.clip(y1 + cutout_h, 0, h)
  1865. results['img'][y1:y2, x1:x2, :] = self.fill_in
  1866. return results
  1867. def __repr__(self):
  1868. repr_str = self.__class__.__name__
  1869. repr_str += f'(n_holes={self.n_holes}, '
  1870. repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio
  1871. else f'cutout_shape={self.candidates}, ')
  1872. repr_str += f'fill_in={self.fill_in})'
  1873. return repr_str
  1874. @TRANSFORMS.register_module()
  1875. class Mosaic(BaseTransform):
  1876. """Mosaic augmentation.
  1877. Given 4 images, mosaic transform combines them into
  1878. one output image. The output image is composed of the parts from each sub-
  1879. image.
  1880. .. code:: text
  1881. mosaic transform
  1882. center_x
  1883. +------------------------------+
  1884. | pad | pad |
  1885. | +-----------+ |
  1886. | | | |
  1887. | | image1 |--------+ |
  1888. | | | | |
  1889. | | | image2 | |
  1890. center_y |----+-------------+-----------|
  1891. | | cropped | |
  1892. |pad | image3 | image4 |
  1893. | | | |
  1894. +----|-------------+-----------+
  1895. | |
  1896. +-------------+
  1897. The mosaic transform steps are as follows:
  1898. 1. Choose the mosaic center as the intersections of 4 images
  1899. 2. Get the left top image according to the index, and randomly
  1900. sample another 3 images from the custom dataset.
  1901. 3. Sub image will be cropped if image is larger than mosaic patch
  1902. Required Keys:
  1903. - img
  1904. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  1905. - gt_bboxes_labels (np.int64) (optional)
  1906. - gt_ignore_flags (bool) (optional)
  1907. - mix_results (List[dict])
  1908. Modified Keys:
  1909. - img
  1910. - img_shape
  1911. - gt_bboxes (optional)
  1912. - gt_bboxes_labels (optional)
  1913. - gt_ignore_flags (optional)
  1914. Args:
  1915. img_scale (Sequence[int]): Image size before mosaic pipeline of single
  1916. image. The shape order should be (width, height).
  1917. Defaults to (640, 640).
  1918. center_ratio_range (Sequence[float]): Center ratio range of mosaic
  1919. output. Defaults to (0.5, 1.5).
  1920. bbox_clip_border (bool, optional): Whether to clip the objects outside
  1921. the border of the image. In some dataset like MOT17, the gt bboxes
  1922. are allowed to cross the border of images. Therefore, we don't
  1923. need to clip the gt bboxes in these cases. Defaults to True.
  1924. pad_val (int): Pad value. Defaults to 114.
  1925. prob (float): Probability of applying this transformation.
  1926. Defaults to 1.0.
  1927. """
  1928. def __init__(self,
  1929. img_scale: Tuple[int, int] = (640, 640),
  1930. center_ratio_range: Tuple[float, float] = (0.5, 1.5),
  1931. bbox_clip_border: bool = True,
  1932. pad_val: float = 114.0,
  1933. prob: float = 1.0) -> None:
  1934. assert isinstance(img_scale, tuple)
  1935. assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
  1936. f'got {prob}.'
  1937. log_img_scale(img_scale, skip_square=True, shape_order='wh')
  1938. self.img_scale = img_scale
  1939. self.center_ratio_range = center_ratio_range
  1940. self.bbox_clip_border = bbox_clip_border
  1941. self.pad_val = pad_val
  1942. self.prob = prob
  1943. @cache_randomness
  1944. def get_indexes(self, dataset: BaseDataset) -> int:
  1945. """Call function to collect indexes.
  1946. Args:
  1947. dataset (:obj:`MultiImageMixDataset`): The dataset.
  1948. Returns:
  1949. list: indexes.
  1950. """
  1951. indexes = [random.randint(0, len(dataset)) for _ in range(3)]
  1952. return indexes
  1953. @autocast_box_type()
  1954. def transform(self, results: dict) -> dict:
  1955. """Mosaic transform function.
  1956. Args:
  1957. results (dict): Result dict.
  1958. Returns:
  1959. dict: Updated result dict.
  1960. """
  1961. if random.uniform(0, 1) > self.prob:
  1962. return results
  1963. assert 'mix_results' in results
  1964. mosaic_bboxes = []
  1965. mosaic_bboxes_labels = []
  1966. mosaic_ignore_flags = []
  1967. if len(results['img'].shape) == 3:
  1968. mosaic_img = np.full(
  1969. (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3),
  1970. self.pad_val,
  1971. dtype=results['img'].dtype)
  1972. else:
  1973. mosaic_img = np.full(
  1974. (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)),
  1975. self.pad_val,
  1976. dtype=results['img'].dtype)
  1977. # mosaic center x, y
  1978. center_x = int(
  1979. random.uniform(*self.center_ratio_range) * self.img_scale[0])
  1980. center_y = int(
  1981. random.uniform(*self.center_ratio_range) * self.img_scale[1])
  1982. center_position = (center_x, center_y)
  1983. loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
  1984. for i, loc in enumerate(loc_strs):
  1985. if loc == 'top_left':
  1986. results_patch = copy.deepcopy(results)
  1987. else:
  1988. results_patch = copy.deepcopy(results['mix_results'][i - 1])
  1989. img_i = results_patch['img']
  1990. h_i, w_i = img_i.shape[:2]
  1991. # keep_ratio resize
  1992. scale_ratio_i = min(self.img_scale[1] / h_i,
  1993. self.img_scale[0] / w_i)
  1994. img_i = mmcv.imresize(
  1995. img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
  1996. # compute the combine parameters
  1997. paste_coord, crop_coord = self._mosaic_combine(
  1998. loc, center_position, img_i.shape[:2][::-1])
  1999. x1_p, y1_p, x2_p, y2_p = paste_coord
  2000. x1_c, y1_c, x2_c, y2_c = crop_coord
  2001. # crop and paste image
  2002. mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
  2003. # adjust coordinate
  2004. gt_bboxes_i = results_patch['gt_bboxes']
  2005. gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
  2006. gt_ignore_flags_i = results_patch['gt_ignore_flags']
  2007. padw = x1_p - x1_c
  2008. padh = y1_p - y1_c
  2009. gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
  2010. gt_bboxes_i.translate_([padw, padh])
  2011. mosaic_bboxes.append(gt_bboxes_i)
  2012. mosaic_bboxes_labels.append(gt_bboxes_labels_i)
  2013. mosaic_ignore_flags.append(gt_ignore_flags_i)
  2014. mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
  2015. mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
  2016. mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)
  2017. if self.bbox_clip_border:
  2018. mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]])
  2019. # remove outside bboxes
  2020. inside_inds = mosaic_bboxes.is_inside(
  2021. [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy()
  2022. mosaic_bboxes = mosaic_bboxes[inside_inds]
  2023. mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
  2024. mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
  2025. results['img'] = mosaic_img
  2026. results['img_shape'] = mosaic_img.shape[:2]
  2027. results['gt_bboxes'] = mosaic_bboxes
  2028. results['gt_bboxes_labels'] = mosaic_bboxes_labels
  2029. results['gt_ignore_flags'] = mosaic_ignore_flags
  2030. return results
  2031. def _mosaic_combine(
  2032. self, loc: str, center_position_xy: Sequence[float],
  2033. img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]:
  2034. """Calculate global coordinate of mosaic image and local coordinate of
  2035. cropped sub-image.
  2036. Args:
  2037. loc (str): Index for the sub-image, loc in ('top_left',
  2038. 'top_right', 'bottom_left', 'bottom_right').
  2039. center_position_xy (Sequence[float]): Mixing center for 4 images,
  2040. (x, y).
  2041. img_shape_wh (Sequence[int]): Width and height of sub-image
  2042. Returns:
  2043. tuple[tuple[float]]: Corresponding coordinate of pasting and
  2044. cropping
  2045. - paste_coord (tuple): paste corner coordinate in mosaic image.
  2046. - crop_coord (tuple): crop corner coordinate in mosaic image.
  2047. """
  2048. assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')
  2049. if loc == 'top_left':
  2050. # index0 to top left part of image
  2051. x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
  2052. max(center_position_xy[1] - img_shape_wh[1], 0), \
  2053. center_position_xy[0], \
  2054. center_position_xy[1]
  2055. crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - (
  2056. y2 - y1), img_shape_wh[0], img_shape_wh[1]
  2057. elif loc == 'top_right':
  2058. # index1 to top right part of image
  2059. x1, y1, x2, y2 = center_position_xy[0], \
  2060. max(center_position_xy[1] - img_shape_wh[1], 0), \
  2061. min(center_position_xy[0] + img_shape_wh[0],
  2062. self.img_scale[0] * 2), \
  2063. center_position_xy[1]
  2064. crop_coord = 0, img_shape_wh[1] - (y2 - y1), min(
  2065. img_shape_wh[0], x2 - x1), img_shape_wh[1]
  2066. elif loc == 'bottom_left':
  2067. # index2 to bottom left part of image
  2068. x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
  2069. center_position_xy[1], \
  2070. center_position_xy[0], \
  2071. min(self.img_scale[1] * 2, center_position_xy[1] +
  2072. img_shape_wh[1])
  2073. crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min(
  2074. y2 - y1, img_shape_wh[1])
  2075. else:
  2076. # index3 to bottom right part of image
  2077. x1, y1, x2, y2 = center_position_xy[0], \
  2078. center_position_xy[1], \
  2079. min(center_position_xy[0] + img_shape_wh[0],
  2080. self.img_scale[0] * 2), \
  2081. min(self.img_scale[1] * 2, center_position_xy[1] +
  2082. img_shape_wh[1])
  2083. crop_coord = 0, 0, min(img_shape_wh[0],
  2084. x2 - x1), min(y2 - y1, img_shape_wh[1])
  2085. paste_coord = x1, y1, x2, y2
  2086. return paste_coord, crop_coord
  2087. def __repr__(self):
  2088. repr_str = self.__class__.__name__
  2089. repr_str += f'(img_scale={self.img_scale}, '
  2090. repr_str += f'center_ratio_range={self.center_ratio_range}, '
  2091. repr_str += f'pad_val={self.pad_val}, '
  2092. repr_str += f'prob={self.prob})'
  2093. return repr_str
@TRANSFORMS.register_module()
class MixUp(BaseTransform):
    """MixUp data augmentation.

    .. code:: text

                    mixup transform
           +------------------------------+
           | mixup image   |              |
           |      +--------|--------+     |
           |      |        |        |     |
           |---------------+        |     |
           |      |                 |     |
           |      |      image      |     |
           |      |                 |     |
           |      |                 |     |
           |      |-----------------+     |
           |             pad              |
           +------------------------------+

    The mixup transform steps are as follows:

        1. Another random image is picked by dataset and embedded in
           the top left patch(after padding and resizing)
        2. The target of mixup transform is the weighted average of mixup
           image and origin image.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)
    - mix_results (List[dict])

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_ignore_flags (optional)

    Args:
        img_scale (Sequence[int]): Image output size after mixup pipeline.
            The shape order should be (width, height). Defaults to (640, 640).
        ratio_range (Sequence[float]): Scale ratio of mixup image.
            Defaults to (0.5, 1.5).
        flip_ratio (float): Horizontal flip ratio of mixup image.
            Defaults to 0.5.
        pad_val (int): Pad value. Defaults to 114.
        max_iters (int): The maximum number of iterations. If the number of
            iterations is greater than `max_iters`, but gt_bbox is still
            empty, then the iteration is terminated. Defaults to 15.
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. In some dataset like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
    """

    def __init__(self,
                 img_scale: Tuple[int, int] = (640, 640),
                 ratio_range: Tuple[float, float] = (0.5, 1.5),
                 flip_ratio: float = 0.5,
                 pad_val: float = 114.0,
                 max_iters: int = 15,
                 bbox_clip_border: bool = True) -> None:
        assert isinstance(img_scale, tuple)
        log_img_scale(img_scale, skip_square=True, shape_order='wh')
        # (width, height) canvas the retrieved image is first resized into.
        self.dynamic_scale = img_scale
        self.ratio_range = ratio_range
        self.flip_ratio = flip_ratio
        self.pad_val = pad_val
        self.max_iters = max_iters
        self.bbox_clip_border = bbox_clip_border

    @cache_randomness
    def get_indexes(self, dataset: BaseDataset) -> int:
        """Call function to collect indexes.

        Keeps re-sampling until an index with at least one gt bbox is
        found, giving up after ``max_iters`` attempts.

        Args:
            dataset (:obj:`MultiImageMixDataset`): The dataset.

        Returns:
            list: indexes.
        """
        # NOTE(review): assumes ``random`` is ``numpy.random`` (half-open
        # ``randint``) -- with the stdlib module the upper bound would be
        # inclusive; confirm the file-level import.
        for i in range(self.max_iters):
            index = random.randint(0, len(dataset))
            gt_bboxes_i = dataset[index]['gt_bboxes']
            if len(gt_bboxes_i) != 0:
                break
        return index

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """MixUp transform function.

        Args:
            results (dict): Result dict.

        Returns:
            dict: Updated result dict.
        """
        assert 'mix_results' in results
        assert len(
            results['mix_results']) == 1, 'MixUp only support 2 images now !'

        if results['mix_results'][0]['gt_bboxes'].shape[0] == 0:
            # empty bbox
            return results

        retrieve_results = results['mix_results'][0]
        retrieve_img = retrieve_results['img']

        jit_factor = random.uniform(*self.ratio_range)
        # NOTE(review): flips when the sample EXCEEDS ``flip_ratio``, i.e.
        # with probability (1 - flip_ratio) -- confirm this is intended.
        is_filp = random.uniform(0, 1) > self.flip_ratio

        if len(retrieve_img.shape) == 3:
            out_img = np.ones(
                (self.dynamic_scale[1], self.dynamic_scale[0], 3),
                dtype=retrieve_img.dtype) * self.pad_val
        else:
            out_img = np.ones(
                self.dynamic_scale[::-1],
                dtype=retrieve_img.dtype) * self.pad_val

        # 1. keep_ratio resize
        scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0],
                          self.dynamic_scale[0] / retrieve_img.shape[1])
        retrieve_img = mmcv.imresize(
            retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
                           int(retrieve_img.shape[0] * scale_ratio)))

        # 2. paste the resized image into the top-left of the pad canvas
        out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img

        # 3. scale jit -- the cumulative ratio also applies to the bboxes
        scale_ratio *= jit_factor
        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
                                          int(out_img.shape[0] * jit_factor)))

        # 4. flip
        if is_filp:
            out_img = out_img[:, ::-1, :]

        # 5. random crop to the destination image size
        ori_img = results['img']
        origin_h, origin_w = out_img.shape[:2]
        target_h, target_w = ori_img.shape[:2]
        # NOTE(review): the pad buffer is always 3-channel uint8; grayscale
        # input would not broadcast here -- confirm inputs are color images.
        padded_img = np.ones((max(origin_h, target_h), max(
            origin_w, target_w), 3)) * self.pad_val
        padded_img = padded_img.astype(np.uint8)
        padded_img[:origin_h, :origin_w] = out_img

        x_offset, y_offset = 0, 0
        if padded_img.shape[0] > target_h:
            y_offset = random.randint(0, padded_img.shape[0] - target_h)
        if padded_img.shape[1] > target_w:
            x_offset = random.randint(0, padded_img.shape[1] - target_w)
        padded_cropped_img = padded_img[y_offset:y_offset + target_h,
                                        x_offset:x_offset + target_w]

        # 6. adjust bbox to the jittered/flipped mixup image coords
        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
        retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
        if self.bbox_clip_border:
            retrieve_gt_bboxes.clip_([origin_h, origin_w])

        if is_filp:
            retrieve_gt_bboxes.flip_([origin_h, origin_w],
                                     direction='horizontal')

        # 7. filter: shift boxes into the cropped window
        cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
        cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
        if self.bbox_clip_border:
            cp_retrieve_gt_bboxes.clip_([target_h, target_w])

        # 8. mix up: equal-weight average of the two images
        ori_img = ori_img.astype(np.float32)
        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32)

        retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
        retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']

        mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
            (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
        mixup_gt_bboxes_labels = np.concatenate(
            (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
        mixup_gt_ignore_flags = np.concatenate(
            (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)

        # remove outside bbox
        inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy()
        mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
        mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
        mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]

        results['img'] = mixup_img.astype(np.uint8)
        results['img_shape'] = mixup_img.shape[:2]
        results['gt_bboxes'] = mixup_gt_bboxes
        results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
        results['gt_ignore_flags'] = mixup_gt_ignore_flags
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(dynamic_scale={self.dynamic_scale}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'flip_ratio={self.flip_ratio}, '
        repr_str += f'pad_val={self.pad_val}, '
        repr_str += f'max_iters={self.max_iters}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str
  2273. @TRANSFORMS.register_module()
  2274. class RandomAffine(BaseTransform):
  2275. """Random affine transform data augmentation.
  2276. This operation randomly generates affine transform matrix which including
  2277. rotation, translation, shear and scaling transforms.
  2278. Required Keys:
  2279. - img
  2280. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  2281. - gt_bboxes_labels (np.int64) (optional)
  2282. - gt_ignore_flags (bool) (optional)
  2283. Modified Keys:
  2284. - img
  2285. - img_shape
  2286. - gt_bboxes (optional)
  2287. - gt_bboxes_labels (optional)
  2288. - gt_ignore_flags (optional)
  2289. Args:
  2290. max_rotate_degree (float): Maximum degrees of rotation transform.
  2291. Defaults to 10.
  2292. max_translate_ratio (float): Maximum ratio of translation.
  2293. Defaults to 0.1.
  2294. scaling_ratio_range (tuple[float]): Min and max ratio of
  2295. scaling transform. Defaults to (0.5, 1.5).
  2296. max_shear_degree (float): Maximum degrees of shear
  2297. transform. Defaults to 2.
  2298. border (tuple[int]): Distance from width and height sides of input
  2299. image to adjust output shape. Only used in mosaic dataset.
  2300. Defaults to (0, 0).
  2301. border_val (tuple[int]): Border padding values of 3 channels.
  2302. Defaults to (114, 114, 114).
  2303. bbox_clip_border (bool, optional): Whether to clip the objects outside
  2304. the border of the image. In some dataset like MOT17, the gt bboxes
  2305. are allowed to cross the border of images. Therefore, we don't
  2306. need to clip the gt bboxes in these cases. Defaults to True.
  2307. """
  2308. def __init__(self,
  2309. max_rotate_degree: float = 10.0,
  2310. max_translate_ratio: float = 0.1,
  2311. scaling_ratio_range: Tuple[float, float] = (0.5, 1.5),
  2312. max_shear_degree: float = 2.0,
  2313. border: Tuple[int, int] = (0, 0),
  2314. border_val: Tuple[int, int, int] = (114, 114, 114),
  2315. bbox_clip_border: bool = True) -> None:
  2316. assert 0 <= max_translate_ratio <= 1
  2317. assert scaling_ratio_range[0] <= scaling_ratio_range[1]
  2318. assert scaling_ratio_range[0] > 0
  2319. self.max_rotate_degree = max_rotate_degree
  2320. self.max_translate_ratio = max_translate_ratio
  2321. self.scaling_ratio_range = scaling_ratio_range
  2322. self.max_shear_degree = max_shear_degree
  2323. self.border = border
  2324. self.border_val = border_val
  2325. self.bbox_clip_border = bbox_clip_border
  2326. @cache_randomness
  2327. def _get_random_homography_matrix(self, height, width):
  2328. # Rotation
  2329. rotation_degree = random.uniform(-self.max_rotate_degree,
  2330. self.max_rotate_degree)
  2331. rotation_matrix = self._get_rotation_matrix(rotation_degree)
  2332. # Scaling
  2333. scaling_ratio = random.uniform(self.scaling_ratio_range[0],
  2334. self.scaling_ratio_range[1])
  2335. scaling_matrix = self._get_scaling_matrix(scaling_ratio)
  2336. # Shear
  2337. x_degree = random.uniform(-self.max_shear_degree,
  2338. self.max_shear_degree)
  2339. y_degree = random.uniform(-self.max_shear_degree,
  2340. self.max_shear_degree)
  2341. shear_matrix = self._get_shear_matrix(x_degree, y_degree)
  2342. # Translation
  2343. trans_x = random.uniform(-self.max_translate_ratio,
  2344. self.max_translate_ratio) * width
  2345. trans_y = random.uniform(-self.max_translate_ratio,
  2346. self.max_translate_ratio) * height
  2347. translate_matrix = self._get_translation_matrix(trans_x, trans_y)
  2348. warp_matrix = (
  2349. translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix)
  2350. return warp_matrix
  2351. @autocast_box_type()
  2352. def transform(self, results: dict) -> dict:
  2353. img = results['img']
  2354. height = img.shape[0] + self.border[1] * 2
  2355. width = img.shape[1] + self.border[0] * 2
  2356. warp_matrix = self._get_random_homography_matrix(height, width)
  2357. img = cv2.warpPerspective(
  2358. img,
  2359. warp_matrix,
  2360. dsize=(width, height),
  2361. borderValue=self.border_val)
  2362. results['img'] = img
  2363. results['img_shape'] = img.shape[:2]
  2364. bboxes = results['gt_bboxes']
  2365. num_bboxes = len(bboxes)
  2366. if num_bboxes:
  2367. bboxes.project_(warp_matrix)
  2368. if self.bbox_clip_border:
  2369. bboxes.clip_([height, width])
  2370. # remove outside bbox
  2371. valid_index = bboxes.is_inside([height, width]).numpy()
  2372. results['gt_bboxes'] = bboxes[valid_index]
  2373. results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
  2374. valid_index]
  2375. results['gt_ignore_flags'] = results['gt_ignore_flags'][
  2376. valid_index]
  2377. if 'gt_masks' in results:
  2378. raise NotImplementedError('RandomAffine only supports bbox.')
  2379. return results
  2380. def __repr__(self):
  2381. repr_str = self.__class__.__name__
  2382. repr_str += f'(max_rotate_degree={self.max_rotate_degree}, '
  2383. repr_str += f'max_translate_ratio={self.max_translate_ratio}, '
  2384. repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, '
  2385. repr_str += f'max_shear_degree={self.max_shear_degree}, '
  2386. repr_str += f'border={self.border}, '
  2387. repr_str += f'border_val={self.border_val}, '
  2388. repr_str += f'bbox_clip_border={self.bbox_clip_border})'
  2389. return repr_str
  2390. @staticmethod
  2391. def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray:
  2392. radian = math.radians(rotate_degrees)
  2393. rotation_matrix = np.array(
  2394. [[np.cos(radian), -np.sin(radian), 0.],
  2395. [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]],
  2396. dtype=np.float32)
  2397. return rotation_matrix
  2398. @staticmethod
  2399. def _get_scaling_matrix(scale_ratio: float) -> np.ndarray:
  2400. scaling_matrix = np.array(
  2401. [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
  2402. dtype=np.float32)
  2403. return scaling_matrix
  2404. @staticmethod
  2405. def _get_shear_matrix(x_shear_degrees: float,
  2406. y_shear_degrees: float) -> np.ndarray:
  2407. x_radian = math.radians(x_shear_degrees)
  2408. y_radian = math.radians(y_shear_degrees)
  2409. shear_matrix = np.array([[1, np.tan(x_radian), 0.],
  2410. [np.tan(y_radian), 1, 0.], [0., 0., 1.]],
  2411. dtype=np.float32)
  2412. return shear_matrix
  2413. @staticmethod
  2414. def _get_translation_matrix(x: float, y: float) -> np.ndarray:
  2415. translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
  2416. dtype=np.float32)
  2417. return translation_matrix
  2418. @TRANSFORMS.register_module()
  2419. class YOLOXHSVRandomAug(BaseTransform):
  2420. """Apply HSV augmentation to image sequentially. It is referenced from
  2421. https://github.com/Megvii-
  2422. BaseDetection/YOLOX/blob/main/yolox/data/data_augment.py#L21.
  2423. Required Keys:
  2424. - img
  2425. Modified Keys:
  2426. - img
  2427. Args:
  2428. hue_delta (int): delta of hue. Defaults to 5.
  2429. saturation_delta (int): delta of saturation. Defaults to 30.
  2430. value_delta (int): delat of value. Defaults to 30.
  2431. """
  2432. def __init__(self,
  2433. hue_delta: int = 5,
  2434. saturation_delta: int = 30,
  2435. value_delta: int = 30) -> None:
  2436. self.hue_delta = hue_delta
  2437. self.saturation_delta = saturation_delta
  2438. self.value_delta = value_delta
  2439. @cache_randomness
  2440. def _get_hsv_gains(self):
  2441. hsv_gains = np.random.uniform(-1, 1, 3) * [
  2442. self.hue_delta, self.saturation_delta, self.value_delta
  2443. ]
  2444. # random selection of h, s, v
  2445. hsv_gains *= np.random.randint(0, 2, 3)
  2446. # prevent overflow
  2447. hsv_gains = hsv_gains.astype(np.int16)
  2448. return hsv_gains
  2449. def transform(self, results: dict) -> dict:
  2450. img = results['img']
  2451. hsv_gains = self._get_hsv_gains()
  2452. img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16)
  2453. img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180
  2454. img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255)
  2455. img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255)
  2456. cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img)
  2457. results['img'] = img
  2458. return results
  2459. def __repr__(self):
  2460. repr_str = self.__class__.__name__
  2461. repr_str += f'(hue_delta={self.hue_delta}, '
  2462. repr_str += f'saturation_delta={self.saturation_delta}, '
  2463. repr_str += f'value_delta={self.value_delta})'
  2464. return repr_str
  2465. @TRANSFORMS.register_module()
  2466. class CopyPaste(BaseTransform):
  2467. """Simple Copy-Paste is a Strong Data Augmentation Method for Instance
  2468. Segmentation The simple copy-paste transform steps are as follows:
  2469. 1. The destination image is already resized with aspect ratio kept,
  2470. cropped and padded.
  2471. 2. Randomly select a source image, which is also already resized
  2472. with aspect ratio kept, cropped and padded in a similar way
  2473. as the destination image.
  2474. 3. Randomly select some objects from the source image.
  2475. 4. Paste these source objects to the destination image directly,
  2476. due to the source and destination image have the same size.
  2477. 5. Update object masks of the destination image, for some origin objects
  2478. may be occluded.
  2479. 6. Generate bboxes from the updated destination masks and
  2480. filter some objects which are totally occluded, and adjust bboxes
  2481. which are partly occluded.
  2482. 7. Append selected source bboxes, masks, and labels.
  2483. Required Keys:
  2484. - img
  2485. - gt_bboxes (BaseBoxes[torch.float32]) (optional)
  2486. - gt_bboxes_labels (np.int64) (optional)
  2487. - gt_ignore_flags (bool) (optional)
  2488. - gt_masks (BitmapMasks) (optional)
  2489. Modified Keys:
  2490. - img
  2491. - gt_bboxes (optional)
  2492. - gt_bboxes_labels (optional)
  2493. - gt_ignore_flags (optional)
  2494. - gt_masks (optional)
  2495. Args:
  2496. max_num_pasted (int): The maximum number of pasted objects.
  2497. Defaults to 100.
  2498. bbox_occluded_thr (int): The threshold of occluded bbox.
  2499. Defaults to 10.
  2500. mask_occluded_thr (int): The threshold of occluded mask.
  2501. Defaults to 300.
  2502. selected (bool): Whether select objects or not. If select is False,
  2503. all objects of the source image will be pasted to the
  2504. destination image.
  2505. Defaults to True.
  2506. paste_by_box (bool): Whether use boxes as masks when masks are not
  2507. available.
  2508. Defaults to False.
  2509. """
  2510. def __init__(
  2511. self,
  2512. max_num_pasted: int = 100,
  2513. bbox_occluded_thr: int = 10,
  2514. mask_occluded_thr: int = 300,
  2515. selected: bool = True,
  2516. paste_by_box: bool = False,
  2517. ) -> None:
  2518. self.max_num_pasted = max_num_pasted
  2519. self.bbox_occluded_thr = bbox_occluded_thr
  2520. self.mask_occluded_thr = mask_occluded_thr
  2521. self.selected = selected
  2522. self.paste_by_box = paste_by_box
  2523. @cache_randomness
  2524. def get_indexes(self, dataset: BaseDataset) -> int:
  2525. """Call function to collect indexes.s.
  2526. Args:
  2527. dataset (:obj:`MultiImageMixDataset`): The dataset.
  2528. Returns:
  2529. list: Indexes.
  2530. """
  2531. return random.randint(0, len(dataset))
  2532. @autocast_box_type()
  2533. def transform(self, results: dict) -> dict:
  2534. """Transform function to make a copy-paste of image.
  2535. Args:
  2536. results (dict): Result dict.
  2537. Returns:
  2538. dict: Result dict with copy-paste transformed.
  2539. """
  2540. assert 'mix_results' in results
  2541. num_images = len(results['mix_results'])
  2542. assert num_images == 1, \
  2543. f'CopyPaste only supports processing 2 images, got {num_images}'
  2544. if self.selected:
  2545. selected_results = self._select_object(results['mix_results'][0])
  2546. else:
  2547. selected_results = results['mix_results'][0]
  2548. return self._copy_paste(results, selected_results)
  2549. @cache_randomness
  2550. def _get_selected_inds(self, num_bboxes: int) -> np.ndarray:
  2551. max_num_pasted = min(num_bboxes + 1, self.max_num_pasted)
  2552. num_pasted = np.random.randint(0, max_num_pasted)
  2553. return np.random.choice(num_bboxes, size=num_pasted, replace=False)
  2554. def get_gt_masks(self, results: dict) -> BitmapMasks:
  2555. """Get gt_masks originally or generated based on bboxes.
  2556. If gt_masks is not contained in results,
  2557. it will be generated based on gt_bboxes.
  2558. Args:
  2559. results (dict): Result dict.
  2560. Returns:
  2561. BitmapMasks: gt_masks, originally or generated based on bboxes.
  2562. """
  2563. if results.get('gt_masks', None) is not None:
  2564. if self.paste_by_box:
  2565. warnings.warn('gt_masks is already contained in results, '
  2566. 'so paste_by_box is disabled.')
  2567. return results['gt_masks']
  2568. else:
  2569. if not self.paste_by_box:
  2570. raise RuntimeError('results does not contain masks.')
  2571. return results['gt_bboxes'].create_masks(results['img'].shape[:2])
  2572. def _select_object(self, results: dict) -> dict:
  2573. """Select some objects from the source results."""
  2574. bboxes = results['gt_bboxes']
  2575. labels = results['gt_bboxes_labels']
  2576. masks = self.get_gt_masks(results)
  2577. ignore_flags = results['gt_ignore_flags']
  2578. selected_inds = self._get_selected_inds(bboxes.shape[0])
  2579. selected_bboxes = bboxes[selected_inds]
  2580. selected_labels = labels[selected_inds]
  2581. selected_masks = masks[selected_inds]
  2582. selected_ignore_flags = ignore_flags[selected_inds]
  2583. results['gt_bboxes'] = selected_bboxes
  2584. results['gt_bboxes_labels'] = selected_labels
  2585. results['gt_masks'] = selected_masks
  2586. results['gt_ignore_flags'] = selected_ignore_flags
  2587. return results
  2588. def _copy_paste(self, dst_results: dict, src_results: dict) -> dict:
  2589. """CopyPaste transform function.
  2590. Args:
  2591. dst_results (dict): Result dict of the destination image.
  2592. src_results (dict): Result dict of the source image.
  2593. Returns:
  2594. dict: Updated result dict.
  2595. """
  2596. dst_img = dst_results['img']
  2597. dst_bboxes = dst_results['gt_bboxes']
  2598. dst_labels = dst_results['gt_bboxes_labels']
  2599. dst_masks = self.get_gt_masks(dst_results)
  2600. dst_ignore_flags = dst_results['gt_ignore_flags']
  2601. src_img = src_results['img']
  2602. src_bboxes = src_results['gt_bboxes']
  2603. src_labels = src_results['gt_bboxes_labels']
  2604. src_masks = src_results['gt_masks']
  2605. src_ignore_flags = src_results['gt_ignore_flags']
  2606. if len(src_bboxes) == 0:
  2607. return dst_results
  2608. # update masks and generate bboxes from updated masks
  2609. composed_mask = np.where(np.any(src_masks.masks, axis=0), 1, 0)
  2610. updated_dst_masks = self._get_updated_masks(dst_masks, composed_mask)
  2611. updated_dst_bboxes = updated_dst_masks.get_bboxes(type(dst_bboxes))
  2612. assert len(updated_dst_bboxes) == len(updated_dst_masks)
  2613. # filter totally occluded objects
  2614. l1_distance = (updated_dst_bboxes.tensor - dst_bboxes.tensor).abs()
  2615. bboxes_inds = (l1_distance <= self.bbox_occluded_thr).all(
  2616. dim=-1).numpy()
  2617. masks_inds = updated_dst_masks.masks.sum(
  2618. axis=(1, 2)) > self.mask_occluded_thr
  2619. valid_inds = bboxes_inds | masks_inds
  2620. # Paste source objects to destination image directly
  2621. img = dst_img * (1 - composed_mask[..., np.newaxis]
  2622. ) + src_img * composed_mask[..., np.newaxis]
  2623. bboxes = src_bboxes.cat([updated_dst_bboxes[valid_inds], src_bboxes])
  2624. labels = np.concatenate([dst_labels[valid_inds], src_labels])
  2625. masks = np.concatenate(
  2626. [updated_dst_masks.masks[valid_inds], src_masks.masks])
  2627. ignore_flags = np.concatenate(
  2628. [dst_ignore_flags[valid_inds], src_ignore_flags])
  2629. dst_results['img'] = img
  2630. dst_results['gt_bboxes'] = bboxes
  2631. dst_results['gt_bboxes_labels'] = labels
  2632. dst_results['gt_masks'] = BitmapMasks(masks, masks.shape[1],
  2633. masks.shape[2])
  2634. dst_results['gt_ignore_flags'] = ignore_flags
  2635. return dst_results
  2636. def _get_updated_masks(self, masks: BitmapMasks,
  2637. composed_mask: np.ndarray) -> BitmapMasks:
  2638. """Update masks with composed mask."""
  2639. assert masks.masks.shape[-2:] == composed_mask.shape[-2:], \
  2640. 'Cannot compare two arrays of different size'
  2641. masks.masks = np.where(composed_mask, 0, masks.masks)
  2642. return masks
  2643. def __repr__(self):
  2644. repr_str = self.__class__.__name__
  2645. repr_str += f'(max_num_pasted={self.max_num_pasted}, '
  2646. repr_str += f'bbox_occluded_thr={self.bbox_occluded_thr}, '
  2647. repr_str += f'mask_occluded_thr={self.mask_occluded_thr}, '
  2648. repr_str += f'selected={self.selected}), '
  2649. repr_str += f'paste_by_box={self.paste_by_box})'
  2650. return repr_str
  2651. @TRANSFORMS.register_module()
  2652. class RandomErasing(BaseTransform):
  2653. """RandomErasing operation.
  2654. Random Erasing randomly selects a rectangle region
  2655. in an image and erases its pixels with random values.
  2656. `RandomErasing <https://arxiv.org/abs/1708.04896>`_.
  2657. Required Keys:
  2658. - img
  2659. - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
  2660. - gt_bboxes_labels (np.int64) (optional)
  2661. - gt_ignore_flags (bool) (optional)
  2662. - gt_masks (BitmapMasks) (optional)
  2663. Modified Keys:
  2664. - img
  2665. - gt_bboxes (optional)
  2666. - gt_bboxes_labels (optional)
  2667. - gt_ignore_flags (optional)
  2668. - gt_masks (optional)
  2669. Args:
  2670. n_patches (int or tuple[int, int]): Number of regions to be dropped.
  2671. If it is given as a tuple, number of patches will be randomly
  2672. selected from the closed interval [``n_patches[0]``,
  2673. ``n_patches[1]``].
  2674. ratio (float or tuple[float, float]): The ratio of erased regions.
  2675. It can be ``float`` to use a fixed ratio or ``tuple[float, float]``
  2676. to randomly choose ratio from the interval.
  2677. squared (bool): Whether to erase square region. Defaults to True.
  2678. bbox_erased_thr (float): The threshold for the maximum area proportion
  2679. of the bbox to be erased. When the proportion of the area where the
  2680. bbox is erased is greater than the threshold, the bbox will be
  2681. removed. Defaults to 0.9.
  2682. img_border_value (int or float or tuple): The filled values for
  2683. image border. If float, the same fill value will be used for
  2684. all the three channels of image. If tuple, it should be 3 elements.
  2685. Defaults to 128.
  2686. mask_border_value (int): The fill value used for masks. Defaults to 0.
  2687. seg_ignore_label (int): The fill value used for segmentation map.
  2688. Note this value must equals ``ignore_label`` in ``semantic_head``
  2689. of the corresponding config. Defaults to 255.
  2690. """
  2691. def __init__(
  2692. self,
  2693. n_patches: Union[int, Tuple[int, int]],
  2694. ratio: Union[float, Tuple[float, float]],
  2695. squared: bool = True,
  2696. bbox_erased_thr: float = 0.9,
  2697. img_border_value: Union[int, float, tuple] = 128,
  2698. mask_border_value: int = 0,
  2699. seg_ignore_label: int = 255,
  2700. ) -> None:
  2701. if isinstance(n_patches, tuple):
  2702. assert len(n_patches) == 2 and 0 <= n_patches[0] < n_patches[1]
  2703. else:
  2704. n_patches = (n_patches, n_patches)
  2705. if isinstance(ratio, tuple):
  2706. assert len(ratio) == 2 and 0 <= ratio[0] < ratio[1] <= 1
  2707. else:
  2708. ratio = (ratio, ratio)
  2709. self.n_patches = n_patches
  2710. self.ratio = ratio
  2711. self.squared = squared
  2712. self.bbox_erased_thr = bbox_erased_thr
  2713. self.img_border_value = img_border_value
  2714. self.mask_border_value = mask_border_value
  2715. self.seg_ignore_label = seg_ignore_label
  2716. @cache_randomness
  2717. def _get_patches(self, img_shape: Tuple[int, int]) -> List[list]:
  2718. """Get patches for random erasing."""
  2719. patches = []
  2720. n_patches = np.random.randint(self.n_patches[0], self.n_patches[1] + 1)
  2721. for _ in range(n_patches):
  2722. if self.squared:
  2723. ratio = np.random.random() * (self.ratio[1] -
  2724. self.ratio[0]) + self.ratio[0]
  2725. ratio = (ratio, ratio)
  2726. else:
  2727. ratio = (np.random.random() * (self.ratio[1] - self.ratio[0]) +
  2728. self.ratio[0], np.random.random() *
  2729. (self.ratio[1] - self.ratio[0]) + self.ratio[0])
  2730. ph, pw = int(img_shape[0] * ratio[0]), int(img_shape[1] * ratio[1])
  2731. px1, py1 = np.random.randint(0,
  2732. img_shape[1] - pw), np.random.randint(
  2733. 0, img_shape[0] - ph)
  2734. px2, py2 = px1 + pw, py1 + ph
  2735. patches.append([px1, py1, px2, py2])
  2736. return np.array(patches)
  2737. def _transform_img(self, results: dict, patches: List[list]) -> None:
  2738. """Random erasing the image."""
  2739. for patch in patches:
  2740. px1, py1, px2, py2 = patch
  2741. results['img'][py1:py2, px1:px2, :] = self.img_border_value
  2742. def _transform_bboxes(self, results: dict, patches: List[list]) -> None:
  2743. """Random erasing the bboxes."""
  2744. bboxes = results['gt_bboxes']
  2745. # TODO: unify the logic by using operators in BaseBoxes.
  2746. assert isinstance(bboxes, HorizontalBoxes)
  2747. bboxes = bboxes.numpy()
  2748. left_top = np.maximum(bboxes[:, None, :2], patches[:, :2])
  2749. right_bottom = np.minimum(bboxes[:, None, 2:], patches[:, 2:])
  2750. wh = np.maximum(right_bottom - left_top, 0)
  2751. inter_areas = wh[:, :, 0] * wh[:, :, 1]
  2752. bbox_areas = (bboxes[:, 2] - bboxes[:, 0]) * (
  2753. bboxes[:, 3] - bboxes[:, 1])
  2754. bboxes_erased_ratio = inter_areas.sum(-1) / (bbox_areas + 1e-7)
  2755. valid_inds = bboxes_erased_ratio < self.bbox_erased_thr
  2756. results['gt_bboxes'] = HorizontalBoxes(bboxes[valid_inds])
  2757. results['gt_bboxes_labels'] = results['gt_bboxes_labels'][valid_inds]
  2758. results['gt_ignore_flags'] = results['gt_ignore_flags'][valid_inds]
  2759. if results.get('gt_masks', None) is not None:
  2760. results['gt_masks'] = results['gt_masks'][valid_inds]
  2761. def _transform_masks(self, results: dict, patches: List[list]) -> None:
  2762. """Random erasing the masks."""
  2763. for patch in patches:
  2764. px1, py1, px2, py2 = patch
  2765. results['gt_masks'].masks[:, py1:py2,
  2766. px1:px2] = self.mask_border_value
  2767. def _transform_seg(self, results: dict, patches: List[list]) -> None:
  2768. """Random erasing the segmentation map."""
  2769. for patch in patches:
  2770. px1, py1, px2, py2 = patch
  2771. results['gt_seg_map'][py1:py2, px1:px2] = self.seg_ignore_label
  2772. @autocast_box_type()
  2773. def transform(self, results: dict) -> dict:
  2774. """Transform function to erase some regions of image."""
  2775. patches = self._get_patches(results['img_shape'])
  2776. self._transform_img(results, patches)
  2777. if results.get('gt_bboxes', None) is not None:
  2778. self._transform_bboxes(results, patches)
  2779. if results.get('gt_masks', None) is not None:
  2780. self._transform_masks(results, patches)
  2781. if results.get('gt_seg_map', None) is not None:
  2782. self._transform_seg(results, patches)
  2783. return results
  2784. def __repr__(self):
  2785. repr_str = self.__class__.__name__
  2786. repr_str += f'(n_patches={self.n_patches}, '
  2787. repr_str += f'ratio={self.ratio}, '
  2788. repr_str += f'squared={self.squared}, '
  2789. repr_str += f'bbox_erased_thr={self.bbox_erased_thr}, '
  2790. repr_str += f'img_border_value={self.img_border_value}, '
  2791. repr_str += f'mask_border_value={self.mask_border_value}, '
  2792. repr_str += f'seg_ignore_label={self.seg_ignore_label})'
  2793. return repr_str
  2794. @TRANSFORMS.register_module()
  2795. class CachedMosaic(Mosaic):
  2796. """Cached mosaic augmentation.
  2797. Cached mosaic transform will random select images from the cache
  2798. and combine them into one output image.
  2799. .. code:: text
  2800. mosaic transform
  2801. center_x
  2802. +------------------------------+
  2803. | pad | pad |
  2804. | +-----------+ |
  2805. | | | |
  2806. | | image1 |--------+ |
  2807. | | | | |
  2808. | | | image2 | |
  2809. center_y |----+-------------+-----------|
  2810. | | cropped | |
  2811. |pad | image3 | image4 |
  2812. | | | |
  2813. +----|-------------+-----------+
  2814. | |
  2815. +-------------+
  2816. The cached mosaic transform steps are as follows:
  2817. 1. Append the results from the last transform into the cache.
  2818. 2. Choose the mosaic center as the intersections of 4 images
  2819. 3. Get the left top image according to the index, and randomly
  2820. sample another 3 images from the result cache.
  2821. 4. Sub image will be cropped if image is larger than mosaic patch
  2822. Required Keys:
  2823. - img
  2824. - gt_bboxes (np.float32) (optional)
  2825. - gt_bboxes_labels (np.int64) (optional)
  2826. - gt_ignore_flags (bool) (optional)
  2827. Modified Keys:
  2828. - img
  2829. - img_shape
  2830. - gt_bboxes (optional)
  2831. - gt_bboxes_labels (optional)
  2832. - gt_ignore_flags (optional)
  2833. Args:
  2834. img_scale (Sequence[int]): Image size before mosaic pipeline of single
  2835. image. The shape order should be (width, height).
  2836. Defaults to (640, 640).
  2837. center_ratio_range (Sequence[float]): Center ratio range of mosaic
  2838. output. Defaults to (0.5, 1.5).
  2839. bbox_clip_border (bool, optional): Whether to clip the objects outside
  2840. the border of the image. In some dataset like MOT17, the gt bboxes
  2841. are allowed to cross the border of images. Therefore, we don't
  2842. need to clip the gt bboxes in these cases. Defaults to True.
  2843. pad_val (int): Pad value. Defaults to 114.
  2844. prob (float): Probability of applying this transformation.
  2845. Defaults to 1.0.
  2846. max_cached_images (int): The maximum length of the cache. The larger
  2847. the cache, the stronger the randomness of this transform. As a
  2848. rule of thumb, providing 10 caches for each image suffices for
  2849. randomness. Defaults to 40.
  2850. random_pop (bool): Whether to randomly pop a result from the cache
  2851. when the cache is full. If set to False, use FIFO popping method.
  2852. Defaults to True.
  2853. """
  2854. def __init__(self,
  2855. *args,
  2856. max_cached_images: int = 40,
  2857. random_pop: bool = True,
  2858. **kwargs) -> None:
  2859. super().__init__(*args, **kwargs)
  2860. self.results_cache = []
  2861. self.random_pop = random_pop
  2862. assert max_cached_images >= 4, 'The length of cache must >= 4, ' \
  2863. f'but got {max_cached_images}.'
  2864. self.max_cached_images = max_cached_images
  2865. @cache_randomness
  2866. def get_indexes(self, cache: list) -> list:
  2867. """Call function to collect indexes.
  2868. Args:
  2869. cache (list): The results cache.
  2870. Returns:
  2871. list: indexes.
  2872. """
  2873. indexes = [random.randint(0, len(cache) - 1) for _ in range(3)]
  2874. return indexes
  2875. @autocast_box_type()
  2876. def transform(self, results: dict) -> dict:
  2877. """Mosaic transform function.
  2878. Args:
  2879. results (dict): Result dict.
  2880. Returns:
  2881. dict: Updated result dict.
  2882. """
  2883. # cache and pop images
  2884. self.results_cache.append(copy.deepcopy(results))
  2885. if len(self.results_cache) > self.max_cached_images:
  2886. if self.random_pop:
  2887. index = random.randint(0, len(self.results_cache) - 1)
  2888. else:
  2889. index = 0
  2890. self.results_cache.pop(index)
  2891. if len(self.results_cache) <= 4:
  2892. return results
  2893. if random.uniform(0, 1) > self.prob:
  2894. return results
  2895. indices = self.get_indexes(self.results_cache)
  2896. mix_results = [copy.deepcopy(self.results_cache[i]) for i in indices]
  2897. # TODO: refactor mosaic to reuse these code.
  2898. mosaic_bboxes = []
  2899. mosaic_bboxes_labels = []
  2900. mosaic_ignore_flags = []
  2901. mosaic_masks = []
  2902. with_mask = True if 'gt_masks' in results else False
  2903. if len(results['img'].shape) == 3:
  2904. mosaic_img = np.full(
  2905. (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3),
  2906. self.pad_val,
  2907. dtype=results['img'].dtype)
  2908. else:
  2909. mosaic_img = np.full(
  2910. (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)),
  2911. self.pad_val,
  2912. dtype=results['img'].dtype)
  2913. # mosaic center x, y
  2914. center_x = int(
  2915. random.uniform(*self.center_ratio_range) * self.img_scale[0])
  2916. center_y = int(
  2917. random.uniform(*self.center_ratio_range) * self.img_scale[1])
  2918. center_position = (center_x, center_y)
  2919. loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
  2920. for i, loc in enumerate(loc_strs):
  2921. if loc == 'top_left':
  2922. results_patch = copy.deepcopy(results)
  2923. else:
  2924. results_patch = copy.deepcopy(mix_results[i - 1])
  2925. img_i = results_patch['img']
  2926. h_i, w_i = img_i.shape[:2]
  2927. # keep_ratio resize
  2928. scale_ratio_i = min(self.img_scale[1] / h_i,
  2929. self.img_scale[0] / w_i)
  2930. img_i = mmcv.imresize(
  2931. img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
  2932. # compute the combine parameters
  2933. paste_coord, crop_coord = self._mosaic_combine(
  2934. loc, center_position, img_i.shape[:2][::-1])
  2935. x1_p, y1_p, x2_p, y2_p = paste_coord
  2936. x1_c, y1_c, x2_c, y2_c = crop_coord
  2937. # crop and paste image
  2938. mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
  2939. # adjust coordinate
  2940. gt_bboxes_i = results_patch['gt_bboxes']
  2941. gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
  2942. gt_ignore_flags_i = results_patch['gt_ignore_flags']
  2943. padw = x1_p - x1_c
  2944. padh = y1_p - y1_c
  2945. gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
  2946. gt_bboxes_i.translate_([padw, padh])
  2947. mosaic_bboxes.append(gt_bboxes_i)
  2948. mosaic_bboxes_labels.append(gt_bboxes_labels_i)
  2949. mosaic_ignore_flags.append(gt_ignore_flags_i)
  2950. if with_mask and results_patch.get('gt_masks', None) is not None:
  2951. gt_masks_i = results_patch['gt_masks']
  2952. gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i))
  2953. gt_masks_i = gt_masks_i.translate(
  2954. out_shape=(int(self.img_scale[0] * 2),
  2955. int(self.img_scale[1] * 2)),
  2956. offset=padw,
  2957. direction='horizontal')
  2958. gt_masks_i = gt_masks_i.translate(
  2959. out_shape=(int(self.img_scale[0] * 2),
  2960. int(self.img_scale[1] * 2)),
  2961. offset=padh,
  2962. direction='vertical')
  2963. mosaic_masks.append(gt_masks_i)
  2964. mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
  2965. mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
  2966. mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)
  2967. if self.bbox_clip_border:
  2968. mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]])
  2969. # remove outside bboxes
  2970. inside_inds = mosaic_bboxes.is_inside(
  2971. [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy()
  2972. mosaic_bboxes = mosaic_bboxes[inside_inds]
  2973. mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
  2974. mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
  2975. results['img'] = mosaic_img
  2976. results['img_shape'] = mosaic_img.shape[:2]
  2977. results['gt_bboxes'] = mosaic_bboxes
  2978. results['gt_bboxes_labels'] = mosaic_bboxes_labels
  2979. results['gt_ignore_flags'] = mosaic_ignore_flags
  2980. if with_mask:
  2981. mosaic_masks = mosaic_masks[0].cat(mosaic_masks)
  2982. results['gt_masks'] = mosaic_masks[inside_inds]
  2983. return results
  2984. def __repr__(self):
  2985. repr_str = self.__class__.__name__
  2986. repr_str += f'(img_scale={self.img_scale}, '
  2987. repr_str += f'center_ratio_range={self.center_ratio_range}, '
  2988. repr_str += f'pad_val={self.pad_val}, '
  2989. repr_str += f'prob={self.prob}, '
  2990. repr_str += f'max_cached_images={self.max_cached_images}, '
  2991. repr_str += f'random_pop={self.random_pop})'
  2992. return repr_str
@TRANSFORMS.register_module()
class CachedMixUp(BaseTransform):
    """Cached mixup data augmentation.

    .. code:: text

                         mixup transform
                +------------------------------+
                | mixup image   |              |
                |      +--------|--------+     |
                |      |        |        |     |
                |---------------+        |     |
                |      |                 |     |
                |      |      image      |     |
                |      |                 |     |
                |      |                 |     |
                |      |-----------------+     |
                |             pad              |
                +------------------------------+

    The cached mixup transform steps are as follows:

        1. Append the results from the last transform into the cache.
        2. Another random image is picked from the cache and embedded in
           the top left patch(after padding and resizing)
        3. The target of mixup transform is the weighted average of mixup
           image and origin image.

    Required Keys:

    - img
    - gt_bboxes (np.float32) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)
    - mix_results (List[dict])

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_ignore_flags (optional)

    Args:
        img_scale (Sequence[int]): Image output size after mixup pipeline.
            The shape order should be (width, height). Defaults to (640, 640).
        ratio_range (Sequence[float]): Scale ratio of mixup image.
            Defaults to (0.5, 1.5).
        flip_ratio (float): Horizontal flip ratio of mixup image.
            Defaults to 0.5.
        pad_val (int): Pad value. Defaults to 114.
        max_iters (int): The maximum number of iterations. If the number of
            iterations is greater than `max_iters`, but gt_bbox is still
            empty, then the iteration is terminated. Defaults to 15.
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. In some dataset like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
        max_cached_images (int): The maximum length of the cache. The larger
            the cache, the stronger the randomness of this transform. As a
            rule of thumb, providing 10 caches for each image suffices for
            randomness. Defaults to 20.
        random_pop (bool): Whether to randomly pop a result from the cache
            when the cache is full. If set to False, use FIFO popping method.
            Defaults to True.
        prob (float): Probability of applying this transformation.
            Defaults to 1.0.
    """

    def __init__(self,
                 img_scale: Tuple[int, int] = (640, 640),
                 ratio_range: Tuple[float, float] = (0.5, 1.5),
                 flip_ratio: float = 0.5,
                 pad_val: float = 114.0,
                 max_iters: int = 15,
                 bbox_clip_border: bool = True,
                 max_cached_images: int = 20,
                 random_pop: bool = True,
                 prob: float = 1.0) -> None:
        assert isinstance(img_scale, tuple)
        assert max_cached_images >= 2, 'The length of cache must >= 2, ' \
                                       f'but got {max_cached_images}.'
        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
                                 f'got {prob}.'
        self.dynamic_scale = img_scale
        self.ratio_range = ratio_range
        self.flip_ratio = flip_ratio
        self.pad_val = pad_val
        self.max_iters = max_iters
        self.bbox_clip_border = bbox_clip_border
        self.results_cache = []
        self.max_cached_images = max_cached_images
        self.random_pop = random_pop
        self.prob = prob

    @cache_randomness
    def get_indexes(self, cache: list) -> int:
        """Call function to collect indexes.

        Args:
            cache (list): The result cache.

        Returns:
            int: Index of a cached sample with non-empty gt_bboxes. If no
            such sample is found within ``max_iters`` draws, the last
            drawn index is returned regardless.
        """
        for i in range(self.max_iters):
            index = random.randint(0, len(cache) - 1)
            gt_bboxes_i = cache[index]['gt_bboxes']
            if len(gt_bboxes_i) != 0:
                break
        return index

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """MixUp transform function.

        Blends the current image with a randomly chosen cached image
        (after resize/jitter/flip/crop) using fixed 0.5/0.5 weights, and
        concatenates the annotations of both images.

        Args:
            results (dict): Result dict.

        Returns:
            dict: Updated result dict.
        """
        # cache and pop images
        self.results_cache.append(copy.deepcopy(results))
        if len(self.results_cache) > self.max_cached_images:
            if self.random_pop:
                index = random.randint(0, len(self.results_cache) - 1)
            else:
                index = 0
            self.results_cache.pop(index)

        # Need at least two cached samples (current + one partner).
        if len(self.results_cache) <= 1:
            return results

        if random.uniform(0, 1) > self.prob:
            return results

        index = self.get_indexes(self.results_cache)
        retrieve_results = copy.deepcopy(self.results_cache[index])

        # TODO: refactor mixup to reuse these code.
        if retrieve_results['gt_bboxes'].shape[0] == 0:
            # empty bbox
            return results

        retrieve_img = retrieve_results['img']
        with_mask = True if 'gt_masks' in results else False

        jit_factor = random.uniform(*self.ratio_range)
        # NOTE(review): the flip is applied when the draw is *greater* than
        # ``flip_ratio``, i.e. with probability ``1 - flip_ratio`` — confirm
        # this inversion is intended. (``is_filp`` is a historical typo.)
        is_filp = random.uniform(0, 1) > self.flip_ratio

        # ``dynamic_scale`` is (width, height); the canvas below is (h, w[, 3]).
        if len(retrieve_img.shape) == 3:
            out_img = np.ones(
                (self.dynamic_scale[1], self.dynamic_scale[0], 3),
                dtype=retrieve_img.dtype) * self.pad_val
        else:
            out_img = np.ones(
                self.dynamic_scale[::-1],
                dtype=retrieve_img.dtype) * self.pad_val

        # 1. keep_ratio resize
        scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0],
                          self.dynamic_scale[0] / retrieve_img.shape[1])
        retrieve_img = mmcv.imresize(
            retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
                           int(retrieve_img.shape[0] * scale_ratio)))

        # 2. paste
        out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img

        # 3. scale jit
        # The accumulated ``scale_ratio`` is later used to rescale the
        # retrieved annotations consistently with the image.
        scale_ratio *= jit_factor
        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
                                          int(out_img.shape[0] * jit_factor)))

        # 4. flip
        if is_filp:
            out_img = out_img[:, ::-1, :]

        # 5. random crop
        ori_img = results['img']
        origin_h, origin_w = out_img.shape[:2]
        target_h, target_w = ori_img.shape[:2]
        # NOTE(review): the padded canvas hard-codes 3 channels; a grayscale
        # (2-D) ``ori_img`` would broadcast to 3 channels at step 8 and
        # change the output shape — confirm inputs are always 3-channel.
        padded_img = np.ones((max(origin_h, target_h), max(
            origin_w, target_w), 3)) * self.pad_val
        padded_img = padded_img.astype(np.uint8)
        padded_img[:origin_h, :origin_w] = out_img

        x_offset, y_offset = 0, 0
        if padded_img.shape[0] > target_h:
            y_offset = random.randint(0, padded_img.shape[0] - target_h)
        if padded_img.shape[1] > target_w:
            x_offset = random.randint(0, padded_img.shape[1] - target_w)
        padded_cropped_img = padded_img[y_offset:y_offset + target_h,
                                        x_offset:x_offset + target_w]

        # 6. adjust bbox
        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
        retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
        if with_mask:
            retrieve_gt_masks = retrieve_results['gt_masks'].rescale(
                scale_ratio)

        if self.bbox_clip_border:
            retrieve_gt_bboxes.clip_([origin_h, origin_w])

        if is_filp:
            retrieve_gt_bboxes.flip_([origin_h, origin_w],
                                     direction='horizontal')
            if with_mask:
                retrieve_gt_masks = retrieve_gt_masks.flip()

        # 7. filter
        # Shift annotations into the cropped coordinate frame.
        cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
        cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
        if with_mask:
            retrieve_gt_masks = retrieve_gt_masks.translate(
                out_shape=(target_h, target_w),
                offset=-x_offset,
                direction='horizontal')
            retrieve_gt_masks = retrieve_gt_masks.translate(
                out_shape=(target_h, target_w),
                offset=-y_offset,
                direction='vertical')

        if self.bbox_clip_border:
            cp_retrieve_gt_bboxes.clip_([target_h, target_w])

        # 8. mix up
        # Equal-weight blend of the original and the retrieved image.
        ori_img = ori_img.astype(np.float32)
        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32)

        retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
        retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']

        mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
            (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
        mixup_gt_bboxes_labels = np.concatenate(
            (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
        mixup_gt_ignore_flags = np.concatenate(
            (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)
        if with_mask:
            mixup_gt_masks = retrieve_gt_masks.cat(
                [results['gt_masks'], retrieve_gt_masks])

        # remove outside bbox
        inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy()
        mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
        mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
        mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]
        if with_mask:
            mixup_gt_masks = mixup_gt_masks[inside_inds]

        results['img'] = mixup_img.astype(np.uint8)
        results['img_shape'] = mixup_img.shape[:2]
        results['gt_bboxes'] = mixup_gt_bboxes
        results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
        results['gt_ignore_flags'] = mixup_gt_ignore_flags
        if with_mask:
            results['gt_masks'] = mixup_gt_masks
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(dynamic_scale={self.dynamic_scale}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'flip_ratio={self.flip_ratio}, '
        repr_str += f'pad_val={self.pad_val}, '
        repr_str += f'max_iters={self.max_iters}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
        repr_str += f'max_cached_images={self.max_cached_images}, '
        repr_str += f'random_pop={self.random_pop}, '
        repr_str += f'prob={self.prob})'
        return repr_str