# Base configs this file builds on: the Mask R-CNN R50-FPN model definition,
# the YouTube-VIS dataset settings and the default runtime settings.
_base_ = [
    '../_base_/models/mask-rcnn_r50_fpn.py',
    '../_base_/datasets/youtube_vis.py', '../_base_/default_runtime.py'
]
# Reuse the model defined by the base config as the detector of the tracking
# model. Its own `data_preprocessor` entry is removed here.
detector = _base_.model
detector.pop('data_preprocessor')
# Both RoI heads predict 40 classes.
detector.roi_head.bbox_head.update(dict(num_classes=40))
detector.roi_head.mask_head.update(dict(num_classes=40))
# Override the sampler sizes and the RPN proposal limits of the base model.
detector.train_cfg.rpn.sampler.update(dict(num=64))
detector.train_cfg.rpn_proposal.update(dict(nms_pre=200, max_per_img=200))
detector.train_cfg.rcnn.sampler.update(dict(num=128))
detector.test_cfg.rpn.update(dict(nms_pre=200, max_per_img=200))
detector.test_cfg.rcnn.update(dict(score_thr=0.01))
# Initialise the detector from the published Mask R-CNN R50-FPN 1x COCO
# checkpoint.
detector['init_cfg'] = dict(
    type='Pretrained',
    checkpoint=  # noqa: E251
    'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth'  # noqa: E501
)
# Drop the inherited `model` entry; the top-level `model` is defined from
# scratch later in this file.
# NOTE(review): presumably this keeps the base model from being merged into
# the new `model` dict -- confirm against the config loader.
del _base_.model
# Tracking model: wraps `detector` and adds a track head plus a tracker.
model = dict(
    type='MaskTrackRCNN',
    # Normalisation statistics, BGR->RGB conversion and padding (images and
    # masks) to a multiple of 32.
    data_preprocessor=dict(
        type='TrackDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_mask=True,
        pad_size_divisor=32),
    detector=detector,
    track_head=dict(
        type='RoITrackHead',
        # 7x7 RoIAlign features taken from four feature levels
        # (strides 4/8/16/32), 256 channels each.
        roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        # Embedding head: two FC layers with 1024 output channels.
        embed_head=dict(
            type='RoIEmbedHead',
            num_fcs=2,
            roi_feat_size=7,
            in_channels=256,
            fc_out_channels=1024),
        train_cfg=dict(
            # The positive, negative and minimum-positive IoU thresholds are
            # all 0.5.
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=True,
                ignore_iof_thr=-1),
            # Randomly sample 128 RoIs with a positive fraction of 0.25.
            sampler=dict(
                type='RandomSampler',
                num=128,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    tracker=dict(
        type='MaskTrackRCNNTracker',
        # Weights of the detection score, IoU and label terms used for
        # matching.
        match_weights=dict(det_score=1.0, iou=2.0, det_label=10.0),
        num_frames_retain=20))
# Dataset type and location.
dataset_type = 'YouTubeVISDataset'
data_root = 'data/youtube_vis_2019/'
# The version is sliced out of `data_root`: the four characters before the
# trailing slash. This relies on `data_root` ending in '<year>/'.
dataset_version = data_root[-5:-1]  # 2019 or 2021
# train_dataloader
# `_delete_=True` is set so that this definition stands on its own.
# NOTE(review): presumably this replaces (rather than merges with) the
# train_dataloader inherited from the base dataset config -- confirm.
train_dataloader = dict(
    _delete_=True,
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='TrackImgSampler'),  # image-based sampling
    batch_sampler=dict(type='TrackAspectRatioBatchSampler'),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        dataset_version=dataset_version,
        # The annotation file name hard-codes the 2019 release; keep it in
        # sync with `data_root` / `dataset_version` when switching versions.
        ann_file='annotations/youtube_vis_2019_train.json',
        data_prefix=dict(img_path='train/JPEGImages'),
        # Reuse the training pipeline defined by the base config.
        pipeline=_base_.train_pipeline))
# optimizer
# SGD with momentum and weight decay; gradients are clipped to an L2 norm
# of 35.
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.00125, momentum=0.9, weight_decay=0.0001),
    clip_grad=dict(max_norm=35, norm_type=2))
# learning policy
param_scheduler = [
    # Linear warm-up over the first 500 iterations, starting at 1/3 of the
    # base learning rate.
    dict(
        type='LinearLR',
        start_factor=1.0 / 3.0,
        by_epoch=False,
        begin=0,
        end=500),
    # Step decay by a factor of 0.1 at epochs 8 and 11 of the 12-epoch range.
    dict(
        type='MultiStepLR',
        begin=0,
        end=12,
        by_epoch=True,
        milestones=[8, 11],
        gamma=0.1)
]
# visualizer
# The visualization hook is registered with drawing switched off.
default_hooks = dict(
    visualization=dict(type='TrackVisualizationHook', draw=False))
# A single local backend is configured for the visualizer.
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
# runtime settings
# Train for 12 epochs. `val_begin` (13) is larger than `max_epochs` (12).
# NOTE(review): presumably this means no validation runs during training --
# confirm against the training loop's `val_begin` semantics.
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=13)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# evaluator
# `format_only=True` together with `outfile_prefix`.
# NOTE(review): presumably results are only written out under
# './youtube_vis_results' and no metric is computed locally -- confirm
# against the metric implementation.
val_evaluator = dict(
    type='YouTubeVISMetric',
    metric='youtube_vis_ap',
    outfile_prefix='./youtube_vis_results',
    format_only=True)
# Testing uses the very same evaluator settings as validation.
test_evaluator = val_evaluator
# Remove the temporary module-level name `detector`.
# NOTE(review): presumably so it does not end up as a top-level config key
# once the config is loaded -- confirm against the config loader.
del detector