123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
# Mask R-CNN with a ViT backbone and SimpleFPN neck (ViTDet project config).
#
# Inherits the stock Mask R-CNN R50-FPN model definition and the local
# `lsj-100e_coco-instance.py` settings, then overrides the backbone, neck and
# head settings below.
_base_ = [
    '../../../configs/_base_/models/mask-rcnn_r50_fpn.py',
    './lsj-100e_coco-instance.py',
]

# Import the project package so the custom types referenced by name in this
# file ('ViT', 'SimpleFPN', 'LN2d', 'Fp16CompresssionHook', ...) can be
# resolved -- presumably they are registered there; verify against the package.
custom_imports = dict(imports=['projects.ViTDet.vitdet'])

# Two separate norm configs: 'LN' is passed to the backbone, 'LN2d' to the
# neck and the RoI heads.
backbone_norm_cfg = dict(type='LN', requires_grad=True)
norm_cfg = dict(type='LN2d', requires_grad=True)

# Fixed pad target for every batch; the same value (1024) is passed to the
# backbone as `img_size` below, so keep the two in sync when changing either.
image_size = (1024, 1024)
batch_augments = [
    dict(type='BatchFixedSizePad', size=image_size, pad_mask=True)
]

# model settings
model = dict(
    data_preprocessor=dict(pad_size_divisor=32, batch_augments=batch_augments),
    backbone=dict(
        # `_delete_=True` discards the inherited backbone dict instead of
        # merging into it (MMEngine config inheritance convention).
        _delete_=True,
        type='ViT',
        img_size=1024,
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        drop_path_rate=0.1,
        window_size=14,
        mlp_ratio=4,
        qkv_bias=True,
        norm_cfg=backbone_norm_cfg,
        # Of the 12 blocks (depth=12), indexes 2, 5, 8 and 11 are left out of
        # this list -- presumably those blocks use non-windowed (global)
        # attention; confirm against the ViT implementation.
        window_block_indexes=[
            0,
            1,
            3,
            4,
            6,
            7,
            9,
            10,
        ],
        use_rel_pos=True,
        # NOTE(review): bare relative checkpoint filename -- it looks like the
        # file must be present in the working directory (or the path adjusted)
        # before training; confirm how the loader resolves it.
        init_cfg=dict(
            type='Pretrained', checkpoint='mae_pretrain_vit_base.pth')),
    neck=dict(
        # Replace the inherited neck wholesale, same as the backbone above.
        _delete_=True,
        type='SimpleFPN',
        # Matches the backbone's embed_dim (768).
        backbone_channel=768,
        in_channels=[192, 384, 768, 768],
        out_channels=256,
        num_outs=5,
        norm_cfg=norm_cfg),
    # The entries below have no `_delete_`, so they are merged into the
    # inherited Mask R-CNN head configs rather than replacing them.
    rpn_head=dict(num_convs=2),
    roi_head=dict(
        bbox_head=dict(
            type='Shared4Conv1FCBBoxHead',
            conv_out_channels=256,
            norm_cfg=norm_cfg),
        mask_head=dict(norm_cfg=norm_cfg)))

# NOTE(review): 'Fp16CompresssionHook' is spelled with three 's'. The string
# is a lookup key and must match the registered class name exactly -- check
# the hook definition in projects.ViTDet.vitdet before "fixing" the spelling.
custom_hooks = [dict(type='Fp16CompresssionHook')]
|