# vitdet_mask-rcnn_vit-b-mae_lsj-100e.py (1.5 KB)

  1. _base_ = [
  2. '../../../configs/_base_/models/mask-rcnn_r50_fpn.py',
  3. './lsj-100e_coco-instance.py',
  4. ]
  5. custom_imports = dict(imports=['projects.ViTDet.vitdet'])
  6. backbone_norm_cfg = dict(type='LN', requires_grad=True)
  7. norm_cfg = dict(type='LN2d', requires_grad=True)
  8. image_size = (1024, 1024)
  9. batch_augments = [
  10. dict(type='BatchFixedSizePad', size=image_size, pad_mask=True)
  11. ]
  12. # model settings
  13. model = dict(
  14. data_preprocessor=dict(pad_size_divisor=32, batch_augments=batch_augments),
  15. backbone=dict(
  16. _delete_=True,
  17. type='ViT',
  18. img_size=1024,
  19. patch_size=16,
  20. embed_dim=768,
  21. depth=12,
  22. num_heads=12,
  23. drop_path_rate=0.1,
  24. window_size=14,
  25. mlp_ratio=4,
  26. qkv_bias=True,
  27. norm_cfg=backbone_norm_cfg,
  28. window_block_indexes=[
  29. 0,
  30. 1,
  31. 3,
  32. 4,
  33. 6,
  34. 7,
  35. 9,
  36. 10,
  37. ],
  38. use_rel_pos=True,
  39. init_cfg=dict(
  40. type='Pretrained', checkpoint='mae_pretrain_vit_base.pth')),
  41. neck=dict(
  42. _delete_=True,
  43. type='SimpleFPN',
  44. backbone_channel=768,
  45. in_channels=[192, 384, 768, 768],
  46. out_channels=256,
  47. num_outs=5,
  48. norm_cfg=norm_cfg),
  49. rpn_head=dict(num_convs=2),
  50. roi_head=dict(
  51. bbox_head=dict(
  52. type='Shared4Conv1FCBBoxHead',
  53. conv_out_channels=256,
  54. norm_cfg=norm_cfg),
  55. mask_head=dict(norm_cfg=norm_cfg)))
  56. custom_hooks = [dict(type='Fp16CompresssionHook')]