@@ -118,6 +118,17 @@ def _cfg(url='', **kwargs):
118
118
'vit_deit_base_distilled_patch16_384' : _cfg (
119
119
url = 'https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth' ,
120
120
input_size = (3 , 384 , 384 ), crop_pct = 1.0 , classifier = ('head' , 'head_dist' )),
121
+
122
+ # ViT ImageNet-21K-P pretraining
123
+ 'vit_base_patch16_224_miil_in21k' : _cfg (
124
+ url = 'https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm/vit_base_patch16_224_in21k_miil.pth' ,
125
+ mean = (0 , 0 , 0 ), std = (1 , 1 , 1 ), crop_pct = 0.875 , interpolation = 'bilinear' , num_classes = 11221 ,
126
+ ),
127
+ 'vit_base_patch16_224_miil' : _cfg (
128
+ url = 'https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm'
129
+ '/vit_base_patch16_224_1k_miil_84_4.pth' ,
130
+ mean = (0 , 0 , 0 ), std = (1 , 1 , 1 ), crop_pct = 0.875 , interpolation = 'bilinear' ,
131
+ ),
121
132
}
122
133
123
134
@@ -687,3 +698,23 @@ def vit_deit_base_distilled_patch16_384(pretrained=False, **kwargs):
687
698
model = _create_vision_transformer (
688
699
'vit_deit_base_distilled_patch16_384' , pretrained = pretrained , distilled = True , ** model_kwargs )
689
700
return model
701
+
702
+
703
@register_model
def vit_base_patch16_224_miil_in21k(pretrained=False, **kwargs):
    """ViT-Base (ViT-B/16) from the original ViT paper (https://arxiv.org/abs/2010.11929).

    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
    NOTE: unlike the standard ViT-B/16 variants, this model uses no bias on the
    qkv projections (qkv_bias=False), matching the MIIL pretrained weights.
    """
    cfg = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False)
    cfg.update(kwargs)  # caller overrides take precedence, as with dict(**kwargs)
    return _create_vision_transformer(
        'vit_base_patch16_224_miil_in21k', pretrained=pretrained, **cfg)
711
+
712
+
713
@register_model
def vit_base_patch16_224_miil(pretrained=False, **kwargs):
    """ViT-Base (ViT-B/16) from the original ViT paper (https://arxiv.org/abs/2010.11929).

    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
    NOTE: unlike the standard ViT-B/16 variants, this model uses no bias on the
    qkv projections (qkv_bias=False), matching the MIIL pretrained weights.
    """
    cfg = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False)
    cfg.update(kwargs)  # caller overrides take precedence, as with dict(**kwargs)
    return _create_vision_transformer(
        'vit_base_patch16_224_miil', pretrained=pretrained, **cfg)
0 commit comments