Sync with nvidia/C-RADIOv2-VLM-H-RC3
- README.md +3 -1
- config.json +12 -2
- model.safetensors +1 -1
- vit_patch_generator.py +19 -0
README.md CHANGED

@@ -6,6 +6,8 @@ license_link: https://developer.download.nvidia.com/licenses/nvidia-open-model-l
 
 # Model Overview
 
+[[**Github**](https://github.com/NVlabs/RADIO)] [[**CVPR 2025**](https://arxiv.org/abs/2412.07679)] [[**CVPR 2024**](https://arxiv.org/abs/2312.06709)]
+
 ## Description
 
 This model performs visual feature extraction.

@@ -78,7 +80,7 @@ import torch
 from PIL import Image
 from transformers import AutoModel, CLIPImageProcessor
 
-hf_repo = "nvidia/C-RADIOv2-H"
+hf_repo = "nvidia/C-RADIOv2-VLM-H-RC3"
 
 image_processor = CLIPImageProcessor.from_pretrained(hf_repo)
 model = AutoModel.from_pretrained(hf_repo, trust_remote_code=True)
config.json CHANGED

@@ -16,7 +16,7 @@
   "cache_dir": null,
   "channels_last": false,
   "checkpoint_hist": 10,
-  "chk_keep_forever":
+  "chk_keep_forever": 100,
   "class_map": "",
   "clip_grad": null,
   "clip_mode": "norm",

@@ -31,6 +31,7 @@
   "crop_pct": null,
   "cutmix": 0.0,
   "cutmix_minmax": null,
+  "damp": null,
   "dataset_download": false,
   "debug_full_knn": false,
   "decay_epochs": 90,

@@ -64,7 +65,7 @@
   "force_new_wandb_id": false,
   "force_spectral_reparam": true,
   "freeze_bn": false,
-  "fsdp":
+  "fsdp": false,
   "full_equivariance": false,
   "fuser": "",
   "gp": null,

@@ -169,6 +170,15 @@
       "name": "siglip2-g",
       "type": "siglip2",
       "use_summary": true
+    },
+    {
+      "fd_normalize": false,
+      "feature_distillation": true,
+      "input_size": 384,
+      "model": "siglip2-g-384",
+      "name": "siglip2-g-dirty",
+      "type": "siglip2",
+      "use_summary": false
     }
   ],
   "torchcompile": null,
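
The substantive config change is the second SigLIP2 teacher appended to the teacher list: the same model family as siglip2-g, but at 384 px input, with feature distillation enabled and the summary head disabled. A hedged sketch for inspecting that list from the downloaded config; the "args"/"teachers" key path is an assumption, since the hunk shows only the list entries themselves:

import json

with open("config.json") as f:
    cfg = json.load(f)

# Key path is assumed; adjust if the config nests the teacher list differently.
for teacher in cfg.get("args", {}).get("teachers", []):
    print(teacher.get("name"), teacher.get("type"), teacher.get("use_summary"))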
model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:96ff3bfec4f732d68a0c38c41a49de043abd2503df24481526ea87d26dd6a4f5
 size 2606616120
vit_patch_generator.py CHANGED

@@ -119,6 +119,10 @@ class ViTPatchGenerator(nn.Module):
         'pos_embed',
     ]
 
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        if self.abs_pos:
+            self._load_embed(state_dict[f'{prefix}pos_embed'], self.pos_embed)
+
     def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter):
         if src_embed.shape != targ_embed.shape:
             src_size = int(math.sqrt(src_embed.shape[1]))

@@ -281,3 +285,18 @@ class ViTPatchLinear(nn.Linear):
             **factory
         )
         self.patch_size = patch_size
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        if self.bias is not None:
+            self.bias.data.copy_(state_dict[f'{prefix}bias'])
+
+        chk_weight = state_dict[f'{prefix}weight']
+        if chk_weight.shape != self.weight.shape:
+            src_patch_size = int(math.sqrt(chk_weight.shape[1] // 3))
+
+            assert (src_patch_size ** 2) * 3 == chk_weight.shape[1], 'Unable to interpolate non-square patch size'
+
+            chk_weight = rearrange(chk_weight, 'b (c h w) -> b c h w', c=3, h=src_patch_size, w=src_patch_size)
+            chk_weight = F.interpolate(chk_weight, size=(self.patch_size, self.patch_size), mode='bicubic', align_corners=True, antialias=False)
+            chk_weight = rearrange(chk_weight, 'b c h w -> b (c h w)')
+        self.weight.data.copy_(chk_weight)
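
Both new _load_from_state_dict overrides let checkpoints trained at one patch size load into a model configured for another: ViTPatchGenerator routes the checkpoint's pos_embed through the existing _load_embed resampler, while ViTPatchLinear treats each output row of the patch-projection weight as a flattened 3-channel patch kernel and bicubically resamples it. A self-contained sketch of that weight resampling with toy shapes (not the repository's code path; assumes einops is installed):

import math
import torch
import torch.nn.functional as F
from einops import rearrange

def resample_patch_weight(weight: torch.Tensor, new_patch_size: int) -> torch.Tensor:
    # weight has shape (embed_dim, 3 * p * p): one flattened RGB patch kernel per row.
    in_dim = weight.shape[1]
    src_patch_size = int(math.sqrt(in_dim // 3))
    assert 3 * src_patch_size ** 2 == in_dim, 'non-square patch size'

    # Unflatten each row into a 3 x p x p kernel, resample it, and flatten back.
    kernels = rearrange(weight, 'b (c h w) -> b c h w', c=3, h=src_patch_size, w=src_patch_size)
    kernels = F.interpolate(kernels, size=(new_patch_size, new_patch_size),
                            mode='bicubic', align_corners=True, antialias=False)
    return rearrange(kernels, 'b c h w -> b (c h w)')

# Toy check: a 16 px patch weight resampled to 14 px keeps the embedding dim.
w16 = torch.randn(1280, 3 * 16 * 16)
w14 = resample_patch_weight(w16, 14)
assert w14.shape == (1280, 3 * 14 * 14)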
|