Sync with nvidia/C-RADIOv2-VLM-H-RC3
- README.md +3 -1
- config.json +12 -2
- model.safetensors +1 -1
- vit_patch_generator.py +19 -0
README.md CHANGED

@@ -6,6 +6,8 @@ license_link: https://developer.download.nvidia.com/licenses/nvidia-open-model-l
 
 # Model Overview
 
+[[**Github**](https://github.com/NVlabs/RADIO)] [[**CVPR 2025**](https://arxiv.org/abs/2412.07679)] [[**CVPR 2024**](https://arxiv.org/abs/2312.06709)]
+
 ## Description
 
 This model performs visual feature extraction.

@@ -78,7 +80,7 @@ import torch
 from PIL import Image
 from transformers import AutoModel, CLIPImageProcessor
 
-hf_repo = "nvidia/C-RADIOv2-H"
+hf_repo = "nvidia/C-RADIOv2-VLM-H-RC3"
 
 image_processor = CLIPImageProcessor.from_pretrained(hf_repo)
 model = AutoModel.from_pretrained(hf_repo, trust_remote_code=True)
config.json CHANGED

@@ -16,7 +16,7 @@
   "cache_dir": null,
   "channels_last": false,
   "checkpoint_hist": 10,
-  "chk_keep_forever":
+  "chk_keep_forever": 100,
   "class_map": "",
   "clip_grad": null,
   "clip_mode": "norm",

@@ -31,6 +31,7 @@
   "crop_pct": null,
   "cutmix": 0.0,
   "cutmix_minmax": null,
+  "damp": null,
   "dataset_download": false,
   "debug_full_knn": false,
   "decay_epochs": 90,

@@ -64,7 +65,7 @@
   "force_new_wandb_id": false,
   "force_spectral_reparam": true,
   "freeze_bn": false,
-  "fsdp":
+  "fsdp": false,
   "full_equivariance": false,
   "fuser": "",
   "gp": null,

@@ -169,6 +170,15 @@
       "name": "siglip2-g",
       "type": "siglip2",
       "use_summary": true
+    },
+    {
+      "fd_normalize": false,
+      "feature_distillation": true,
+      "input_size": 384,
+      "model": "siglip2-g-384",
+      "name": "siglip2-g-dirty",
+      "type": "siglip2",
+      "use_summary": false
     }
   ],
   "torchcompile": null,
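
The substantive config change is the second SigLIP2 teacher appended to the teacher list: the same model family as siglip2-g, but at 384 px input, with feature distillation enabled and the summary head disabled. A hedged sketch for inspecting that list from the downloaded config; the "args"/"teachers" key path is an assumption, since the hunk shows only the list entries themselves:

import json

with open("config.json") as f:
    cfg = json.load(f)

# Key path is assumed; adjust if the config nests the teacher list differently.
for teacher in cfg.get("args", {}).get("teachers", []):
    print(teacher.get("name"), teacher.get("type"), teacher.get("use_summary"))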
model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:96ff3bfec4f732d68a0c38c41a49de043abd2503df24481526ea87d26dd6a4f5
 size 2606616120
vit_patch_generator.py CHANGED

@@ -119,6 +119,10 @@ class ViTPatchGenerator(nn.Module):
         'pos_embed',
     ]
 
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        if self.abs_pos:
+            self._load_embed(state_dict[f'{prefix}pos_embed'], self.pos_embed)
+
     def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter):
         if src_embed.shape != targ_embed.shape:
             src_size = int(math.sqrt(src_embed.shape[1]))

@@ -281,3 +285,18 @@ class ViTPatchLinear(nn.Linear):
             **factory
         )
         self.patch_size = patch_size
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        if self.bias is not None:
+            self.bias.data.copy_(state_dict[f'{prefix}bias'])
+
+        chk_weight = state_dict[f'{prefix}weight']
+        if chk_weight.shape != self.weight.shape:
+            src_patch_size = int(math.sqrt(chk_weight.shape[1] // 3))
+
+            assert (src_patch_size ** 2) * 3 == chk_weight.shape[1], 'Unable to interpolate non-square patch size'
+
+            chk_weight = rearrange(chk_weight, 'b (c h w) -> b c h w', c=3, h=src_patch_size, w=src_patch_size)
+            chk_weight = F.interpolate(chk_weight, size=(self.patch_size, self.patch_size), mode='bicubic', align_corners=True, antialias=False)
+            chk_weight = rearrange(chk_weight, 'b c h w -> b (c h w)')
+        self.weight.data.copy_(chk_weight)
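
Both new _load_from_state_dict overrides let checkpoints trained at one patch size load into a model configured for another: ViTPatchGenerator routes the checkpoint's pos_embed through the existing _load_embed resampler, while ViTPatchLinear treats each output row of the patch-projection weight as a flattened 3-channel patch kernel and bicubically resamples it. A self-contained sketch of that weight resampling with toy shapes (not the repository's code path; assumes einops is installed):

import math
import torch
import torch.nn.functional as F
from einops import rearrange

def resample_patch_weight(weight: torch.Tensor, new_patch_size: int) -> torch.Tensor:
    # weight has shape (embed_dim, 3 * p * p): one flattened RGB patch kernel per row.
    in_dim = weight.shape[1]
    src_patch_size = int(math.sqrt(in_dim // 3))
    assert 3 * src_patch_size ** 2 == in_dim, 'non-square patch size'

    # Unflatten each row into a 3 x p x p kernel, resample it, and flatten back.
    kernels = rearrange(weight, 'b (c h w) -> b c h w', c=3, h=src_patch_size, w=src_patch_size)
    kernels = F.interpolate(kernels, size=(new_patch_size, new_patch_size),
                            mode='bicubic', align_corners=True, antialias=False)
    return rearrange(kernels, 'b c h w -> b (c h w)')

# Toy check: a 16 px patch weight resampled to 14 px keeps the embedding dim.
w16 = torch.randn(1280, 3 * 16 * 16)
w14 = resample_patch_weight(w16, 14)
assert w14.shape == (1280, 3 * 14 * 14)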
|