NikV09 committed
Commit 4d96d02 · verified · 1 Parent(s): 2d7b155

Push model using huggingface_hub.

Files changed (3):
  1. README.md +4 -34
  2. config.json +32 -20
  3. model.safetensors +2 -2
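For context, this commit message is the default one emitted by `push_to_hub` from the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration. A minimal sketch of the kind of call that produces a commit like this one; the `MapAnything` class body and the repo id below are placeholders, not the real implementation from facebookresearch/map-anything:

```python
import torch
from huggingface_hub import PyTorchModelHubMixin


# Placeholder module: the real MapAnything model lives in the
# facebookresearch/map-anything repo; only the mixin usage matters here.
class MapAnything(torch.nn.Module, PyTorchModelHubMixin):
    def __init__(self, enc_embed_dim: int = 1536):
        super().__init__()
        self.proj = torch.nn.Linear(enc_embed_dim, enc_embed_dim)


model = MapAnything()
# push_to_hub serializes the __init__ kwargs to config.json, the weights to
# model.safetensors, and generates a stub README.md -- the three files this
# commit touches. "Push model using huggingface_hub." is its default message.
model.push_to_hub("NikV09/your-repo-id")  # repo id is a placeholder
```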
README.md CHANGED
@@ -2,39 +2,9 @@
  tags:
  - model_hub_mixin
  - pytorch_model_hub_mixin
- - computer-vision
- - 3d-reconstruction
- - multi-view-stereo
- - depth-estimation
- - camera-pose
- - covisibility
- - mapanything
- license: cc-by-nc-4.0
- language:
- - en
- pipeline_tag: image-to-3d
  ---

- ## Overview
-
- MapAnything is a simple, end-to-end trained transformer model that directly regresses the factored metric 3D geometry of a scene given various types of modalities as inputs. A single feed-forward model supports over 12 different 3D reconstruction tasks including multi-image sfm, multi-view stereo, monocular metric depth estimation, registration, depth completion and more.
-
- This is the CC-BY-NC-4.0 variant of the model. Latest release on Dec 18th 2025.
-
- ## Quick Start
-
- Please refer to our [Github Repo](https://github.com/facebookresearch/map-anything)
-
- ## Citation
-
- If you find our repository useful, please consider giving it a star ⭐ and citing our paper in your work:
-
- ```bibtex
- @inproceedings{keetha2026mapanything,
-   title={{MapAnything}: Universal Feed-Forward Metric 3D Reconstruction},
-   author={Keetha, Nikhil and M{\"u}ller, Norman and Sch{\"o}nberger, Johannes and Porzi, Lorenzo and Zhang, Yuchen and Fischer, Tobias and Knapitsch, Arno and Zauss, Duncan and Weber, Ethan and Antunes, Nelson and others},
-   booktitle={International Conference on 3D Vision (3DV)},
-   year={2026},
-   organization={IEEE}
- }
- ```
+ This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+ - Code: [More Information Needed]
+ - Paper: [More Information Needed]
+ - Docs: [More Information Needed]
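Loading the pushed weights back goes through the same mixin. A minimal sketch, assuming the placeholder `MapAnything` class from the sketch above and a placeholder repo id:

```python
# from_pretrained downloads config.json and model.safetensors from the Hub,
# re-instantiates the class with the stored __init__ kwargs, and loads the
# weights. The repo id is a placeholder.
model = MapAnything.from_pretrained("NikV09/your-repo-id")
model.eval()
```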
config.json CHANGED
@@ -3,8 +3,10 @@
  "data_norm_type": "dinov2",
  "encoder_str": "dinov2",
  "gradient_checkpointing": true,
- "name": "dinov2_large",
- "size": "large",
+ "keep_first_n_layers": 24,
+ "name": "dinov2_giant_24_layers",
+ "norm_returned_features": false,
+ "size": "giant",
  "torch_hub_force_reload": false,
  "uses_torch_hub": true,
  "with_registers": false
@@ -12,20 +14,20 @@
  "geometric_input_config": {
  "cam_prob": 1.0,
  "cam_rot_encoder_config": {
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "global_rep_encoder",
  "in_chans": 4,
  "name": "cam_rot_quats_encoder"
  },
  "cam_trans_encoder_config": {
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "global_rep_encoder",
  "in_chans": 3,
  "name": "cam_trans_encoder"
  },
  "depth_encoder_config": {
  "apply_pe": false,
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "dense_rep_encoder",
  "in_chans": 1,
  "name": "depth_encoder",
@@ -38,7 +40,7 @@
  "pose_scale_norm_all_prob": 0.0,
  "ray_dirs_encoder_config": {
  "apply_pe": false,
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "dense_rep_encoder",
  "in_chans": 3,
  "name": "ray_dirs_encoder",
@@ -46,7 +48,7 @@
  },
  "ray_dirs_prob": 1.0,
  "scale_encoder_config": {
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "global_rep_encoder",
  "in_chans": 1,
  "name": "scale_encoder"
@@ -60,19 +62,28 @@
  "model_type": "alternating_attention",
  "module_args": {
  "custom_positional_encoding": null,
- "depth": 24,
+ "depth": 16,
+ "dim": 1536,
  "distinguish_ref_and_non_ref_views": true,
  "gradient_checkpointing": false,
  "indices": [
- 11,
- 17
+ 7,
+ 11
  ],
- "input_embed_dim": 1024,
- "name": "aat_24_layers_ifr",
+ "init_values": 1e-05,
+ "input_embed_dim": 1536,
+ "mlp_layer": "dummy",
+ "mlp_ratio": 4,
+ "name": "aat_16_layers_dinov2_vitg_init",
  "norm_intermediate": true,
- "size": "24_layers"
+ "num_heads": 24,
+ "pretrained_checkpoint_path": null,
+ "qk_norm": false,
+ "qkv_bias": true,
+ "size": "16_layers"
  }
  },
+ "info_sharing_mlp_layer_str": "swiglufused",
  "load_specific_pretrained_submodules": false,
  "name": "mapanything",
  "pred_head_config": {
@@ -141,10 +152,10 @@
  3
  ],
  "input_feature_dims": [
- 1024,
- 768,
- 768,
- 768
+ 1536,
+ 1536,
+ 1536,
+ 1536
  ],
  "patch_size": 14
  },
@@ -160,7 +171,7 @@
  "quaternions_vmin": -Infinity
  },
  "pose_head": {
- "input_feature_dim": 768,
+ "input_feature_dim": 1536,
  "num_resconv_block": 2,
  "patch_size": 14,
  "rot_representation_dim": 4
@@ -177,12 +188,13 @@
  "vmin": 1e-08
  },
  "scale_head": {
- "input_feature_dim": 768,
+ "input_feature_dim": 1536,
  "output_dim": 1
  },
  "type": "dpt+pose"
  },
  "pretrained_checkpoint_path": null,
  "specific_pretrained_submodules": [],
- "torch_hub_force_reload": false
+ "torch_hub_force_reload": false,
+ "use_register_tokens_from_encoder": true
  }
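Every hunk above moves an embedding width from the DINOv2 ViT-L values (1024, with 768-dim head inputs) to the single ViT-G width of 1536, so the dims scattered across encoders and prediction heads have to change in lockstep; the new info-sharing block is also self-consistent, since `dim` 1536 over `num_heads` 24 gives the usual 64-dim heads. A quick recursive check over the new config.json confirms the widths moved together; a sketch assuming only that the file parses as JSON:

```python
import json


def iter_values(obj, key):
    """Yield every value stored under `key` anywhere in a nested JSON object."""
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                yield v
            yield from iter_values(v, key)
    elif isinstance(obj, list):
        for item in obj:
            yield from iter_values(item, key)


# Python's json module accepts the -Infinity literal that appears in this config.
with open("config.json") as f:
    cfg = json.load(f)

# After this commit, every encoder embedding dim and prediction-head input
# dim should be the DINOv2 ViT-G width of 1536.
assert all(v == 1536 for v in iter_values(cfg, "enc_embed_dim"))
assert all(v == 1536 for v in iter_values(cfg, "input_feature_dim"))
```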
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:20111747deb2c9a3c02fd3bb91f25ac51be951bdeffb5e89ebd45d6cb268b70e
- size 2253444224
+ oid sha256:50b908b6de0061533f35cbb0b47be3c1dfbfc1ab93ef9e770a5e39299b85b9e2
+ size 4914062480
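These two lines are a Git LFS pointer, not the weights themselves: the blob is addressed by its SHA-256 and byte size, and the roughly 2.2× jump (about 2.25 GB to 4.91 GB) is consistent with the ViT-L to ViT-G encoder swap in config.json. A sketch for verifying a downloaded model.safetensors against the new pointer (`hashlib.file_digest` needs Python 3.11+):

```python
import hashlib
import os

EXPECTED_OID = "50b908b6de0061533f35cbb0b47be3c1dfbfc1ab93ef9e770a5e39299b85b9e2"
EXPECTED_SIZE = 4914062480

# The LFS pointer stores the blob's SHA-256 and size; a resolved download
# must match both fields exactly.
assert os.path.getsize("model.safetensors") == EXPECTED_SIZE
with open("model.safetensors", "rb") as f:
    assert hashlib.file_digest(f, "sha256").hexdigest() == EXPECTED_OID
```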