NikV09 committed
Commit 4d96d02 · verified · 1 Parent(s): 2d7b155

Push model using huggingface_hub.

Files changed (3):
  1. README.md +4 -34
  2. config.json +32 -20
  3. model.safetensors +2 -2
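For context, this commit message is the default one emitted by `push_to_hub` from the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration. A minimal sketch of the kind of call that produces a commit like this one; the `MapAnything` class body and the repo id below are placeholders, not the real implementation from facebookresearch/map-anything:

```python
import torch
from huggingface_hub import PyTorchModelHubMixin


# Placeholder module: the real MapAnything model lives in the
# facebookresearch/map-anything repo; only the mixin usage matters here.
class MapAnything(torch.nn.Module, PyTorchModelHubMixin):
    def __init__(self, enc_embed_dim: int = 1536):
        super().__init__()
        self.proj = torch.nn.Linear(enc_embed_dim, enc_embed_dim)


model = MapAnything()
# push_to_hub serializes the __init__ kwargs to config.json, the weights to
# model.safetensors, and generates a stub README.md -- the three files this
# commit touches. "Push model using huggingface_hub." is its default message.
model.push_to_hub("NikV09/your-repo-id")  # repo id is a placeholder
```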
README.md CHANGED
@@ -2,39 +2,9 @@
  tags:
  - model_hub_mixin
  - pytorch_model_hub_mixin
- - computer-vision
- - 3d-reconstruction
- - multi-view-stereo
- - depth-estimation
- - camera-pose
- - covisibility
- - mapanything
- license: cc-by-nc-4.0
- language:
- - en
- pipeline_tag: image-to-3d
  ---

- ## Overview
-
- MapAnything is a simple, end-to-end trained transformer model that directly regresses the factored metric 3D geometry of a scene given various types of modalities as inputs. A single feed-forward model supports over 12 different 3D reconstruction tasks including multi-image sfm, multi-view stereo, monocular metric depth estimation, registration, depth completion and more.
-
- This is the CC-BY-NC-4.0 variant of the model. Latest release on Dec 18th 2025.
-
- ## Quick Start
-
- Please refer to our [Github Repo](https://github.com/facebookresearch/map-anything)
-
- ## Citation
-
- If you find our repository useful, please consider giving it a star ⭐ and citing our paper in your work:
-
- ```bibtex
- @inproceedings{keetha2026mapanything,
-   title={{MapAnything}: Universal Feed-Forward Metric 3D Reconstruction},
-   author={Keetha, Nikhil and M{\"u}ller, Norman and Sch{\"o}nberger, Johannes and Porzi, Lorenzo and Zhang, Yuchen and Fischer, Tobias and Knapitsch, Arno and Zauss, Duncan and Weber, Ethan and Antunes, Nelson and others},
-   booktitle={International Conference on 3D Vision (3DV)},
-   year={2026},
-   organization={IEEE}
- }
- ```
+ This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+ - Code: [More Information Needed]
+ - Paper: [More Information Needed]
+ - Docs: [More Information Needed]
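Loading the pushed weights back goes through the same mixin. A minimal sketch, assuming the placeholder `MapAnything` class from the sketch above and a placeholder repo id:

```python
# from_pretrained downloads config.json and model.safetensors from the Hub,
# re-instantiates the class with the stored __init__ kwargs, and loads the
# weights. The repo id is a placeholder.
model = MapAnything.from_pretrained("NikV09/your-repo-id")
model.eval()
```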
config.json CHANGED
@@ -3,8 +3,10 @@
  "data_norm_type": "dinov2",
  "encoder_str": "dinov2",
  "gradient_checkpointing": true,
- "name": "dinov2_large",
- "size": "large",
+ "keep_first_n_layers": 24,
+ "name": "dinov2_giant_24_layers",
+ "norm_returned_features": false,
+ "size": "giant",
  "torch_hub_force_reload": false,
  "uses_torch_hub": true,
  "with_registers": false
@@ -12,20 +14,20 @@
  "geometric_input_config": {
  "cam_prob": 1.0,
  "cam_rot_encoder_config": {
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "global_rep_encoder",
  "in_chans": 4,
  "name": "cam_rot_quats_encoder"
  },
  "cam_trans_encoder_config": {
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "global_rep_encoder",
  "in_chans": 3,
  "name": "cam_trans_encoder"
  },
  "depth_encoder_config": {
  "apply_pe": false,
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "dense_rep_encoder",
  "in_chans": 1,
  "name": "depth_encoder",
@@ -38,7 +40,7 @@
  "pose_scale_norm_all_prob": 0.0,
  "ray_dirs_encoder_config": {
  "apply_pe": false,
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "dense_rep_encoder",
  "in_chans": 3,
  "name": "ray_dirs_encoder",
@@ -46,7 +48,7 @@
  },
  "ray_dirs_prob": 1.0,
  "scale_encoder_config": {
- "enc_embed_dim": 1024,
+ "enc_embed_dim": 1536,
  "encoder_str": "global_rep_encoder",
  "in_chans": 1,
  "name": "scale_encoder"
@@ -60,19 +62,28 @@
  "model_type": "alternating_attention",
  "module_args": {
  "custom_positional_encoding": null,
- "depth": 24,
+ "depth": 16,
+ "dim": 1536,
  "distinguish_ref_and_non_ref_views": true,
  "gradient_checkpointing": false,
  "indices": [
- 11,
- 17
+ 7,
+ 11
  ],
- "input_embed_dim": 1024,
- "name": "aat_24_layers_ifr",
+ "init_values": 1e-05,
+ "input_embed_dim": 1536,
+ "mlp_layer": "dummy",
+ "mlp_ratio": 4,
+ "name": "aat_16_layers_dinov2_vitg_init",
  "norm_intermediate": true,
- "size": "24_layers"
+ "num_heads": 24,
+ "pretrained_checkpoint_path": null,
+ "qk_norm": false,
+ "qkv_bias": true,
+ "size": "16_layers"
  }
  },
+ "info_sharing_mlp_layer_str": "swiglufused",
  "load_specific_pretrained_submodules": false,
  "name": "mapanything",
  "pred_head_config": {
@@ -141,10 +152,10 @@
  3
  ],
  "input_feature_dims": [
- 1024,
- 768,
- 768,
- 768
+ 1536,
+ 1536,
+ 1536,
+ 1536
  ],
  "patch_size": 14
  },
@@ -160,7 +171,7 @@
  "quaternions_vmin": -Infinity
  },
  "pose_head": {
- "input_feature_dim": 768,
+ "input_feature_dim": 1536,
  "num_resconv_block": 2,
  "patch_size": 14,
  "rot_representation_dim": 4
@@ -177,12 +188,13 @@
  "vmin": 1e-08
  },
  "scale_head": {
- "input_feature_dim": 768,
+ "input_feature_dim": 1536,
  "output_dim": 1
  },
  "type": "dpt+pose"
  },
  "pretrained_checkpoint_path": null,
  "specific_pretrained_submodules": [],
- "torch_hub_force_reload": false
+ "torch_hub_force_reload": false,
+ "use_register_tokens_from_encoder": true
  }
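Every hunk above moves an embedding width from the DINOv2 ViT-L values (1024, with 768-dim head inputs) to the single ViT-G width of 1536, so the dims scattered across encoders and prediction heads have to change in lockstep; the new info-sharing block is also self-consistent, since `dim` 1536 over `num_heads` 24 gives the usual 64-dim heads. A quick recursive check over the new config.json confirms the widths moved together; a sketch assuming only that the file parses as JSON:

```python
import json


def iter_values(obj, key):
    """Yield every value stored under `key` anywhere in a nested JSON object."""
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                yield v
            yield from iter_values(v, key)
    elif isinstance(obj, list):
        for item in obj:
            yield from iter_values(item, key)


# Python's json module accepts the -Infinity literal that appears in this config.
with open("config.json") as f:
    cfg = json.load(f)

# After this commit, every encoder embedding dim and prediction-head input
# dim should be the DINOv2 ViT-G width of 1536.
assert all(v == 1536 for v in iter_values(cfg, "enc_embed_dim"))
assert all(v == 1536 for v in iter_values(cfg, "input_feature_dim"))
```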
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:20111747deb2c9a3c02fd3bb91f25ac51be951bdeffb5e89ebd45d6cb268b70e
- size 2253444224
+ oid sha256:50b908b6de0061533f35cbb0b47be3c1dfbfc1ab93ef9e770a5e39299b85b9e2
+ size 4914062480
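These two lines are a Git LFS pointer, not the weights themselves: the blob is addressed by its SHA-256 and byte size, and the roughly 2.2× jump (about 2.25 GB to 4.91 GB) is consistent with the ViT-L to ViT-G encoder swap in config.json. A sketch for verifying a downloaded model.safetensors against the new pointer (`hashlib.file_digest` needs Python 3.11+):

```python
import hashlib
import os

EXPECTED_OID = "50b908b6de0061533f35cbb0b47be3c1dfbfc1ab93ef9e770a5e39299b85b9e2"
EXPECTED_SIZE = 4914062480

# The LFS pointer stores the blob's SHA-256 and size; a resolved download
# must match both fields exactly.
assert os.path.getsize("model.safetensors") == EXPECTED_SIZE
with open("model.safetensors", "rb") as f:
    assert hashlib.file_digest(f, "sha256").hexdigest() == EXPECTED_OID
```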