| """Multi-view geometry helpers around pycolmap. |
| |
| This module extracts the intrinsics/extrinsics we need for triangulation, |
| in a shape-normalised form that doesn't depend on pycolmap's exact version. |
| |
| Everything here is pure numpy + pycolmap — no torch, no kornia — so it can |
| run inside the HuggingFace submission container without installs. |
| |
| Key data structure: ``ViewInfo`` (a plain dict) with keys: |
| |
| image_id str — the short sample-level id (matches entry['image_ids']) |
| colmap_img pycolmap.Image |
| camera_id int |
| K (3,3) float64 — calibration matrix |
| R (3,3) float64 — world→camera rotation |
| t (3,) float64 — world→camera translation |
| P (3,4) float64 — K @ [R | t] (projection matrix) |
| center (3,) float64 — camera centre in world coords, -R^T t |
| width, height int — image resolution at COLMAP scale |
| |
| Downstream code uses ``P`` for DLT triangulation and ``K, R, t`` for epipolar |
| geometry. All functions here are side-effect-free. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import numpy as np |
|
|
| from hoho2025.example_solutions import _cam_matrix_from_image |
|
|
|
|
| def get_view_info(colmap_rec, img_id_substring: str) -> dict | None: |
| """Return ViewInfo for the COLMAP image whose name contains ``img_id_substring``. |
| |
| Returns None if the image is not registered in the reconstruction. |
| """ |
| found = None |
| for _, col_img in colmap_rec.images.items(): |
| if img_id_substring in col_img.name: |
| found = col_img |
| break |
| if found is None: |
| return None |
|
|
| R, t = _cam_matrix_from_image(found) |
| cam = colmap_rec.cameras[found.camera_id] |
| K = np.asarray(cam.calibration_matrix(), dtype=np.float64) |
| P = K @ np.hstack([R, t.reshape(3, 1)]) |
| center = -R.T @ t |
|
|
| return { |
| "image_id": img_id_substring, |
| "colmap_img": found, |
| "camera_id": int(found.camera_id), |
| "K": K, |
| "R": R, |
| "t": t, |
| "P": P, |
| "center": center, |
| "width": int(cam.width), |
| "height": int(cam.height), |
| } |
|
|
|
|
| def collect_views(colmap_rec, image_ids) -> dict[str, dict]: |
| """Build a mapping ``{image_id → ViewInfo}`` for every id found in the recon. |
| |
| Skips ids that are not registered (returns fewer items than requested |
| — caller must handle the missing keys). |
| """ |
| out: dict[str, dict] = {} |
| for iid in image_ids: |
| info = get_view_info(colmap_rec, iid) |
| if info is not None: |
| out[iid] = info |
| return out |
|
|
|
|
| def project_world_to_image(P: np.ndarray, points3d: np.ndarray) -> tuple[np.ndarray, np.ndarray]: |
| """Project Nx3 world points through a 3x4 projection matrix. |
| |
| Returns |
| ------- |
| uv : (N, 2) float64 — pixel coordinates |
| z : (N,) float64 — camera-space depth (>0 means in front of the camera) |
| """ |
| pts = np.asarray(points3d, dtype=np.float64) |
| if pts.ndim == 1: |
| pts = pts.reshape(1, 3) |
| homog = np.hstack([pts, np.ones((len(pts), 1))]) |
| proj = homog @ P.T |
| z = proj[:, 2] |
| safe = np.where(np.abs(z) < 1e-12, 1e-12, z) |
| uv = proj[:, :2] / safe[:, None] |
| return uv, z |
|
|
|
|
| def relative_pose(view_a: dict, view_b: dict) -> tuple[np.ndarray, np.ndarray]: |
| """Return rotation and translation from view_a's frame to view_b's frame. |
| |
| If x_a is a point in view_a's camera frame, then |
| x_b = R_ab @ x_a + t_ab |
| with |
| R_ab = R_b @ R_a^T |
| t_ab = t_b - R_ab @ t_a |
| """ |
| R_a, t_a = view_a["R"], view_a["t"] |
| R_b, t_b = view_b["R"], view_b["t"] |
| R_ab = R_b @ R_a.T |
| t_ab = t_b - R_ab @ t_a |
| return R_ab, t_ab |
|
|
|
|
| def _skew(v: np.ndarray) -> np.ndarray: |
| x, y, z = v |
| return np.array([[0, -z, y], |
| [z, 0, -x], |
| [-y, x, 0]], dtype=np.float64) |
|
|
|
|
| def fundamental_matrix(view_a: dict, view_b: dict) -> np.ndarray: |
| """Compute the fundamental matrix F_ab such that |
| x_b^T @ F_ab @ x_a = 0 |
| for corresponding points (in homogeneous pixel coordinates). |
| |
| Derivation: F = K_b^{-T} · [t_ab]× · R_ab · K_a^{-1} |
| """ |
| R_ab, t_ab = relative_pose(view_a, view_b) |
| K_a_inv = np.linalg.inv(view_a["K"]) |
| K_b_inv_T = np.linalg.inv(view_b["K"]).T |
| E = _skew(t_ab) @ R_ab |
| F = K_b_inv_T @ E @ K_a_inv |
| return F |
|
|
|
|
| def epipolar_line(F: np.ndarray, point_in_a: np.ndarray) -> np.ndarray: |
| """Epipolar line in view b induced by a point in view a. |
| |
| Returns ``(a, b, c)`` with ``a*u + b*v + c = 0`` in view b. |
| """ |
| x = np.array([point_in_a[0], point_in_a[1], 1.0], dtype=np.float64) |
| return F @ x |
|
|
|
|
| def point_to_line_distance(line: np.ndarray, point_uv: np.ndarray) -> float: |
| """Perpendicular distance from a 2D point to a homogeneous line (a,b,c).""" |
| a, b, c = line |
| num = abs(a * point_uv[0] + b * point_uv[1] + c) |
| den = np.sqrt(a * a + b * b) + 1e-12 |
| return float(num / den) |
|
|
|
|
| def triangulate_dlt(Ps, pts2d) -> np.ndarray: |
| """Linear triangulation (DLT) from ``>=2`` views. |
| |
| Parameters |
| ---------- |
| Ps : sequence of (3,4) projection matrices |
| pts2d : sequence of (x, y) pixel coordinates, one per view |
| |
| Returns the 3D point as a (3,) ndarray in world coordinates. |
| """ |
| A = [] |
| for P, (x, y) in zip(Ps, pts2d): |
| A.append(x * P[2] - P[0]) |
| A.append(y * P[2] - P[1]) |
| A = np.asarray(A, dtype=np.float64) |
| try: |
| _, _, Vt = np.linalg.svd(A) |
| except Exception: |
| return np.array([np.nan, np.nan, np.nan], dtype=np.float64) |
| X = Vt[-1] |
| if abs(X[3]) < 1e-12: |
| return np.array([np.nan, np.nan, np.nan], dtype=np.float64) |
| return X[:3] / X[3] |
|
|
|
|
| def mean_reprojection_error(X: np.ndarray, Ps, pts2d) -> float: |
| """Mean L2 reprojection error of ``X`` across multiple views. |
| |
| Points behind the camera (depth <= 0) contribute a large penalty so the |
| caller can use this as a direct cost for track acceptance. |
| """ |
| if np.any(~np.isfinite(X)): |
| return float("inf") |
| errs = [] |
| for P, uv in zip(Ps, pts2d): |
| u, z = project_world_to_image(P, X.reshape(1, 3)) |
| if z[0] <= 0: |
| return float("inf") |
| errs.append(float(np.linalg.norm(u[0] - np.asarray(uv, dtype=np.float64)))) |
| if not errs: |
| return float("inf") |
| return float(np.mean(errs)) |
|
|