# Codeseys's picture
# Wave 13: serverless DiLoCo + replaysim normalization + 3 distillation losses + PRIME-RL + Monarch
# b266c31
"""Modal executor — skeleton for v0.
This file is a STUB. The full Modal integration requires the `modal`
client library (`pip install modal`) and a configured Modal account
(`~/.modal.toml`). The user's environment has both, but the test suite
must run without them, so this file is kept import-safe: `modal` is
imported only inside `ModalExecutor.__init__`, never at module level.
The real implementation is deferred to the v0 polish wave; the
`ModalExecutor` class docstring below is the contract.
"""
from __future__ import annotations
from typing import Any, Callable, Mapping
from composer_replication.diloco.serverless.executor import (
ReplicaHandle,
ServerlessExecutor,
)
class ModalExecutor(ServerlessExecutor):
    """Run replicas as Modal Functions in parallel.

    Reference implementation pattern (per ADR-005)::

        @app.function(gpu="A100-40GB", timeout=3600)
        def run_replica(rank: int, rendezvous_uri: str, **kwargs):
            os.environ["REPLICA_RANK"] = str(rank)
            from composer_replication.diloco.serverless import (
                MockManager, ObjectStoreAllReduce,
            )
            store = ObjectStoreAllReduce(rendezvous_uri,
                                         rank=rank, world_size=N)
            manager = MockManager(store)
            # ... run the trainer with this manager ...

    Then `launch_replicas` does::

        calls = [run_replica.spawn(rank=i, ...) for i in range(N)]
        return [ReplicaHandle(rank=i, backend_name="modal",
                              metadata={"call_id": calls[i].object_id})
                for i in range(N)]

    Pricing reference (2026-05-26): A100-40GB ≈ $1.95/hr, H100 ≈ $5.50/hr.
    Cold start ≈ 30s. Inter-job networking via cluster mode (opt-in,
    not used by default).

    Status: SKELETON. Real implementation pending v0 polish wave.
    """

    backend_name = "modal"
    supports_inter_replica_network = False  # default; cluster mode = True

    def __init__(self, *, app_name: str = "composer-replication-diloco") -> None:
        """Verify the modal client is importable, then refuse to instantiate.

        Args:
            app_name: Stored on the instance as ``app_name`` before the
                skeleton bails out; nothing in this class reads it yet.

        Raises:
            RuntimeError: If ``import modal`` fails. The original
                ``ImportError`` is attached as ``__cause__``.
            NotImplementedError: Always, once the import succeeds, because
                this class is still a skeleton.
        """
        try:
            # Imported lazily so that importing this module never requires
            # the modal client to be installed.
            import modal  # noqa: F401
        except ImportError as e:
            # Chain explicitly so the traceback shows the import failure as
            # the direct cause rather than as incidental context.
            raise RuntimeError(
                "ModalExecutor requires the modal client. Install with "
                "`pip install modal` and configure with `modal token new`. "
                "Got: " + repr(e)
            ) from e
        self.app_name = app_name
        # Real implementation: build a `modal.App` and register `run_replica`
        # here so that subsequent `launch_replicas` can `.spawn()` it.
        raise NotImplementedError(
            "ModalExecutor is a v0 skeleton; full implementation pending. "
            "Use LocalProcessExecutor for testing."
        )

    # __init__ always raises, so the class never instantiates successfully
    # in the skeleton and the methods below are unreachable through normal
    # construction. They are signature sketches kept for documentation; each
    # one raises NotImplementedError if called anyway.

    def launch_replicas(
        self,
        n_replicas: int,
        entrypoint: str | Callable[..., Any],
        entrypoint_args: Mapping[str, Any],
        *,
        gpu: str | None = "A100-40GB",
        timeout: int = 3600,
    ) -> list[ReplicaHandle]:
        """Signature sketch for starting replicas; not implemented.

        The class docstring outlines the intended pattern (one spawned
        Modal call per rank, one ``ReplicaHandle`` per call).

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def poll(self, handle: ReplicaHandle) -> str:
        """Signature sketch for polling one replica; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def stream_logs(self, handle: ReplicaHandle, *, n_lines: int = 200) -> str:
        """Signature sketch for fetching a replica's logs; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def cancel(self, handle: ReplicaHandle) -> None:
        """Signature sketch for cancelling a replica; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def collect(
        self,
        handles: list[ReplicaHandle],
        *,
        timeout: int | None = None,
    ) -> list[dict[str, Any]]:
        """Signature sketch for gathering replica results; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError
# Names exported by `from <this module> import *`.
__all__ = [
    "ModalExecutor",
]