Wave 13: serverless DiLoCo + replaysim normalization + 3 distillation losses + PRIME-RL + Monarch

b266c31 12 days ago

3.61 kB

	"""Modal executor — skeleton for v0.

	This file is a STUB. The full Modal integration requires the `modal`
	client library installed (`pip install modal`) and a configured Modal
	account (`~/.modal.toml`). The user's environment has both, but the
	test suite must run without them, so we keep this file import-safe.

	The real implementation is deferred to the v0 polish wave; the
	`ModalExecutor` class docstring below is the contract.
	"""
	from __future__ import annotations

	from typing import Any, Callable, Mapping

	from composer_replication.diloco.serverless.executor import (
	ReplicaHandle,
	ServerlessExecutor,
	)


	class ModalExecutor(ServerlessExecutor):
	    """Run replicas as Modal Functions in parallel.

	    Reference implementation pattern (per ADR-005)::

	        @app.function(gpu="A100-40GB", timeout=3600)
	        def run_replica(rank: int, rendezvous_uri: str, **kwargs):
	            os.environ["REPLICA_RANK"] = str(rank)
	            from composer_replication.diloco.serverless import (
	                MockManager, ObjectStoreAllReduce,
	            )
	            store = ObjectStoreAllReduce(rendezvous_uri,
	                                         rank=rank, world_size=N)
	            manager = MockManager(store)
	            # ... run the trainer with this manager ...

	    Then `launch_replicas` does::

	        calls = [run_replica.spawn(rank=i, ...) for i in range(N)]
	        return [ReplicaHandle(rank=i, backend_name="modal",
	                              metadata={"call_id": calls[i].object_id})
	                for i in range(N)]

	    Pricing reference (2026-05-26): A100-40GB ≈ $1.95/hr, H100 ≈ $5.50/hr.
	    Cold start ≈ 30s. Inter-job networking via cluster mode (opt-in,
	    not used by default).

	    Status: SKELETON. Real implementation pending v0 polish wave.
	    Constructing this class always raises (see `__init__`).
	    """

	    backend_name = "modal"
	    supports_inter_replica_network = False  # default; cluster mode = True

	    def __init__(self, *, app_name: str = "composer-replication-diloco") -> None:
	        """Check that the `modal` client is importable, then refuse to build.

	        Args:
	            app_name: Name stored on the instance as `self.app_name`;
	                presumably the future `modal.App` name (TODO confirm once
	                the real implementation lands).

	        Raises:
	            RuntimeError: If `modal` cannot be imported. The original
	                `ImportError` is attached as `__cause__`.
	            NotImplementedError: Always, when `modal` is importable,
	                because this class is still a skeleton.
	        """
	        # Import lazily so that merely importing this module never
	        # requires the modal client to be installed.
	        try:
	            import modal  # noqa: F401
	        except ImportError as e:
	            # Chain explicitly so the traceback shows the import failure
	            # as the direct cause rather than an incidental context.
	            raise RuntimeError(
	                "ModalExecutor requires the modal client. Install with "
	                "`pip install modal` and configure with `modal token new`. "
	                f"Got: {e!r}"
	            ) from e
	        self.app_name = app_name
	        # Real implementation: build a `modal.App` and register `run_replica`
	        # here so that subsequent `launch_replicas` can `.spawn()` it.
	        raise NotImplementedError(
	            "ModalExecutor is a v0 skeleton; full implementation pending. "
	            "Use LocalProcessExecutor for testing."
	        )

	    # `__init__` always raises in this skeleton, so no instance is ever
	    # created through normal construction and the methods below are
	    # effectively unreachable. Their signatures are sketched here for
	    # documentation only.

	    def launch_replicas(
	        self,
	        n_replicas: int,
	        entrypoint: str | Callable[..., Any],
	        entrypoint_args: Mapping[str, Any],
	        *,
	        gpu: str | None = "A100-40GB",
	        timeout: int = 3600,
	    ) -> list[ReplicaHandle]:
	        """Launch `n_replicas` replicas (not implemented in the skeleton).

	        The class docstring sketches the intended behaviour: spawn one
	        Modal call per rank and return one `ReplicaHandle` per replica.

	        Raises:
	            NotImplementedError: Always.
	        """
	        raise NotImplementedError

	    def poll(self, handle: ReplicaHandle) -> str:
	        """Return the status of one replica (not implemented).

	        Raises:
	            NotImplementedError: Always.
	        """
	        raise NotImplementedError

	    def stream_logs(self, handle: ReplicaHandle, *, n_lines: int = 200) -> str:
	        """Return log text for one replica (not implemented).

	        NOTE(review): `n_lines` presumably caps the number of log lines
	        returned — confirm against the `ServerlessExecutor` contract.

	        Raises:
	            NotImplementedError: Always.
	        """
	        raise NotImplementedError

	    def cancel(self, handle: ReplicaHandle) -> None:
	        """Cancel one replica (not implemented).

	        Raises:
	            NotImplementedError: Always.
	        """
	        raise NotImplementedError

	    def collect(
	        self,
	        handles: list[ReplicaHandle],
	        *,
	        timeout: int | None = None,
	    ) -> list[dict[str, Any]]:
	        """Gather results for the given replicas (not implemented).

	        NOTE(review): the units of `timeout` and the result-dict schema
	        are not defined here — confirm against `ServerlessExecutor`.

	        Raises:
	            NotImplementedError: Always.
	        """
	        raise NotImplementedError


	# Public API of this module: only the executor class is exported.
	__all__ = ["ModalExecutor"]