Reinforcement Learning
Transformers
English
post-training
distillation
agentic-coding
composer-2.5
cursor
kimi-k2
grpo
dapo
diloco
openenv
trl
verl
research
methodology
Instructions to use Codeseys/composer-replication-framework with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Codeseys/composer-replication-framework with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("Codeseys/composer-replication-framework", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
| """Gradient-flow tests for compose_loss channels (Wave 16b). | |
| Wave 14-15 verified compose_loss returns correct numeric values and that | |
| channel disables behave correctly. This file closes the gap by verifying | |
| that gradients actually flow back through each enabled channel and reach | |
| model parameters when the channel is on, AND that disabled channels | |
| produce zero side-effects on the autograd graph. | |
| Coverage: | |
| 1. test_alpha_sdpo_routes_grad_to_params | |
| — alpha_sdpo=1.0 + SDPO inputs => non-zero finite grads on params | |
| 2. test_beta_replay_routes_grad_to_params | |
| — beta_replay=1.0 + DPO inputs => non-zero finite grads on params | |
| 3. test_alpha_zero_blocks_sdpo_grad | |
| — alpha_sdpo=0.0: SDPO inputs present vs absent yields BIT-IDENTICAL | |
| param.grad on every parameter (catches phantom-gradient leaks | |
| from disabled channels) | |
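| 4. test_taid_grad_flows_through_sdpo_path | |
| — sdpo_wrapper="taid", taid_t=0.5 still routes grads through | |
| the SDPO channel under autograd | |
| 5. test_both_channels_enabled_route_grad_to_params (Wave 18) | |
| — alpha_sdpo=1.0 + beta_replay=1.0 together => non-zero finite | |
| grad sum on params | |
| 6. test_entropy_opd_grad_flows_through_sdpo_path (Wave 18) | |
| — sdpo_wrapper="entropy_opd" still routes grads through | |
| the SDPO channel under autograd | |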
| Same TinyLM scaffold as test_compose_loss_integration.py — no HF / TRL, | |
| all tests run in milliseconds. | |
| """ | |
| from __future__ import annotations | |
| import math | |
| import torch | |
| import torch.nn as nn | |
| from composer_replication import compose_loss | |
| # ---------------------------------------------------------------------- | |
| # Tiny LM stand-in (mirrors test_compose_loss_integration.py) | |
| # ---------------------------------------------------------------------- | |
class TinyLM(nn.Module):
    """Toy language model exposing the HF-style ``model(input_ids=...).logits`` API.

    The network is an embedding, one tanh-activated linear layer, and a
    linear head back to vocabulary size. The constructor seeds the global
    torch RNG before creating any layer, so two instances built with the
    same ``seed`` start from the same weights.
    """

    class _Output:
        """Bare attribute holder standing in for an HF model output."""

    def __init__(self, vocab: int = 32, hidden: int = 16, seed: int = 0):
        super().__init__()
        # Seed first: layer initialisation below draws from the global RNG.
        torch.manual_seed(seed)
        self.embed = nn.Embedding(vocab, hidden)
        self.fc = nn.Linear(hidden, hidden)
        self.head = nn.Linear(hidden, vocab)

    def forward(self, input_ids: torch.Tensor):
        """Return an object whose ``logits`` attribute holds the head output."""
        embedded = self.embed(input_ids)
        hidden_state = torch.tanh(self.fc(embedded))
        result = self._Output()
        result.logits = self.head(hidden_state)
        return result
| # ---------------------------------------------------------------------- | |
| # Batch fixtures (mirror test_compose_loss_integration.py shape) | |
| # ---------------------------------------------------------------------- | |
# Shared dimensions for every fixture in this file.
# VOCAB is the exclusive upper bound for sampled token ids and the vocab size
# handed to TinyLM; fixture tensors are shaped (B, T) — presumably batch size
# and sequence length.
VOCAB = 32
B = 2
T = 8
def _make_inputs(seed: int = 7, *, with_sdpo: bool, with_dpo: bool) -> dict:
    """Assemble a reproducible batch, optionally carrying SDPO and/or DPO tensors.

    Every random tensor is drawn, in a fixed order, from a private generator
    seeded with ``seed``, so equal arguments give equal batches.
    ``with_sdpo`` and ``with_dpo`` are independent switches, which lets the
    tests exercise the channel-disable code paths cleanly.
    """
    rng = torch.Generator().manual_seed(seed)

    def random_ids() -> torch.Tensor:
        # (B, T) token ids in [0, VOCAB), consuming the shared generator.
        return torch.randint(0, VOCAB, (B, T), generator=rng)

    def second_half_mask() -> torch.Tensor:
        # Zeros everywhere except the last T - T // 2 positions of each row.
        mask = torch.zeros(B, T, dtype=torch.long)
        mask[:, T // 2:] = 1
        return mask

    batch: dict[str, torch.Tensor] = {
        "input_ids": random_ids(),
        "response_mask": second_half_mask(),
    }

    if with_sdpo:
        batch["ctx_teacher_input_ids"] = random_ids()
        batch["sdpo_loss_mask"] = second_half_mask()

    if with_dpo:
        batch["dpo_chosen_input_ids"] = random_ids()
        batch["dpo_chosen_response_mask"] = torch.ones(B, T, dtype=torch.long)
        batch["dpo_rejected_input_ids"] = random_ids()
        batch["dpo_rejected_response_mask"] = torch.ones(B, T, dtype=torch.long)
        batch["dpo_chosen_ref_logprobs"] = torch.randn(B, generator=rng)
        batch["dpo_rejected_ref_logprobs"] = torch.randn(B, generator=rng)

    return batch
| def _grad_norm(model: nn.Module) -> float: | |
| """Sum of |grad| across all params with non-None grad.""" | |
| return sum( | |
| p.grad.detach().abs().sum().item() | |
| for p in model.parameters() | |
| if p.grad is not None | |
| ) | |
| def _grad_is_finite(model: nn.Module) -> bool: | |
| """All param grads are finite (no inf, no nan).""" | |
| for p in model.parameters(): | |
| if p.grad is None: | |
| continue | |
| if not torch.isfinite(p.grad).all(): | |
| return False | |
| return True | |
def _model() -> TinyLM:
    """Build a new TinyLM; the fixed seed gives every call the same initial weights."""
    return TinyLM(hidden=16, seed=0, vocab=VOCAB)
| # ---------------------------------------------------------------------- | |
| # Test 1 — SDPO channel routes grads to params when alpha_sdpo > 0 | |
| # ---------------------------------------------------------------------- | |
def test_alpha_sdpo_routes_grad_to_params():
    """The SDPO channel must route gradient to parameters when alpha_sdpo > 0.

    With alpha_sdpo=1.0 and SDPO inputs present, ``out.total.backward()``
    must leave non-zero, finite gradients on the model parameters.

    The batch keeps its default non-empty ``response_mask``, so the LM-CE
    term can produce gradient on its own; a positive gradient sum is
    therefore necessary but not sufficient. The SDPO contribution is pinned
    by re-running with alpha_sdpo=0.0 from identical weights and inputs and
    requiring that the parameter gradients differ.
    """
    model = _model()
    inputs = _make_inputs(with_sdpo=True, with_dpo=False)
    out = compose_loss(
        model,
        inputs,
        alpha_sdpo=1.0,
        beta_replay=0.0,
    )
    # Sanity: SDPO actually fired (channel is non-zero).
    assert float(out.sdpo_jsd) != 0.0, (
        "alpha_sdpo=1.0 with SDPO inputs should produce a non-zero sdpo_jsd; "
        f"got {float(out.sdpo_jsd)}"
    )
    out.total.backward()
    g = _grad_norm(model)
    assert g > 0.0, f"Expected non-zero grad sum from SDPO channel; got {g}"
    assert math.isfinite(g), f"Grad sum is not finite: {g}"
    assert _grad_is_finite(model), "Some grads are inf/nan"

    # Baseline: same seed (identical weights), rebuilt identical inputs,
    # channel switched off. Any difference in gradients is the SDPO share.
    baseline = _model()
    baseline_out = compose_loss(
        baseline,
        _make_inputs(with_sdpo=True, with_dpo=False),
        alpha_sdpo=0.0,
        beta_replay=0.0,
    )
    baseline_out.total.backward()
    sdpo_changed_grads = any(
        (p.grad is None) != (q.grad is None)
        or (p.grad is not None and not torch.equal(p.grad, q.grad))
        for p, q in zip(model.parameters(), baseline.parameters())
    )
    assert sdpo_changed_grads, (
        "alpha_sdpo=1.0 produced the same parameter gradients as "
        "alpha_sdpo=0.0; the SDPO channel is not routing gradient"
    )
| # ---------------------------------------------------------------------- | |
| # Test 2 — Replay-DPO channel routes grads to params when beta_replay > 0 | |
| # ---------------------------------------------------------------------- | |
def test_beta_replay_routes_grad_to_params():
    """The replay-DPO channel must route gradient when beta_replay > 0.

    ``response_mask`` is replaced with all zeros so that LM-CE is exactly
    zero; any non-zero gradient then has to come from the DPO channel.
    """
    net = _model()
    batch = _make_inputs(with_sdpo=False, with_dpo=True)
    # Silence LM-CE so DPO is the only possible source of gradient.
    batch["response_mask"] = torch.zeros(B, T, dtype=torch.long)

    result = compose_loss(net, batch, alpha_sdpo=0.0, beta_replay=1.0)

    assert float(result.lm_ce) == 0.0, "LM-CE should be zero with empty response_mask"
    dpo_value = float(result.trace_replay_dpo)
    assert dpo_value != 0.0, (
        "beta_replay=1.0 with DPO inputs should produce a non-zero "
        f"trace_replay_dpo; got {dpo_value}"
    )

    result.total.backward()
    grad_sum = _grad_norm(net)
    assert grad_sum > 0.0, (
        f"Expected non-zero grad sum from DPO channel; got {grad_sum}"
    )
    assert math.isfinite(grad_sum), f"Grad sum is not finite: {grad_sum}"
    assert _grad_is_finite(net), "Some grads are inf/nan"
| # ---------------------------------------------------------------------- | |
| # Test 3 — Disabled SDPO channel produces ZERO side-effects on autograd | |
| # ---------------------------------------------------------------------- | |
def test_alpha_zero_blocks_sdpo_grad():
    """A disabled SDPO channel must leave parameter gradients untouched.

    With alpha_sdpo=0.0, a batch that carries SDPO inputs and one that omits
    them must yield bit-identical gradients on every parameter. This guards
    against a disabled channel leaking a phantom contribution into the
    autograd graph (e.g. the SDPO branch still running a forward pass and
    being scaled by alpha=0 incorrectly).
    """

    def grads_with_channels_off(batch: dict) -> dict:
        # A fresh model per trial: the shared seed gives bit-identical init.
        net = _model()
        loss = compose_loss(net, batch, alpha_sdpo=0.0, beta_replay=0.0)
        loss.total.backward()
        return {
            name: None if param.grad is None else param.grad.detach().clone()
            for name, param in net.named_parameters()
        }

    inputs_with_sdpo = _make_inputs(with_sdpo=True, with_dpo=False)
    inputs_no_sdpo = _make_inputs(with_sdpo=False, with_dpo=False)

    # Trial A carries SDPO inputs, trial B does not; alpha is 0 in both.
    grads_a = grads_with_channels_off(inputs_with_sdpo)
    grads_b = grads_with_channels_off(inputs_no_sdpo)

    assert grads_a.keys() == grads_b.keys()
    for name, ga in grads_a.items():
        gb = grads_b[name]
        if ga is None and gb is None:
            continue
        assert ga is not None and gb is not None, (
            f"Param {name}: grad_a={ga is not None}, grad_b={gb is not None}"
        )
        # torch.equal demands exact element-wise equality: SDPO inputs with
        # alpha=0 must not perturb the gradients by even one ULP.
        assert torch.equal(ga, gb), (
            f"Param {name}: disabled SDPO channel leaked phantom gradient. "
            f"|diff|.max()={float((ga - gb).abs().max())}"
        )
| # ---------------------------------------------------------------------- | |
| # Test 4 — TAID-wrapped SDPO channel still routes grads under autograd | |
| # ---------------------------------------------------------------------- | |
def test_taid_grad_flows_through_sdpo_path():
    """The TAID-wrapped SDPO channel must remain differentiable.

    The Wave 15 TAID rewrite (logit-space mix, current-student-detached
    anchor) must still route gradient: with sdpo_wrapper='taid' and
    taid_t=0.5, backward must produce non-zero finite gradients on model
    parameters.

    The default ``response_mask`` is non-empty, so LM-CE can produce
    gradient by itself; the TAID/SDPO contribution is therefore pinned by
    comparing against an alpha_sdpo=0.0 run from identical weights and
    inputs and requiring that the parameter gradients differ.
    """
    model = _model()
    inputs = _make_inputs(with_sdpo=True, with_dpo=False)
    out = compose_loss(
        model,
        inputs,
        alpha_sdpo=1.0,
        beta_replay=0.0,
        sdpo_wrapper="taid",
        taid_t=0.5,
    )
    assert float(out.sdpo_jsd) != 0.0, (
        "taid_t=0.5 should still produce a non-zero sdpo_jsd; "
        f"got {float(out.sdpo_jsd)}"
    )
    out.total.backward()
    g = _grad_norm(model)
    assert g > 0.0, (
        f"Expected non-zero grad sum from TAID-wrapped SDPO channel; got {g}"
    )
    assert math.isfinite(g), f"Grad sum is not finite: {g}"
    assert _grad_is_finite(model), "Some grads are inf/nan"

    # Baseline with the SDPO channel off (same seed, rebuilt identical
    # inputs). Any gradient difference is the TAID-wrapped SDPO share.
    baseline = _model()
    baseline_out = compose_loss(
        baseline,
        _make_inputs(with_sdpo=True, with_dpo=False),
        alpha_sdpo=0.0,
        beta_replay=0.0,
    )
    baseline_out.total.backward()
    taid_changed_grads = any(
        (p.grad is None) != (q.grad is None)
        or (p.grad is not None and not torch.equal(p.grad, q.grad))
        for p, q in zip(model.parameters(), baseline.parameters())
    )
    assert taid_changed_grads, (
        "sdpo_wrapper='taid' with alpha_sdpo=1.0 produced the same parameter "
        "gradients as alpha_sdpo=0.0; the TAID path is not routing gradient"
    )
| # ---------------------------------------------------------------------- | |
| # Test 5 — Both channels enabled simultaneously route grads correctly | |
| # (Wave 18 — closes the implicit-additivity gap from Wave 16's coverage) | |
| # ---------------------------------------------------------------------- | |
def test_both_channels_enabled_route_grad_to_params():
    """With alpha_sdpo > 0 AND beta_replay > 0, both channels must contribute.

    The single-channel tests earlier in this file cover each channel in
    isolation. A non-zero gradient sum alone cannot show that both channels
    contribute when enabled together: the default batch has a non-empty
    ``response_mask``, so LM-CE can produce gradient even if both optional
    channels were dead. This test therefore compares the both-channels
    gradients with each single-channel run, all from identical weights and
    identical inputs:

    * both vs SDPO-only must differ => the DPO channel adds gradient
    * both vs DPO-only must differ  => the SDPO channel adds gradient

    No bound on gradient magnitude is asserted. Autograd is additive per
    tensor (d(a*L1 + b*L2)/dθ = a*dL1/dθ + b*dL2/dθ), but the two channels'
    gradients can cancel on shared parameters, so the L1 norm of the sum
    need not exceed either summand's L1 norm.
    """

    def backward_with(alpha: float, beta: float):
        # seed=0 model and a deterministically rebuilt batch on every call.
        net = _model()
        batch = _make_inputs(with_sdpo=True, with_dpo=True)
        out = compose_loss(net, batch, alpha_sdpo=alpha, beta_replay=beta)
        out.total.backward()
        return net

    def grads_differ(left, right) -> bool:
        return any(
            (p.grad is None) != (q.grad is None)
            or (p.grad is not None and not torch.equal(p.grad, q.grad))
            for p, q in zip(left.parameters(), right.parameters())
        )

    sdpo_only = backward_with(alpha=1.0, beta=0.0)
    dpo_only = backward_with(alpha=0.0, beta=1.0)
    both = backward_with(alpha=1.0, beta=1.0)

    g_sdpo_only = _grad_norm(sdpo_only)
    g_dpo_only = _grad_norm(dpo_only)
    g_both = _grad_norm(both)

    assert g_both > 0.0, f"Both-channels grad sum is zero: {g_both}"
    assert math.isfinite(g_both), f"Both-channels grad sum is not finite: {g_both}"
    assert _grad_is_finite(both), "Some grads are inf/nan"

    # The single-channel norms are reported purely as debugging context.
    assert grads_differ(both, sdpo_only), (
        "Enabling beta_replay on top of alpha_sdpo left every parameter "
        "gradient unchanged; the DPO channel is not contributing "
        f"(grad sums: sdpo_only={g_sdpo_only}, both={g_both})"
    )
    assert grads_differ(both, dpo_only), (
        "Enabling alpha_sdpo on top of beta_replay left every parameter "
        "gradient unchanged; the SDPO channel is not contributing "
        f"(grad sums: dpo_only={g_dpo_only}, both={g_both})"
    )
| # ---------------------------------------------------------------------- | |
| # Test 6 — entropy_opd wrapper routes grads through SDPO path | |
| # (Wave 18 — Wave 15 added entropy_aware_opd_loss without an autograd test) | |
| # ---------------------------------------------------------------------- | |
def test_entropy_opd_grad_flows_through_sdpo_path():
    """sdpo_wrapper='entropy_opd' must remain differentiable end-to-end.

    Wave 15 plumbed entropy_aware_opd_loss through compose_loss's
    sdpo_wrapper switch. Wave 16 tested the 'taid' wrapper under autograd
    but did not exercise 'entropy_opd'; this test covers that path.

    The default ``response_mask`` is non-empty, so LM-CE can produce
    gradient by itself; the wrapper's contribution is therefore pinned by
    comparing against an alpha_sdpo=0.0 run from identical weights and
    inputs and requiring that the parameter gradients differ.
    """
    model = _model()
    inputs = _make_inputs(with_sdpo=True, with_dpo=False)
    out = compose_loss(
        model,
        inputs,
        alpha_sdpo=1.0,
        beta_replay=0.0,
        sdpo_wrapper="entropy_opd",
    )
    assert math.isfinite(float(out.sdpo_jsd)), (
        f"entropy_opd produced non-finite sdpo_jsd: {float(out.sdpo_jsd)}"
    )
    out.total.backward()
    g = _grad_norm(model)
    assert g > 0.0, (
        f"Expected non-zero grad sum from entropy_opd-wrapped SDPO; got {g}"
    )
    assert math.isfinite(g), f"Grad sum is not finite: {g}"
    assert _grad_is_finite(model), "Some grads are inf/nan"

    # Baseline with the SDPO channel off (same seed, rebuilt identical
    # inputs). Any gradient difference is the entropy_opd-wrapped SDPO share.
    baseline = _model()
    baseline_out = compose_loss(
        baseline,
        _make_inputs(with_sdpo=True, with_dpo=False),
        alpha_sdpo=0.0,
        beta_replay=0.0,
    )
    baseline_out.total.backward()
    opd_changed_grads = any(
        (p.grad is None) != (q.grad is None)
        or (p.grad is not None and not torch.equal(p.grad, q.grad))
        for p, q in zip(model.parameters(), baseline.parameters())
    )
    assert opd_changed_grads, (
        "sdpo_wrapper='entropy_opd' with alpha_sdpo=1.0 produced the same "
        "parameter gradients as alpha_sdpo=0.0; the entropy_opd path is not "
        "routing gradient"
    )