Reinforcement Learning
Transformers
English
post-training
distillation
agentic-coding
composer-2.5
cursor
kimi-k2
grpo
dapo
diloco
openenv
trl
verl
research
methodology
Instructions to use Codeseys/composer-replication-framework with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Codeseys/composer-replication-framework with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("Codeseys/composer-replication-framework", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
File size: 13,867 Bytes
c0a5ab7 54efac8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 | """Gradient-flow tests for compose_loss channels (Wave 16b).
Wave 14-15 verified compose_loss returns correct numeric values and that
channel disables behave correctly. This file closes the gap by verifying
that gradients actually flow back through each enabled channel and reach
model parameters when the channel is on, AND that disabled channels
produce zero side-effects on the autograd graph.
Coverage:
1. test_alpha_sdpo_routes_grad_to_params
— alpha_sdpo=1.0 + SDPO inputs => non-zero finite grads on params
2. test_beta_replay_routes_grad_to_params
— beta_replay=1.0 + DPO inputs => non-zero finite grads on params
3. test_alpha_zero_blocks_sdpo_grad
— alpha_sdpo=0.0: SDPO inputs present vs absent yields BIT-IDENTICAL
param.grad on every parameter (catches phantom-gradient leaks
from disabled channels)
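4. test_taid_grad_flows_through_sdpo_path
— sdpo_wrapper="taid", taid_t=0.5 still routes grads through
the SDPO channel under autograd
5. test_both_channels_enabled_route_grad_to_params
— alpha_sdpo=1.0 + beta_replay=1.0 together => non-zero finite
grads on params
6. test_entropy_opd_grad_flows_through_sdpo_path
— sdpo_wrapper="entropy_opd" still routes grads through the
SDPO channel under autograd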
Same TinyLM scaffold as test_compose_loss_integration.py — no HF / TRL,
all tests run in milliseconds.
"""
from __future__ import annotations
import math
import torch
import torch.nn as nn
from composer_replication import compose_loss
# ----------------------------------------------------------------------
# Tiny LM stand-in (mirrors test_compose_loss_integration.py)
# ----------------------------------------------------------------------
class TinyLM(nn.Module):
    """Toy language model exposing the HF-style ``model(input_ids=...).logits`` call.

    Pipeline: embedding -> linear -> tanh -> linear head over the vocabulary.
    Construction seeds torch's global RNG with ``seed`` before any layer is
    created, so two instances built with the same arguments start from
    identical weights.
    """

    def __init__(self, vocab: int = 32, hidden: int = 16, seed: int = 0):
        super().__init__()
        # Seed first; the layer creation order below is part of the
        # deterministic init and must not be shuffled.
        torch.manual_seed(seed)
        self.embed = nn.Embedding(vocab, hidden)
        self.fc = nn.Linear(hidden, hidden)
        self.head = nn.Linear(hidden, vocab)

    def forward(self, input_ids: torch.Tensor):
        class _Out:
            """Bare attribute holder standing in for an HF model output."""

        embedded = self.embed(input_ids)
        hidden_state = torch.tanh(self.fc(embedded))
        result = _Out()
        result.logits = self.head(hidden_state)
        return result
# ----------------------------------------------------------------------
# Batch fixtures (mirror test_compose_loss_integration.py shape)
# ----------------------------------------------------------------------
VOCAB = 32  # vocabulary size: exclusive upper bound for sampled token ids, and TinyLM's vocab
B = 2  # leading (batch) dimension of the fixture tensors
T = 8  # second (sequence) dimension of the (B, T) id and mask tensors
def _make_inputs(seed: int = 7, *, with_sdpo: bool, with_dpo: bool) -> dict:
    """Assemble a reproducible batch, optionally carrying SDPO and/or DPO fields.

    ``input_ids`` and ``response_mask`` are always present. The SDPO and DPO
    field groups are added independently of each other, which lets tests
    drive the channel-disable paths. All random draws come from one private
    generator seeded with ``seed``, in a fixed order, so equal arguments
    yield equal tensors.
    """
    rng = torch.Generator().manual_seed(seed)

    def random_ids() -> torch.Tensor:
        # Token ids drawn uniformly from [0, VOCAB).
        return torch.randint(0, VOCAB, (B, T), generator=rng)

    def second_half_mask() -> torch.Tensor:
        # Fresh (B, T) mask: zeros, then ones from position T // 2 onwards.
        mask = torch.zeros(B, T, dtype=torch.long)
        mask[:, T // 2:] = 1
        return mask

    batch: dict[str, torch.Tensor] = {
        "input_ids": random_ids(),
        "response_mask": second_half_mask(),
    }

    if with_sdpo:
        batch.update(
            {
                "ctx_teacher_input_ids": random_ids(),
                "sdpo_loss_mask": second_half_mask(),
            }
        )

    if with_dpo:
        # Dict literals evaluate in order, so the generator is consumed as:
        # chosen ids, rejected ids, chosen ref logprobs, rejected ref logprobs.
        batch.update(
            {
                "dpo_chosen_input_ids": random_ids(),
                "dpo_chosen_response_mask": torch.ones(B, T, dtype=torch.long),
                "dpo_rejected_input_ids": random_ids(),
                "dpo_rejected_response_mask": torch.ones(B, T, dtype=torch.long),
                "dpo_chosen_ref_logprobs": torch.randn(B, generator=rng),
                "dpo_rejected_ref_logprobs": torch.randn(B, generator=rng),
            }
        )

    return batch
def _grad_norm(model: nn.Module) -> float:
"""Sum of |grad| across all params with non-None grad."""
return sum(
p.grad.detach().abs().sum().item()
for p in model.parameters()
if p.grad is not None
)
def _grad_is_finite(model: nn.Module) -> bool:
"""All param grads are finite (no inf, no nan)."""
for p in model.parameters():
if p.grad is None:
continue
if not torch.isfinite(p.grad).all():
return False
return True
def _model() -> TinyLM:
    """Build a TinyLM sized for the fixtures, always constructed with seed 0."""
    settings = {"vocab": VOCAB, "hidden": 16, "seed": 0}
    return TinyLM(**settings)
# ----------------------------------------------------------------------
# Test 1 — SDPO channel routes grads to params when alpha_sdpo > 0
# ----------------------------------------------------------------------
def test_alpha_sdpo_routes_grad_to_params():
    """With alpha_sdpo > 0 and SDPO inputs present, backward must push
    non-zero finite gradients onto model parameters, and those gradients
    must be attributable to the SDPO channel.

    The fixture leaves ``response_mask`` non-empty, so the LM-CE term can
    produce gradients on its own (test_beta_replay_routes_grad_to_params
    zeroes the mask precisely to silence it). A bare "grad sum > 0" check
    would therefore pass even if the SDPO term were detached from the
    graph. To pin the channel itself, the gradients are also compared
    against an ``alpha_sdpo=0.0`` baseline on the same model init and
    inputs: enabling SDPO must change at least one parameter gradient.
    """
    model = _model()
    inputs = _make_inputs(with_sdpo=True, with_dpo=False)
    out = compose_loss(
        model,
        inputs,
        alpha_sdpo=1.0,
        beta_replay=0.0,
    )
    # Sanity: SDPO actually fired (channel is non-zero).
    assert float(out.sdpo_jsd) != 0.0, (
        "alpha_sdpo=1.0 with SDPO inputs should produce a non-zero sdpo_jsd; "
        f"got {float(out.sdpo_jsd)}"
    )
    out.total.backward()
    g = _grad_norm(model)
    assert g > 0.0, f"Expected non-zero grad sum from SDPO channel; got {g}"
    assert math.isfinite(g), f"Grad sum is not finite: {g}"
    assert _grad_is_finite(model), "Some grads are inf/nan"

    # Baseline: same init (seed 0) and same inputs, SDPO switched off.
    baseline = _model()
    baseline_out = compose_loss(
        baseline,
        _make_inputs(with_sdpo=True, with_dpo=False),
        alpha_sdpo=0.0,
        beta_replay=0.0,
    )
    baseline_out.total.backward()
    # d(total)/d(theta) is linear in alpha_sdpo, so any difference from the
    # baseline is exactly the SDPO channel's gradient contribution.
    sdpo_changed_grads = any(
        (p.grad is None) != (q.grad is None)
        or (p.grad is not None and not torch.equal(p.grad, q.grad))
        for p, q in zip(model.parameters(), baseline.parameters())
    )
    assert sdpo_changed_grads, (
        "alpha_sdpo=1.0 produced the same parameter gradients as "
        "alpha_sdpo=0.0; the SDPO channel is not reaching the parameters"
    )
# ----------------------------------------------------------------------
# Test 2 — Replay-DPO channel routes grads to params when beta_replay > 0
# ----------------------------------------------------------------------
def test_beta_replay_routes_grad_to_params():
    """Replay-DPO on (beta_replay > 0, DPO inputs supplied): backward has to
    leave non-zero, finite gradients on the model parameters.

    The response_mask is replaced with all zeros, which makes the LM-CE
    channel exactly zero, so whatever gradient shows up can only have come
    through the DPO channel.
    """
    policy = _model()
    batch = _make_inputs(with_sdpo=False, with_dpo=True)
    # Empty response_mask => LM-CE is silent and DPO is the only live channel.
    batch["response_mask"] = torch.zeros(B, T, dtype=torch.long)

    result = compose_loss(policy, batch, alpha_sdpo=0.0, beta_replay=1.0)

    lm_ce_value = float(result.lm_ce)
    assert lm_ce_value == 0.0, "LM-CE should be zero with empty response_mask"
    dpo_value = float(result.trace_replay_dpo)
    assert dpo_value != 0.0, (
        "beta_replay=1.0 with DPO inputs should produce a non-zero "
        f"trace_replay_dpo; got {float(result.trace_replay_dpo)}"
    )

    result.total.backward()
    grad_sum = _grad_norm(policy)
    assert grad_sum > 0.0, (
        f"Expected non-zero grad sum from DPO channel; got {grad_sum}"
    )
    assert math.isfinite(grad_sum), f"Grad sum is not finite: {grad_sum}"
    assert _grad_is_finite(policy), "Some grads are inf/nan"
# ----------------------------------------------------------------------
# Test 3 — Disabled SDPO channel produces ZERO side-effects on autograd
# ----------------------------------------------------------------------
def test_alpha_zero_blocks_sdpo_grad():
    """alpha_sdpo=0.0 must make the SDPO inputs irrelevant to the gradients:
    running with them and without them has to give bit-identical
    ``param.grad`` on every parameter.

    The failure mode guarded against is a disabled channel that still
    leaks a phantom term into the autograd graph (for instance, a branch
    that runs its forward pass regardless of alpha and then mis-scales
    the result).
    """

    def snapshot_grads(batch):
        # Fresh model per trial; _model() always uses seed 0, so both
        # trials start from the same weights.
        net = _model()
        result = compose_loss(net, batch, alpha_sdpo=0.0, beta_replay=0.0)
        result.total.backward()
        snapshot = {}
        for pname, param in net.named_parameters():
            if param.grad is None:
                snapshot[pname] = None
            else:
                snapshot[pname] = param.grad.detach().clone()
        return snapshot

    batch_present = _make_inputs(with_sdpo=True, with_dpo=False)
    batch_absent = _make_inputs(with_sdpo=False, with_dpo=False)

    # Trial A carries SDPO inputs, trial B does not; alpha is 0 in both.
    grads_present = snapshot_grads(batch_present)
    grads_absent = snapshot_grads(batch_absent)

    assert grads_present.keys() == grads_absent.keys()
    for pname, g_present in grads_present.items():
        g_absent = grads_absent[pname]
        if g_present is None and g_absent is None:
            continue
        assert g_present is not None and g_absent is not None, (
            f"Param {pname}: grad_a={g_present is not None}, "
            f"grad_b={g_absent is not None}"
        )
        # torch.equal demands exact element-wise equality, so even a
        # one-ULP perturbation from the disabled channel fails here.
        assert torch.equal(g_present, g_absent), (
            f"Param {pname}: disabled SDPO channel leaked phantom gradient. "
            f"|diff|.max()={float((g_present - g_absent).abs().max())}"
        )
# ----------------------------------------------------------------------
# Test 4 — TAID-wrapped SDPO channel still routes grads under autograd
# ----------------------------------------------------------------------
def test_taid_grad_flows_through_sdpo_path():
    """The Wave 15 TAID rewrite (logit-space mix, current-student-detached
    anchor) must remain differentiable. With sdpo_wrapper='taid' and
    taid_t=0.5, backward must produce non-zero finite gradients on
    model parameters, and the TAID-wrapped channel itself must contribute.

    The fixture leaves ``response_mask`` non-empty, so the LM-CE term can
    produce gradients on its own; a bare "grad sum > 0" check would pass
    even if the TAID path were accidentally detached. The gradients are
    therefore also compared against an ``alpha_sdpo=0.0`` baseline on the
    same model init and inputs.
    """
    model = _model()
    inputs = _make_inputs(with_sdpo=True, with_dpo=False)
    out = compose_loss(
        model,
        inputs,
        alpha_sdpo=1.0,
        beta_replay=0.0,
        sdpo_wrapper="taid",
        taid_t=0.5,
    )
    assert float(out.sdpo_jsd) != 0.0, (
        "taid_t=0.5 should still produce a non-zero sdpo_jsd; "
        f"got {float(out.sdpo_jsd)}"
    )
    out.total.backward()
    g = _grad_norm(model)
    assert g > 0.0, (
        f"Expected non-zero grad sum from TAID-wrapped SDPO channel; got {g}"
    )
    assert math.isfinite(g), f"Grad sum is not finite: {g}"
    assert _grad_is_finite(model), "Some grads are inf/nan"

    # Baseline: same init (seed 0) and same inputs, SDPO switched off.
    baseline = _model()
    baseline_out = compose_loss(
        baseline,
        _make_inputs(with_sdpo=True, with_dpo=False),
        alpha_sdpo=0.0,
        beta_replay=0.0,
    )
    baseline_out.total.backward()
    # Any difference from the baseline is the TAID-wrapped channel's own
    # gradient contribution.
    taid_changed_grads = any(
        (p.grad is None) != (q.grad is None)
        or (p.grad is not None and not torch.equal(p.grad, q.grad))
        for p, q in zip(model.parameters(), baseline.parameters())
    )
    assert taid_changed_grads, (
        "sdpo_wrapper='taid' with alpha_sdpo=1.0 produced the same parameter "
        "gradients as alpha_sdpo=0.0; the TAID path is not differentiable "
        "end-to-end"
    )
# ----------------------------------------------------------------------
# Test 5 — Both channels enabled simultaneously route grads correctly
# (Wave 18 — closes the implicit-additivity gap from Wave 16's coverage)
# ----------------------------------------------------------------------
def test_both_channels_enabled_route_grad_to_params():
    """With alpha_sdpo > 0 AND beta_replay > 0 simultaneously, both channels
    must contribute to the parameter gradients.

    The per-channel tests earlier in this file cover each channel alone.
    This test pins the combined case at the per-tensor level, where
    autograd's linearity gives an exact statement:

        d(a*L1 + b*L2)/d(theta) = a*dL1/d(theta) + b*dL2/d(theta)

    So the both-enabled gradient equals the SDPO-only gradient iff the DPO
    term contributes nothing, and vice versa. The test asserts that the
    both-enabled gradients differ from each single-channel run on at least
    one parameter.

    No lower bound relative to the single-channel gradient sums is
    asserted: the channels can cancel on shared parameters, and the L1 sum
    of a vector sum need not be >= either summand's L1 sum.
    """
    inputs = _make_inputs(with_sdpo=True, with_dpo=True)

    def backward_with(alpha, beta):
        # Fresh model each call; _model() always uses seed 0, so every
        # configuration starts from the same weights.
        m = _model()
        out = compose_loss(m, inputs, alpha_sdpo=alpha, beta_replay=beta)
        out.total.backward()
        return m

    def grads_differ(model_x, model_y):
        # True when at least one parameter's gradient is not bit-identical.
        return any(
            (p.grad is None) != (q.grad is None)
            or (p.grad is not None and not torch.equal(p.grad, q.grad))
            for p, q in zip(model_x.parameters(), model_y.parameters())
        )

    sdpo_only = backward_with(alpha=1.0, beta=0.0)
    dpo_only = backward_with(alpha=0.0, beta=1.0)
    both = backward_with(alpha=1.0, beta=1.0)

    g_sdpo_only = _grad_norm(sdpo_only)
    g_dpo_only = _grad_norm(dpo_only)
    g_both = _grad_norm(both)
    # Single-channel sums are reported in failure messages as debugging
    # context only; no assertion compares their magnitudes.
    context = (
        f"grad sums: sdpo_only={g_sdpo_only}, dpo_only={g_dpo_only}, "
        f"both={g_both}"
    )

    assert g_both > 0.0, f"Both-channels grad sum is zero: {g_both}"
    assert math.isfinite(g_both), f"Both-channels grad sum is not finite: {g_both}"
    assert _grad_is_finite(both), "Some grads are inf/nan"

    assert grads_differ(both, sdpo_only), (
        "Enabling beta_replay on top of SDPO left every parameter gradient "
        f"unchanged; the DPO channel is not contributing ({context})"
    )
    assert grads_differ(both, dpo_only), (
        "Enabling alpha_sdpo on top of replay-DPO left every parameter "
        f"gradient unchanged; the SDPO channel is not contributing ({context})"
    )
# ----------------------------------------------------------------------
# Test 6 — entropy_opd wrapper routes grads through SDPO path
# (Wave 18 — Wave 15 added entropy_aware_opd_loss without an autograd test)
# ----------------------------------------------------------------------
def test_entropy_opd_grad_flows_through_sdpo_path():
    """sdpo_wrapper='entropy_opd' must remain differentiable.

    Wave 15 plumbed entropy_aware_opd_loss through compose_loss's
    sdpo_wrapper switch. Wave 16 tested the 'taid' wrapper under autograd
    but didn't exercise 'entropy_opd'. This test pins that the entropy_opd
    path is differentiable end-to-end.

    The fixture leaves ``response_mask`` non-empty, so the LM-CE term can
    produce gradients on its own; a bare "grad sum > 0" check would pass
    even if the entropy_opd path were detached. The gradients are therefore
    also compared against an ``alpha_sdpo=0.0`` baseline on the same model
    init and inputs.
    """
    model = _model()
    inputs = _make_inputs(with_sdpo=True, with_dpo=False)
    out = compose_loss(
        model,
        inputs,
        alpha_sdpo=1.0,
        beta_replay=0.0,
        sdpo_wrapper="entropy_opd",
    )
    assert math.isfinite(float(out.sdpo_jsd)), (
        f"entropy_opd produced non-finite sdpo_jsd: {float(out.sdpo_jsd)}"
    )
    out.total.backward()
    g = _grad_norm(model)
    assert g > 0.0, (
        f"Expected non-zero grad sum from entropy_opd-wrapped SDPO; got {g}"
    )
    assert math.isfinite(g), f"Grad sum is not finite: {g}"
    assert _grad_is_finite(model), "Some grads are inf/nan"

    # Baseline: same init (seed 0) and same inputs, SDPO switched off.
    baseline = _model()
    baseline_out = compose_loss(
        baseline,
        _make_inputs(with_sdpo=True, with_dpo=False),
        alpha_sdpo=0.0,
        beta_replay=0.0,
    )
    baseline_out.total.backward()
    # Any difference from the baseline is the entropy_opd-wrapped channel's
    # own gradient contribution.
    opd_changed_grads = any(
        (p.grad is None) != (q.grad is None)
        or (p.grad is not None and not torch.equal(p.grad, q.grad))
        for p, q in zip(model.parameters(), baseline.parameters())
    )
    assert opd_changed_grads, (
        "sdpo_wrapper='entropy_opd' with alpha_sdpo=1.0 produced the same "
        "parameter gradients as alpha_sdpo=0.0; the entropy_opd path is not "
        "differentiable end-to-end"
    )
|