Reinforcement Learning
Transformers
English
post-training
distillation
agentic-coding
composer-2.5
cursor
kimi-k2
grpo
dapo
diloco
openenv
trl
verl
research
methodology
Instructions to use Codeseys/composer-replication-framework with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Codeseys/composer-replication-framework with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Codeseys/composer-replication-framework", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """hint_generator.py — Template-based hint generator (v0.1 starter). | |
| Composer 2.5 inserts text hints at error-turn sites: | |
| "Reminder: Available tools are: …" (when a tool-call refs a non-existent tool) | |
| "Reminder: tool arguments must be valid JSON" (on JSONDecodeError) | |
| ... etc. | |
| This module provides a registry of hint templates keyed by error_kind. The | |
| data collator (in trl_path/data_collator.py) calls dispatch(error_kind, ctx) | |
| to get the hint text to splice into ctx_teacher. | |
| v0.2 will replace these templates with an LLM-driven hint generator (likely | |
| Sonnet 4.6 or Opus 4.7 via OpenRouter) for cases where templates are too rigid | |
| (style violations, wasteful explanations). | |
| """ | |
| from __future__ import annotations | |
| from collections.abc import Callable | |
| from typing import TypedDict | |
class HintContext(TypedDict, total=False):
    """Optional per-error fields that a hint template may consult.

    The TypedDict is declared with ``total=False``, so every key may be
    absent and templates must tolerate missing entries.
    """

    # Label of the failure, e.g. "tool_not_found", "json_decode", "type_error".
    error_kind: str
    # Raw error text as reported by the env.
    error_message: str
    # Names of the callable tools; read by the tool_not_found template.
    available_tools: list[str]
    # The failing tool, if known.
    tool_name: str
    # The failing tool's schema, if known.
    tool_schema: dict
    # The student's apparent intent, if extractable.
    intent: str
| # --------------------------------------------------------------------------- | |
| # Hint templates | |
| # --------------------------------------------------------------------------- | |
def hint_tool_not_found(ctx: HintContext) -> str:
    """Build the reminder for a call that named a non-existent tool.

    When ``ctx["available_tools"]`` is present and non-empty, every entry is
    listed in backticks; otherwise a generic reminder is returned.
    """
    available = ctx.get("available_tools", [])
    if not available:
        return "Reminder: the tool you tried to call does not exist. Use only available tools."
    quoted = [f"`{name}`" for name in available]
    return "Reminder: Available tools are: " + ", ".join(quoted) + ". Please use one of these."
def hint_json_decode(ctx: HintContext) -> str:
    """Return the fixed reminder for tool arguments that are not valid JSON.

    ``ctx`` is accepted so the signature matches the other templates; it is
    not read.
    """
    headline = "Reminder: tool arguments must be valid JSON. Common mistakes: "
    pitfalls = "single quotes (use double), trailing commas, unescaped newlines in strings."
    return headline + pitfalls
def hint_type_error(ctx: HintContext) -> str:
    """Build the reminder for arguments that do not match a tool's schema.

    The schema is quoted only when both ``tool_name`` and ``tool_schema`` are
    present and truthy; otherwise a generic reminder is returned.
    """
    tool = ctx.get("tool_name")
    expected = ctx.get("tool_schema")
    if not (tool and expected):
        return "Reminder: tool arguments do not match the expected types. Check the schema."
    lines = [
        f"Reminder: `{tool}` expects arguments matching this schema:",
        f" {expected}",
        "Re-issue the call with arguments matching the schema.",
    ]
    return "\n".join(lines)
def hint_runtime_error(ctx: HintContext) -> str:
    """Build the reminder for a tool call that raised at runtime.

    The text of ``ctx["error_message"]`` is embedded in the hint. If the key
    is missing, ``None``, or empty, the phrase "an exception" is used instead.
    """
    # Use `or` rather than a .get() default: a key that is present but holds
    # None or "" would otherwise render as "raised None." or "raised .".
    detail = ctx.get("error_message") or "an exception"
    return (
        f"Reminder: the previous tool call raised {detail}. "
        "Reconsider the inputs or read the relevant code first to understand state."
    )
def hint_repeated_failure(ctx: HintContext) -> str:
    """Return the fixed "step back" reminder.

    Intended for sites where the same kind of error has happened 3+ times in
    a row. ``ctx`` is accepted for signature parity and is not read.
    """
    sentences = (
        "Reminder: this approach has failed multiple times. ",
        "Step back and consider an alternative approach: read more files, ",
        "search for similar patterns elsewhere, or break the task down differently.",
    )
    return "".join(sentences)
| # --------------------------------------------------------------------------- | |
| # Registry | |
| # --------------------------------------------------------------------------- | |
# Maps each supported error_kind to the template function that renders its
# hint. `dispatch` looks kinds up here and `register` adds or replaces entries.
HINT_TEMPLATES: dict[str, Callable[[HintContext], str]] = dict(
    tool_not_found=hint_tool_not_found,
    json_decode=hint_json_decode,
    type_error=hint_type_error,
    runtime_error=hint_runtime_error,
    repeated_failure=hint_repeated_failure,
)
def dispatch(error_kind: str, ctx: HintContext | None = None) -> str | None:
    """Render the hint for ``error_kind`` using the template registry.

    Returns ``None`` when no template is registered for the kind. A missing
    or empty ``ctx`` is passed to the template as an empty dict.
    """
    template = HINT_TEMPLATES.get(error_kind)
    return None if template is None else template(ctx or {})
def register(error_kind: str, fn: Callable[[HintContext], str]) -> None:
    """Install ``fn`` as the template for ``error_kind``.

    Any template already registered under the same kind is replaced.
    """
    HINT_TEMPLATES.update({error_kind: fn})
| # =========================================================================== | |
| # Layered HintGenerator architecture (ADR-009) | |
| # =========================================================================== | |
| # | |
| # Composer 2.5 inserts a natural-language hint at each error turn; the | |
| # hint-conditioned forward becomes the SDPO teacher. HOW Cursor generates the | |
| # hint is unstated in every Cursor artifact (both blogs + the Composer 2 tech | |
| # report, arXiv:2603.24477 — confirmed absent in research/10). So this is our | |
| # design problem. The cited papers bracket the answer: OPSD conditions the | |
| # teacher on ground-truth; SDPO generalizes to environment feedback and the | |
| # "successful sibling rollout as implicit feedback" trick. | |
| # | |
| # We implement a layered generator, tried cheapest-first: | |
| # 1. TemplateHintGenerator — the registry above (free, deterministic; | |
| # covers tool-error classes). The first layer. | |
| # 2. RawErrorHintGenerator — wrap the raw env/tool error text as the hint | |
| # (free; covers any error with a message but unmatched by a template). | |
| # 3. LLMJudgeHintGenerator — an LLM produces a <=2-sentence corrective hint | |
| # (cost ~$0.0005/site; covers style/communication/effort sites templates | |
| # can't). Cached on disk; optional; OFF unless a client is provided. | |
| # 4. (sibling-bootstrap) — RL-rollout-path only; not a HintContext-driven | |
| # layer (needs sibling rollouts), exposed as a flag for the trainer to use. | |
| # | |
| # All layers satisfy the HintGenerator Protocol and compose via | |
| # CompositeHintGenerator, whose .as_collator_hook() returns a callable matching | |
| # the collator's existing `hint_generator: Callable[[str, dict], str | None]` | |
| # hook — ZERO collator change. | |
| from typing import Protocol, runtime_checkable | |
@runtime_checkable
class HintGenerator(Protocol):
    """Structural interface for one hint source (a "layer").

    An implementation returns hint text for an error context, or ``None`` to
    defer to the next layer.

    The protocol is runtime-checkable so ``isinstance(obj, HintGenerator)``
    works; that check only verifies that a ``generate`` attribute exists, not
    its signature.
    """

    def generate(self, error_kind: str, error_meta: dict) -> str | None:
        """Return a hint for the error site, or ``None`` to defer."""
        ...
class TemplateHintGenerator:
    """Layer 1: serve hints from the module-level template registry.

    Free and deterministic. Delegates to the module-level ``dispatch()`` so
    existing callers and tests see the same behavior.
    """

    def generate(self, error_kind: str, error_meta: dict) -> str | None:
        """Return the registered template's hint, or ``None`` if no template matches."""
        # error_meta already is the HintContext-shaped dict. Seed `error_kind`
        # first so a value supplied in error_meta wins over the argument,
        # and templates that read `error_kind` still find one.
        context: HintContext = {"error_kind": error_kind, **error_meta}  # type: ignore[typeddict-item]
        return dispatch(error_kind, context)
class RawErrorHintGenerator:
    """Layer 2: turn the raw env/tool error text into the hint itself.

    Serves any error site that carries a message but has no matching
    template. Free. This is the rawest form of SDPO's "environment feedback
    as the conditioning signal" (arXiv:2601.20802).
    """

    def __init__(self, max_chars: int = 500) -> None:
        """Store ``max_chars``, the number of message characters kept in the hint."""
        self.max_chars = max_chars

    def generate(self, error_kind: str, error_meta: dict) -> str | None:
        """Wrap the error text in a reminder, or return ``None`` if there is none.

        The message is taken from ``error_meta["error_message"]``, falling
        back to ``error_meta["error"]``; it is stringified, stripped and cut
        to ``max_chars``. ``error_kind`` is not consulted.
        """
        raw = error_meta.get("error_message") or error_meta.get("error") or ""
        text = str(raw).strip()
        if text == "":
            return None
        clipped = text[: self.max_chars]
        return "\n".join((
            "Reminder: the previous action produced this error:",
            clipped,
            "Reconsider and retry.",
        ))
| # --------------------------------------------------------------------------- | |
| # Error-kind routing (ADR-012 finding #2) | |
| # --------------------------------------------------------------------------- | |
| # | |
| # The default composite is template -> raw-error -> judge. The raw-error layer | |
| # fires for ANY kind carrying a message — including style/communication/effort | |
| # sites, which are EXACTLY what the LLM judge exists to cover. So we route: | |
| # tool/runtime error kinds may use the raw-error layer; style/communication/ | |
| # effort kinds skip it and fall through to the judge. | |
| # Error kinds that genuinely describe a tool/runtime failure whose raw text is a | |
| # useful, self-contained hint. The explicit registry-template kinds are included | |
| # so behavior is unchanged for them. | |
# Exact error kinds served by the registry templates; listed explicitly so
# they always count as tool/runtime kinds.
_TOOL_RUNTIME_KINDS: frozenset[str] = frozenset({
    "tool_not_found",
    "json_decode",
    "type_error",
    "runtime_error",
    "repeated_failure",
})

# Substrings that mark an unlisted kind as tool/runtime-ish, so generic
# "*_error" / "*_exception" sites still reach the raw-error layer.
_TOOL_RUNTIME_MARKERS: tuple[str, ...] = (
    "error", "exception", "fail", "decode", "timeout", "traceback",
    "exit_code", "nonzero", "syntax", "import", "assertion", "tool",
    "runtime", "crash", "exec",
)

# Substrings that mark a kind as a style/communication/effort site (the
# judge's domain). These are checked first and win over everything else.
_STYLE_KINDS_MARKERS: tuple[str, ...] = (
    "style", "communic", "verbose", "effort", "concise", "tone",
    "format", "wordy", "rambl", "explanation", "etiquette", "clarity",
)


def is_tool_runtime_kind(error_kind: str) -> bool:
    """Decide whether the raw-error layer may serve ``error_kind``.

    Matching is case-insensitive and substring-based. A kind containing any
    style marker returns False (it should reach the judge). Otherwise the
    kind returns True if it is an explicitly listed tool/runtime kind or
    contains any tool/runtime marker. A falsy ``error_kind`` returns False.
    """
    kind = (error_kind or "").lower()
    for marker in _STYLE_KINDS_MARKERS:
        if marker in kind:
            return False
    if kind in _TOOL_RUNTIME_KINDS:
        return True
    for marker in _TOOL_RUNTIME_MARKERS:
        if marker in kind:
            return True
    return False
class RoutingHintGenerator:
    """Gate an inner layer behind an error-kind predicate.

    Wraps the raw-error layer so it fires only for tool/runtime error kinds.
    For every other kind (style/communication/effort) it yields ``None`` so
    the composite falls through to the judge, the layer those sites were
    meant to reach (ADR-012 finding #2).
    """

    def __init__(self, inner: HintGenerator, route=is_tool_runtime_kind) -> None:
        """Store the wrapped layer and the ``route(error_kind) -> bool`` predicate."""
        self.inner = inner
        self.route = route

    def generate(self, error_kind: str, error_meta: dict) -> str | None:
        """Delegate to the inner layer when the predicate accepts the kind; else ``None``."""
        if self.route(error_kind):
            return self.inner.generate(error_kind, error_meta)
        return None
class LLMJudgeHintGenerator:
    """Layer 3: an LLM produces a short corrective hint.

    Covers style/communication/effort sites that templates can't. Optional and
    OFF unless a `complete` callable is provided. Results are cached in memory
    and, when `cache_dir` is set, on disk, keyed on a hash of the error
    context (so repeated identical sites cost nothing after the first).

    `complete(prompt: str) -> str` is an injected text-completion callable
    (e.g. an OpenRouter chat wrapper). Kept abstract so this module has no hard
    network dependency and is unit-testable with a stub.
    """

    PROMPT_TEMPLATE = (
        "An autonomous coding agent made a mistake at one step of a trajectory. "
        "Write a SHORT (<=2 sentences) corrective hint that, if the agent had "
        "seen it, would steer it to the right behavior for THIS step only. Do "
        "not solve the whole task; just correct the local mistake.\n\n"
        "Error kind: {error_kind}\n"
        "Error / context:\n{error_message}\n\n"
        "Corrective hint:"
    )

    # Bump when PROMPT_TEMPLATE or the underlying judge model changes so stale
    # cached hints are invalidated rather than silently reused.
    _CACHE_VERSION = 2

    # Hard cap on a generated hint, INCLUDING the trailing ellipsis. The judge
    # is asked for <=2 sentences but nothing enforces it; a runaway judge could
    # emit a full solution / prompt-leak / megabyte of text straight into the
    # SDPO teacher conditioning. Clamp defensively.
    _MAX_HINT_CHARS = 600

    def __init__(
        self,
        complete: Callable[[str], str] | None = None,
        *,
        cache_dir: str | None = None,
    ) -> None:
        """Configure the judge.

        Args:
            complete: prompt -> completion callable; ``None`` disables the layer.
            cache_dir: directory for the on-disk cache; ``None`` or empty keeps
                the cache in memory only.
        """
        self.complete = complete
        self._cache_dir = cache_dir
        self._mem_cache: dict[str, str] = {}

    def _cache_key(self, error_kind: str, error_meta: dict) -> str:
        """Return a 32-hex-char key derived from the kind, the meta and the cache version."""
        import hashlib
        import json
        import re

        # Strip volatile object reprs (e.g. "<Exception at 0x7f8b...>") so the
        # key is stable across runs/restarts: `default=str` on raw
        # Exception/context objects would otherwise embed a memory address in
        # the key and defeat cross-process cache hits. The key is versioned so
        # prompt/model changes invalidate stale hints.
        blob = json.dumps(
            {"v": self._CACHE_VERSION, "k": error_kind, "m": error_meta},
            sort_keys=True, default=str,
        )
        blob = re.sub(r"0x[0-9a-fA-F]+", "0xADDR", blob)
        blob = re.sub(r"\bat 0xADDR\b", "", blob)
        return hashlib.sha256(blob.encode("utf-8")).hexdigest()[:32]

    def _disk_get(self, key: str) -> str | None:
        """Return the cached hint for ``key`` from disk, or ``None`` on a miss.

        A missing file and an empty file are both treated as a miss.
        """
        if not self._cache_dir:
            return None
        from pathlib import Path

        path = Path(self._cache_dir) / f"{key}.txt"
        # EAFP: an exists()-then-read pair can race with other workers touching
        # the cache directory; reading and catching the miss has no such window.
        try:
            cached = path.read_text(encoding="utf-8")
        except FileNotFoundError:
            return None
        # generate() never stores an empty hint, so an empty file is corrupt;
        # report a miss instead of serving "" as a valid hint.
        return cached or None

    def _disk_put(self, key: str, value: str) -> None:
        """Write ``value`` to the on-disk cache under ``key`` (no-op without a cache dir)."""
        if not self._cache_dir:
            return
        import os
        from pathlib import Path

        d = Path(self._cache_dir)
        d.mkdir(parents=True, exist_ok=True)
        # Atomic write: concurrent DDP workers writing the same key would
        # otherwise interleave and corrupt the file. Each process writes its
        # own temp file, then renames it over the final name.
        tmp = d / f"{key}.txt.{os.getpid()}.tmp"
        tmp.write_text(value, encoding="utf-8")
        os.replace(tmp, d / f"{key}.txt")

    def generate(self, error_kind: str, error_meta: dict) -> str | None:
        """Return a judge-written hint, or ``None`` to defer.

        Returns ``None`` when the judge is disabled or produces no text.
        Otherwise the hint comes from the memory cache, then the disk cache,
        then a fresh ``complete`` call whose result is clamped to
        ``_MAX_HINT_CHARS`` and stored in both caches.
        """
        if self.complete is None:
            return None  # judge disabled — defer
        key = self._cache_key(error_kind, error_meta)
        if key in self._mem_cache:
            return self._mem_cache[key]
        cached = self._disk_get(key)
        if cached is not None:
            self._mem_cache[key] = cached
            return cached
        prompt = self.PROMPT_TEMPLATE.format(
            error_kind=error_kind,
            error_message=str(error_meta.get("error_message")
                              or error_meta.get("error") or "(no message)")[:1000],
        )
        # A client that returns None (e.g. a refused/empty completion) is
        # treated like an empty answer instead of crashing on .strip().
        hint = (self.complete(prompt) or "").strip()
        if not hint:
            return None
        # Clamp so a runaway judge can't inject a full solution or a megabyte
        # blob. Reserve one character for the ellipsis so the result never
        # exceeds the cap.
        limit = self._MAX_HINT_CHARS
        if len(hint) > limit:
            hint = hint[: max(limit - 1, 0)].rstrip() + "…"
        self._mem_cache[key] = hint
        self._disk_put(key, hint)
        return hint
class CompositeHintGenerator:
    """Chain of hint layers; the first layer that answers wins.

    Layers are consulted in list order, which is meant to be cost-ascending:
    templates (free) -> raw error (free) -> LLM judge (paid, optional). Later
    layers are not called once a hint is found, so the common tool-error case
    never reaches the LLM.
    """

    def __init__(self, layers: list[HintGenerator]) -> None:
        """Keep a reference to ``layers`` (the list is not copied)."""
        self.layers = layers

    def generate(self, error_kind: str, error_meta: dict) -> str | None:
        """Return the first non-``None`` hint, or ``None`` if every layer defers."""
        # Lazy generator: each layer is invoked only if all earlier ones
        # returned None.
        answers = (layer.generate(error_kind, error_meta) for layer in self.layers)
        return next((answer for answer in answers if answer is not None), None)

    def as_collator_hook(self) -> Callable[[str, dict], str | None]:
        """Expose ``generate`` as a plain ``(error_kind, error_meta) -> str | None`` callable.

        This matches CollatorConfig.hint_generator's signature, so the
        collator needs ZERO change.
        """
        return self.generate
def default_composite(
    *,
    llm_complete: Callable[[str], str] | None = None,
    cache_dir: str | None = None,
    enable_raw_error: bool = True,
) -> CompositeHintGenerator:
    """Assemble the recommended chain: templates -> raw-error -> judge.

    The template layer is always first. The raw-error layer is added when
    ``enable_raw_error`` is true, wrapped in a RoutingHintGenerator so it only
    fires for tool/runtime error kinds; style/communication/effort kinds skip
    it and fall through to the LLM judge (ADR-012 finding #2). The judge layer
    is added only when ``llm_complete`` is provided, and receives ``cache_dir``.
    """
    raw_layers: list[HintGenerator] = (
        [RoutingHintGenerator(RawErrorHintGenerator())] if enable_raw_error else []
    )
    judge_layers: list[HintGenerator] = (
        [LLMJudgeHintGenerator(llm_complete, cache_dir=cache_dir)]
        if llm_complete is not None
        else []
    )
    return CompositeHintGenerator([TemplateHintGenerator(), *raw_layers, *judge_layers])
| __all__ = [ | |
| "dispatch", | |
| "register", | |
| "HintContext", | |
| "HINT_TEMPLATES", | |
| # Layered architecture (ADR-009) | |
| "HintGenerator", | |
| "TemplateHintGenerator", | |
| "RawErrorHintGenerator", | |
| "RoutingHintGenerator", | |
| "is_tool_runtime_kind", | |
| "LLMJudgeHintGenerator", | |
| "CompositeHintGenerator", | |
| "default_composite", | |
| ] | |