""" RefereeEnvironment — the central multi-agent orchestrator for CounterFeint. Owns a turn-based state machine with three roles: - Fraudster proposes / modifies ads (actions: propose_ad, modify_pending_ad, end_turn, commit_final) - Investigator reviews ads (actions: investigate, verdict, link_accounts) - Auditor audits the trace post-hoc (actions: flag_investigator, flag_fraudster, submit_audit_report) All three WebSocket endpoints (`/ws/fraudster`, `/ws/investigator`, `/ws/auditor`) share a single `RefereeEnvironment` instance per match, so state mutations from one role are immediately visible to the others. State machine: fraudster_turn ─end_turn──────► investigator_turn ─turn_cap/all_decided──► fraudster_turn (next round) │ │ ├─commit_final───► audit_phase ◄┘ │ │ └─action_cap──► investigator_turn │ max_rounds / budget / commit_final └──────── audit_phase → done ◄───────────────── Phase 1 keeps the Auditor a no-op scaffold (flags accepted, report accepted, but graders don't consume them yet). Phase 2A/B/C plug in real audit logic. """ from __future__ import annotations import logging import random import time from typing import Any, Dict, List, Literal, Optional, Tuple from uuid import uuid4 from openenv.core.env_server.interfaces import Environment from openenv.core.env_server.types import Action, Observation try: from ..data.ad_generator import ( TASK_CONFIGS, Ad, GeneratedEpisode, generate_episode, ) from ..data.episode_loader import extend_episode_with_proposal from ..data.tool_registry import INVESTIGATION_TARGETS, InvestigationToolRegistry from ..graders.auditor_track_a import ( investigator_audit_score as track_a_score, run_track_a, ) from ..graders.base_grader import ( EpisodeRecord, LinkResult, VerdictResult, grade_episode, ) from ..graders.multi_agent_rewards import ( RewardInputs, compute_episode_rewards, ) from ..graders.plausibility_score import compute_queue_plausibility from ..models import ( AdFraudState, AdReviewAction, AdReviewObservation, AuditFlag, AuditorAction, AuditorObservation, AuditReport, FraudsterAction, FraudsterObservation, RefereeState, ) from .environment import InvestigatorEnvironment from .evidence_ledger import build_evidence_ledger except ImportError: from data.ad_generator import ( TASK_CONFIGS, Ad, GeneratedEpisode, generate_episode, ) from data.episode_loader import extend_episode_with_proposal from data.tool_registry import INVESTIGATION_TARGETS, InvestigationToolRegistry from graders.auditor_track_a import ( investigator_audit_score as track_a_score, run_track_a, ) from graders.base_grader import ( EpisodeRecord, LinkResult, VerdictResult, grade_episode, ) from graders.multi_agent_rewards import ( RewardInputs, compute_episode_rewards, ) from graders.plausibility_score import compute_queue_plausibility from models import ( AdFraudState, AdReviewAction, AdReviewObservation, AuditFlag, AuditorAction, AuditorObservation, AuditReport, FraudsterAction, FraudsterObservation, RefereeState, ) from server.environment import InvestigatorEnvironment from server.evidence_ledger import build_evidence_ledger logger = logging.getLogger(__name__) Phase = Literal["fraudster_turn", "investigator_turn", "audit_phase", "done"] Role = Literal["fraudster", "investigator", "auditor"] # Module-level grader result for parity with the Investigator env (/grader endpoint). _last_grader_result: Dict[str, Any] = {} def get_last_grader_result() -> Dict[str, Any]: return dict(_last_grader_result) # Default categories the Fraudster can declare. Combines plausible legit # categories (so a sophisticated Fraudster can camouflage) with fraud # templates (so it can propose obvious-fraud or borderline ads). DEFAULT_ALLOWED_CATEGORIES: Tuple[str, ...] = ( # Legit camouflage categories "ecommerce", "saas", "local_service", "education", "fitness", # Fraud / borderline templates "fake_giveaway", "counterfeit_goods", "miracle_cure", "advance_fee", "fake_crypto", "celebrity_endorsement_fraud", "clone_brand", "gray_area_supplements", "network_crypto", "network_ecommerce", "network_fintech", "network_health", ) class RefereeEnvironment(Environment[Action, Observation, RefereeState]): """ Multi-agent referee. Implements the OpenEnv `Environment` contract with a generic `Action`/`Observation` typing — each WebSocket route passes role-specific subclasses into `step()` via the `role` kwarg. Role-aware entry points (preferred): - `reset_match(seed, task_id, episode_id, **knobs)` - `step_as_fraudster(action)` - `step_as_investigator(action)` - `step_as_auditor(action)` - `build__observation()` """ SUPPORTS_CONCURRENT_SESSIONS = True # Default knobs (overridable via reset kwargs). DEFAULT_MAX_ROUNDS = 4 DEFAULT_MAX_PROPOSALS = 5 # Per-turn action caps. Bumped from (3, 6) to (4, 10) so the # Investigator can comfortably investigate 2-3 ads per turn AND issue # verdicts in the same turn without being force-cut to the auditor # mid-thought (the previous (6) cap was triggering the # ``max_rounds`` short-circuit on the final round before the # Investigator could close out pending verdicts). DEFAULT_MAX_FRAUDSTER_ACTIONS_PER_TURN = 4 DEFAULT_MAX_INVESTIGATOR_ACTIONS_PER_TURN = 10 # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ def __init__(self) -> None: super().__init__() self._match_id: str = str(uuid4()) self._task_id: str = "task_1" self._rng = random.Random() self._investigator = InvestigatorEnvironment() self._episode: Optional[GeneratedEpisode] = None self._registry: Optional[InvestigationToolRegistry] = None self._phase: Phase = "fraudster_turn" self._round_number: int = 0 self._max_rounds: int = self.DEFAULT_MAX_ROUNDS self._max_proposals: int = self.DEFAULT_MAX_PROPOSALS self._max_fraudster_actions_per_turn: int = ( self.DEFAULT_MAX_FRAUDSTER_ACTIONS_PER_TURN ) self._max_investigator_actions_per_turn: int = ( self.DEFAULT_MAX_INVESTIGATOR_ACTIONS_PER_TURN ) self._allowed_categories: List[str] = list(DEFAULT_ALLOWED_CATEGORIES) self._proposals_used: int = 0 self._actions_this_turn: int = 0 # Per-role logs (consumed by the Auditor). self._fraudster_log: List[Dict[str, Any]] = [] self._investigator_log: List[Dict[str, Any]] = [] self._audit_flags: List[AuditFlag] = [] self._audit_report: Optional[AuditReport] = None self._fraudster_committed: bool = False self._done: bool = False self._end_reason: Optional[str] = None self._fraudster_reward_total: float = 0.0 self._investigator_reward_total: float = 0.0 self._auditor_reward_total: float = 0.0 self._grader_score: Optional[float] = None self._per_ad_plausibility: Dict[str, float] = {} self._audit_ground_truth: Dict[str, int] = {} self._last_feedback: Dict[Role, str] = { "fraudster": "", "investigator": "", "auditor": "", } # Proposal slot_index -> ad_id map, so the Fraudster can modify its # own prior proposals without knowing the Referee's ad_id scheme. self._proposal_slot_to_ad_id: Dict[int, str] = {} # Set inside ``_fraudster_propose_ad`` on success, consumed (and # cleared) by ``_serialize_fraudster_action`` so the audit log entry # for a propose_ad always carries the resolved ``ad_id`` and slot # the env actually allocated, not just the LLM's raw payload (which # has no ad_id field for propose_ad). self._last_proposed_ad_id: Optional[str] = None self._last_proposed_slot: Optional[int] = None # ------------------------------------------------------------------ # OpenEnv surface (generic) # ------------------------------------------------------------------ def reset( self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any, ) -> Observation: """ Generic reset. Returns the *Fraudster* observation because the Fraudster always goes first. The role-specific endpoints can also call `build__observation()` directly. """ self.reset_match(seed=seed, episode_id=episode_id, **kwargs) return self.build_fraudster_observation() def step( self, action: Action, timeout_s: Optional[float] = None, **kwargs: Any, ) -> Observation: """ Role-aware generic step. Expects `role` in kwargs, dispatches to the appropriate role-specific step method, and returns that role's observation. """ role: Optional[Role] = kwargs.get("role") if role == "fraudster": return self.step_as_fraudster(action) # type: ignore[arg-type] if role == "investigator": return self.step_as_investigator(action) # type: ignore[arg-type] if role == "auditor": return self.step_as_auditor(action) # type: ignore[arg-type] raise ValueError( "RefereeEnvironment.step(action, role=...) requires a role of " "'fraudster', 'investigator', or 'auditor'." ) @property def state(self) -> RefereeState: inv_state = self._investigator.state return RefereeState( episode_id=self._match_id, step_count=( len(self._fraudster_log) + len(self._investigator_log) + len(self._audit_flags) ), task_id=self._task_id, phase=self._phase, round_number=self._round_number, max_rounds=self._max_rounds, proposals_used=self._proposals_used, max_proposals=self._max_proposals, actions_this_turn=self._actions_this_turn, max_actions_per_turn=( self._max_fraudster_actions_per_turn if self._phase == "fraudster_turn" else self._max_investigator_actions_per_turn ), investigator_state=inv_state.model_dump() if inv_state else {}, fraudster_proposals=list(self._fraudster_log), investigator_action_log=list(self._investigator_log), fraudster_committed=self._fraudster_committed, audit_report=( self._audit_report.model_dump() if self._audit_report else None ), fraudster_reward=self._fraudster_reward_total, investigator_reward=self._investigator_reward_total, auditor_reward=self._auditor_reward_total, grader_score=self._grader_score, end_reason=self._end_reason, ) # ------------------------------------------------------------------ # Match setup # ------------------------------------------------------------------ def reset_match( self, *, seed: Optional[int] = None, episode_id: Optional[str] = None, task_id: Optional[str] = None, max_rounds: Optional[int] = None, max_proposals: Optional[int] = None, max_fraudster_actions_per_turn: Optional[int] = None, max_investigator_actions_per_turn: Optional[int] = None, allowed_categories: Optional[List[str]] = None, episode: Optional[GeneratedEpisode] = None, ) -> None: """Initialize a fresh match. Sets phase to `fraudster_turn`, round 1.""" self._match_id = episode_id or str(uuid4()) self._task_id = task_id or "task_1" if self._task_id not in TASK_CONFIGS: self._task_id = "task_1" effective_seed = ( seed if seed is not None else hash(uuid4()) & 0xFFFFFFFF ) self._rng = random.Random(effective_seed) # Resolve each knob with precedence: explicit kwarg > TaskConfig curriculum > class default. task_cfg = TASK_CONFIGS[self._task_id] def _resolve(arg_value: Optional[int], cfg_attr: str, default: int) -> int: if arg_value is not None: return arg_value cfg_val = getattr(task_cfg, cfg_attr, None) return cfg_val if cfg_val is not None else default self._max_rounds = _resolve(max_rounds, "max_rounds", self.DEFAULT_MAX_ROUNDS) self._max_proposals = _resolve( max_proposals, "max_proposals", self.DEFAULT_MAX_PROPOSALS ) self._max_fraudster_actions_per_turn = _resolve( max_fraudster_actions_per_turn, "max_fraudster_actions_per_turn", self.DEFAULT_MAX_FRAUDSTER_ACTIONS_PER_TURN, ) self._max_investigator_actions_per_turn = _resolve( max_investigator_actions_per_turn, "max_investigator_actions_per_turn", self.DEFAULT_MAX_INVESTIGATOR_ACTIONS_PER_TURN, ) cfg_categories = getattr(task_cfg, "allowed_fraud_categories", None) if allowed_categories is not None: self._allowed_categories = list(allowed_categories) elif cfg_categories: self._allowed_categories = list(cfg_categories) else: self._allowed_categories = list(DEFAULT_ALLOWED_CATEGORIES) if episode is not None: self._episode = episode else: self._episode = generate_episode(effective_seed, self._task_id) self._registry = InvestigationToolRegistry.from_episode(self._episode) self._investigator.reset( seed=effective_seed, episode_id=self._match_id, task_id=self._task_id, episode=self._episode, registry=self._registry, queue_may_grow=True, ) self._phase = "fraudster_turn" self._round_number = 1 self._proposals_used = 0 self._actions_this_turn = 0 self._fraudster_log = [] self._investigator_log = [] self._audit_flags = [] self._audit_report = None self._fraudster_committed = False self._done = False self._end_reason = None self._fraudster_reward_total = 0.0 self._investigator_reward_total = 0.0 self._auditor_reward_total = 0.0 self._grader_score = None self._per_ad_plausibility = {} self._audit_ground_truth = {} self._proposal_slot_to_ad_id = {} self._last_proposed_ad_id = None self._last_proposed_slot = None self._last_feedback = { "fraudster": ( f"Match started. Round 1 of {self._max_rounds}. " f"You may propose up to {self._max_proposals} ads total, " f"{self._max_fraudster_actions_per_turn} actions per turn." ), "investigator": ( "Waiting for Fraudster to finish their turn. The ad queue may " "grow during this episode as the Fraudster proposes new ads." ), "auditor": "Match in progress. Waiting for audit phase.", } # ------------------------------------------------------------------ # Fraudster step handler # ------------------------------------------------------------------ def step_as_fraudster(self, action: FraudsterAction) -> FraudsterObservation: self._guard_phase("fraudster_turn", role="fraudster") assert self._episode is not None and self._registry is not None reward = 0.0 feedback_parts: List[str] = [] action_type = action.action_type if action_type == "propose_ad": reward, msg = self._fraudster_propose_ad(action) feedback_parts.append(msg) self._actions_this_turn += 1 elif action_type == "modify_pending_ad": reward, msg = self._fraudster_modify_pending_ad(action) feedback_parts.append(msg) self._actions_this_turn += 1 elif action_type == "end_turn": feedback_parts.append("Fraudster ended turn. Control passes to Investigator.") self._transition(to="investigator_turn", note="fraudster end_turn") reward = 0.0 elif action_type == "commit_final": feedback_parts.append( "Fraudster committed (no more proposals). Jumping to audit phase." ) self._fraudster_committed = True self._end_reason = "commit_final" self._transition(to="audit_phase", note="fraudster commit_final") reward = 0.0 else: feedback_parts.append(f"Unknown Fraudster action_type '{action_type}'.") reward = -0.05 self._fraudster_reward_total += reward self._last_feedback["fraudster"] = " ".join(feedback_parts).strip() self._fraudster_log.append(self._serialize_fraudster_action(action, reward)) # Auto-transition guards. if ( self._phase == "fraudster_turn" and action_type in ("propose_ad", "modify_pending_ad") and self._actions_this_turn >= self._max_fraudster_actions_per_turn ): self._transition(to="investigator_turn", note="fraudster action cap") if ( self._phase == "fraudster_turn" and action_type == "propose_ad" and self._proposals_used >= self._max_proposals ): self._last_feedback["fraudster"] += ( " Proposal budget exhausted — control will pass to Investigator." ) self._transition(to="investigator_turn", note="proposal budget exhausted") return self.build_fraudster_observation(reward=reward) def _fraudster_propose_ad(self, action: FraudsterAction) -> Tuple[float, str]: if self._proposals_used >= self._max_proposals: return -0.05, ( f"Proposal budget exhausted ({self._proposals_used}/{self._max_proposals})." ) if not action.ad_copy or not action.ad_copy.strip(): return -0.05, "propose_ad requires non-empty `ad_copy`." if not action.category: return -0.05, "propose_ad requires `category`." if action.category not in self._allowed_categories: return -0.05, ( f"category '{action.category}' not in allowed_categories. " f"Use one of: {', '.join(self._allowed_categories)}." ) assert self._episode is not None and self._registry is not None proposal_seed = self._rng.randint(0, 2**31 - 1) ad = extend_episode_with_proposal( episode=self._episode, registry=self._registry, seed=proposal_seed, ad_copy=action.ad_copy, category=action.category, landing_page_blurb=action.landing_page_blurb, targeting_summary=action.targeting_summary, ) slot_index = self._proposals_used self._proposal_slot_to_ad_id[slot_index] = ad.ad_id self._proposals_used += 1 # Stash so ``_serialize_fraudster_action`` can attach the resolved # ``ad_id`` + ``slot_index`` to this propose_ad's audit log entry # (the FraudsterAction itself doesn't carry these — they're env- # allocated). Without this the auditor sees ``ad_id=None`` for # every propose_ad, which then poisons downstream Track B checks # (e.g. ``intrinsic_consistency_check`` cannot key flags onto an # ad and ``cross_ad_consistency_audit`` cannot dedupe by ad_id). self._last_proposed_ad_id = ad.ad_id self._last_proposed_slot = slot_index self._investigator.notify_queue_grew(ad.ad_id) feedback = ( f"Proposal #{slot_index + 1} accepted: ad_id={ad.ad_id}, category={ad.category}. " f"Queue is now {len(self._episode.ads)} ads." ) return 0.02, feedback def _fraudster_modify_pending_ad(self, action: FraudsterAction) -> Tuple[float, str]: if action.slot_index is None: return -0.05, "modify_pending_ad requires `slot_index`." slot = action.slot_index if slot not in self._proposal_slot_to_ad_id: return -0.05, f"Unknown slot_index {slot}. Propose an ad first." ad_id = self._proposal_slot_to_ad_id[slot] assert self._episode is not None and self._registry is not None # Locked once the Investigator has already rendered a verdict. already_decided = self._investigator.verdicts.get(ad_id, {}).get("verdict") if already_decided: return ( -0.05, f"Cannot modify {ad_id}: Investigator already rendered verdict " f"'{already_decided}'.", ) target_ad: Optional[Ad] = None for a in self._episode.ads: if a.ad_id == ad_id: target_ad = a break if target_ad is None: return -0.05, f"Internal error: ad {ad_id} not in episode." changes: List[str] = [] if action.new_ad_copy is not None and action.new_ad_copy.strip(): target_ad.ad_copy = action.new_ad_copy.strip()[:2000] changes.append("ad_copy") if action.new_landing_page_blurb is not None and action.new_landing_page_blurb.strip(): lp = self._episode.landing_pages.get(ad_id) if lp is not None: from dataclasses import replace new_lp = replace( lp, content_summary=action.new_landing_page_blurb.strip()[:2000] ) self._episode.landing_pages[ad_id] = new_lp updated_text = new_lp.to_investigation_text() self._episode.investigation_data.setdefault(ad_id, {})["landing_page"] = updated_text self._registry.update_ad(ad_id, {"landing_page": updated_text}) changes.append("landing_page") if not changes: return -0.02, "modify_pending_ad had nothing to change." return 0.01, f"Modified {ad_id} fields: {', '.join(changes)}." # ------------------------------------------------------------------ # Investigator step handler # ------------------------------------------------------------------ def step_as_investigator(self, action: AdReviewAction) -> AdReviewObservation: self._guard_phase("investigator_turn", role="investigator") assert self._episode is not None obs = self._investigator.step(action) reward = float(obs.reward or 0.0) self._investigator_reward_total += reward self._investigator_log.append(self._serialize_investigator_action(action, obs)) self._actions_this_turn += 1 self._last_feedback["investigator"] = obs.feedback or "" # Episode termination paths: # 1. Fraudster already committed AND all ads decided -> audit_phase. # 2. Max rounds reached AND no more proposals allowed -> audit_phase. # 3. Investigator budget exhausted (obs.done) -> audit_phase. # 4. Action cap for this turn hit -> fraudster_turn (next round, unless commit_final). all_decided = self._all_ads_decided() inv_done = bool(obs.done) if inv_done: self._end_reason = self._end_reason or "investigator_done" self._transition(to="audit_phase", note="investigator env signalled done") obs.done = False # match isn't over until Auditor submits return obs if all_decided and ( self._fraudster_committed or self._round_number >= self._max_rounds or self._proposals_used >= self._max_proposals ): self._end_reason = self._end_reason or "all_decided" self._transition(to="audit_phase", note="all ads decided") obs.done = False return obs if self._actions_this_turn >= self._max_investigator_actions_per_turn: if self._round_number >= self._max_rounds or self._fraudster_committed: self._end_reason = self._end_reason or "max_rounds" self._transition(to="audit_phase", note="max rounds reached") obs.done = False else: self._round_number += 1 self._transition(to="fraudster_turn", note="investigator action cap") # One-line warning when the next investigator turn will be # the LAST one — gives a slow-to-verdict policy a clear # signal that pending ads will get auto-approved otherwise. if self._round_number == self._max_rounds: self._last_feedback["investigator"] = ( "Final round next: pending ads not given an explicit " "verdict will auto-approve at audit time." ) obs.done = self._phase == "done" return obs def _all_ads_decided(self) -> bool: if self._episode is None: return False verdicts = self._investigator.verdicts return all(a.ad_id in verdicts for a in self._episode.ads) # ------------------------------------------------------------------ # Auditor step handler # ------------------------------------------------------------------ def step_as_auditor(self, action: AuditorAction) -> AuditorObservation: self._guard_phase("audit_phase", role="auditor") feedback = "" if action.action_type == "flag_investigator": flag = AuditFlag( track="A", target_ad_id=action.target_ad_id, flag_type=action.flag_type or "unspecified", severity=action.severity if action.severity is not None else 0.5, note=action.note or "", ) self._audit_flags.append(flag) feedback = ( f"Track A flag recorded: {flag.flag_type} (severity={flag.severity:.2f})." ) elif action.action_type == "flag_fraudster": flag = AuditFlag( track="B", target_ad_id=action.target_ad_id, flag_type=action.flag_type or "unspecified", severity=action.severity if action.severity is not None else 0.5, note=action.note or "", ) self._audit_flags.append(flag) feedback = ( f"Track B flag recorded: {flag.flag_type} (severity={flag.severity:.2f})." ) elif action.action_type == "submit_audit_report": report_payload = action.audit_report or {} track_a_flags = [f for f in self._audit_flags if f.track == "A"] track_b_flags = [f for f in self._audit_flags if f.track == "B"] # Track A/B score *defaults* come from the real graders running # over the episode record — so even a dumb Auditor that submits an # empty report gets a principled score. Caller-supplied values # override these (used by tests and LLM Auditors that compute # their own). default_a, default_b = self._compute_default_track_scores() investigator_score = float( report_payload.get("investigator_audit_score", default_a) ) fraudster_score = float( report_payload.get("fraudster_plausibility_score", default_b) ) investigator_score = min(1.0, max(0.0, investigator_score)) fraudster_score = min(1.0, max(0.0, fraudster_score)) self._audit_report = AuditReport( track_a_flags=track_a_flags, track_b_flags=track_b_flags, investigator_audit_score=investigator_score, fraudster_plausibility_score=fraudster_score, notes=str(report_payload.get("notes", "") or action.note or "")[:4000], ) feedback = ( "Audit report submitted. " f"Track A flags: {len(track_a_flags)}. " f"Track B flags: {len(track_b_flags)}. " f"investigator_audit_score={investigator_score:.2f}, " f"fraudster_plausibility_score={fraudster_score:.2f}." ) self._finalize_audit() else: feedback = f"Unknown Auditor action_type '{action.action_type}'." self._last_feedback["auditor"] = feedback return self.build_auditor_observation(feedback=feedback) def _finalize_audit(self) -> None: """ Compute grader score and per-role rewards using the multi-agent reward model (graders/multi_agent_rewards.py), close out the match, and transition to `done`. """ if self._episode is None: return record = self._build_episode_record() self._grader_score = grade_episode(record) audit_report = self._audit_report or AuditReport( track_a_flags=[], track_b_flags=[], investigator_audit_score=1.0, fraudster_plausibility_score=1.0, notes="", ) reward_inputs = RewardInputs( record=record, audit_report=audit_report, fraudster_proposal_log=list(self._fraudster_log), investigator_action_log=list(self._investigator_log), investigation_data_seen=( self._registry.to_dict() if self._registry else {} ), fraudster_ad_ids=list(self._proposal_slot_to_ad_id.values()), ) rewards = compute_episode_rewards(reward_inputs) self._fraudster_reward_total = float(rewards["fraudster"]) self._investigator_reward_total = float(rewards["investigator"]) self._auditor_reward_total = float(rewards["auditor"]) self._per_ad_plausibility = dict(rewards.get("per_ad_plausibility") or {}) self._audit_ground_truth = dict(rewards.get("audit_ground_truth") or {}) global _last_grader_result _last_grader_result = { "match_id": self._match_id, "task_id": self._task_id, "grader_score": self._grader_score, "phase": "done", "total_steps": ( len(self._fraudster_log) + len(self._investigator_log) + len(self._audit_flags) ), "fraudster_reward": self._fraudster_reward_total, "investigator_reward": self._investigator_reward_total, "auditor_reward": self._auditor_reward_total, "per_ad_plausibility": self._per_ad_plausibility, "audit_ground_truth": self._audit_ground_truth, "proposals_used": self._proposals_used, "end_reason": self._end_reason, "audit_report": ( self._audit_report.model_dump() if self._audit_report else None ), } self._transition(to="done", note="audit report submitted") self._done = True def _compute_default_track_scores(self) -> Tuple[float, float]: """ Derive default investigator_audit_score and fraudster_plausibility_score from the Track A and Track B graders. Used when the Auditor submits an empty report payload. """ if self._episode is None: return 1.0, 1.0 record = self._build_episode_record() investigation_data_seen = ( self._registry.to_dict() if self._registry else {} ) track_a_flags = run_track_a( record, investigator_actions=list(self._investigator_log), investigation_data_seen=investigation_data_seen, ) investigator_score = track_a_score(track_a_flags) _per_ad, _flags, queue_plaus = compute_queue_plausibility( self._fraudster_log ) # If the Fraudster never proposed anything, plausibility doesn't # apply — treat as 1.0 (no evidence the Fraudster was unrealistic). return investigator_score, queue_plaus if _per_ad else 1.0 def _build_episode_record(self) -> EpisodeRecord: """Assemble an EpisodeRecord from Investigator's view, mirroring R1.""" assert self._episode is not None verdicts = self._investigator.verdicts links = self._investigator.links inv_state: AdFraudState = self._investigator.state verdict_results = [] for ad in self._episode.ads: v = verdicts.get(ad.ad_id) if v: verdict_results.append( VerdictResult( ad_id=ad.ad_id, verdict=v["verdict"], confidence=v.get("confidence", 0.5), ground_truth=v["ground_truth"], auto_approved=v.get("auto_approved", False), ) ) link_results = [ LinkResult(ad_id_1=l["ad_id_1"], ad_id_2=l["ad_id_2"], correct=l["correct"]) for l in links ] ads_metadata = [ { "ad_id": ad.ad_id, "ground_truth": ad.ground_truth_label, "severity": ad.severity, } for ad in self._episode.ads ] return EpisodeRecord( task_id=self._task_id, total_steps=inv_state.step_count, action_budget=self._episode.task_config.action_budget, verdicts=verdict_results, links=link_results, ads_metadata=ads_metadata, n_fraud_rings=len(self._episode.fraud_rings), ring_sizes=[len(r.member_ad_ids) for r in self._episode.fraud_rings], ) # ------------------------------------------------------------------ # Observation builders # ------------------------------------------------------------------ def build_fraudster_observation( self, *, reward: float = 0.0 ) -> FraudsterObservation: phase = self._phase done = phase == "done" if self._episode is None: return FraudsterObservation( done=done, reward=reward, feedback="No episode loaded. Call reset() first.", phase=phase, ) current_queue = self._build_queue_summary() prior_verdicts = self._build_verdict_history() investigations = self._investigator.investigations rounds_remaining = max(0, self._max_rounds - self._round_number + 1) actions_left = max( 0, self._max_fraudster_actions_per_turn - self._actions_this_turn, ) if phase == "fraudster_turn" else 0 my_proposal_signals = self._build_my_proposal_signals() return FraudsterObservation( done=done, reward=reward, feedback=self._last_feedback["fraudster"], phase=phase, task_id=getattr(self._episode.task_config, "task_id", ""), round_number=self._round_number, rounds_remaining=rounds_remaining, proposals_used=self._proposals_used, proposals_remaining=max(0, self._max_proposals - self._proposals_used), actions_left_this_turn=actions_left, current_queue=current_queue, prior_verdicts=prior_verdicts, investigation_targets_used=investigations, allowed_categories=list(self._allowed_categories), my_proposal_signals=my_proposal_signals, ) def _build_my_proposal_signals(self) -> Dict[str, Dict[str, Any]]: """Per-proposal structured signals for the Fraudster's own ads. For every Fraudster-proposed ad, expose the auto-assigned underlying signals (payment_id, registrar, domain, country, account_age_days, targeting_fingerprint) by reusing the same extraction logic the Investigator's evidence ledger uses. We synthesise an "investigations" dict that pretends *all* targets were pulled — the Fraudster authored these ads, so it is allowed to know everything the env auto-assigned to them. The Fraudster never sees signals for synthetic / non-self-proposed ads, only for its own slate. """ if self._episode is None: return {} proposal_ad_ids = list(self._proposal_slot_to_ad_id.values()) if not proposal_ad_ids: return {} full_targets = [ "payment_method", "landing_page", "targeting_overlap", "advertiser_history", ] ledger = build_evidence_ledger( episode=self._episode, registry=self._registry, ad_ids=proposal_ad_ids, investigations={ad_id: full_targets for ad_id in proposal_ad_ids}, ) slot_by_ad_id = { ad_id: slot for slot, ad_id in self._proposal_slot_to_ad_id.items() } verdicts = self._investigator.verdicts for ad_id, entry in ledger.items(): if ad_id in slot_by_ad_id: entry["slot_index"] = slot_by_ad_id[ad_id] v = verdicts.get(ad_id) entry["investigator_verdict"] = ( v.get("verdict") if v else "pending" ) return ledger def build_investigator_observation(self) -> AdReviewObservation: obs = self._investigator._build_observation( # noqa: SLF001 reward=0.0, done=(self._phase == "done") ) obs.feedback = ( self._last_feedback["investigator"] or obs.feedback ) return obs def build_auditor_observation( self, *, feedback: str = "" ) -> AuditorObservation: phase = self._phase done = phase == "done" investigation_data_seen: Dict[str, Dict[str, str]] = {} if self._registry is not None: investigation_data_seen = self._registry.to_dict() record: Dict[str, Any] = {} if self._episode is not None: record = { "task_id": self._task_id, "round_number": self._round_number, "proposals_used": self._proposals_used, "end_reason": self._end_reason, "ads": [ { "ad_id": ad.ad_id, "ad_copy": ad.ad_copy, "category": ad.category, "ground_truth": ad.ground_truth_label, "severity": ad.severity, "fraud_type": ad.fraud_type, "difficulty": ad.difficulty, "is_fraudster_proposal": ad.ad_id in self._proposal_slot_to_ad_id.values(), } for ad in self._episode.ads ], "verdicts": [ {"ad_id": ad_id, **v} for ad_id, v in self._investigator.verdicts.items() ], "links": list(self._investigator.links), "grader_score": self._grader_score, "fraud_rings": [ { "ring_id": ring.ring_id, "topology": ring.topology, "case_name": ring.case_name, "provenance": ring.provenance, "member_ad_ids": list(ring.member_ad_ids), "shared_signal_types": list(ring.shared_signals.keys()), } for ring in self._episode.fraud_rings ], } return AuditorObservation( done=done, reward=self._auditor_reward_total, feedback=feedback or self._last_feedback["auditor"], phase=phase, full_episode_record=record, investigator_actions=list(self._investigator_log), fraudster_proposals=list(self._fraudster_log), investigation_data_seen=investigation_data_seen, pending_flags=[f.model_dump() for f in self._audit_flags], ) # ------------------------------------------------------------------ # State-machine helpers # ------------------------------------------------------------------ def _guard_phase(self, expected: Phase, *, role: Role) -> None: if self._phase != expected: raise PermissionError( f"{role} cannot act during phase '{self._phase}' " f"(expected '{expected}')." ) def _transition(self, *, to: Phase, note: str) -> None: if self._phase == to: return logger.debug("[referee] %s -> %s (%s)", self._phase, to, note) self._phase = to self._actions_this_turn = 0 def _build_queue_summary(self) -> List[Dict[str, Any]]: assert self._episode is not None verdicts = self._investigator.verdicts proposal_ad_ids = set(self._proposal_slot_to_ad_id.values()) slot_by_ad_id = { ad_id: slot for slot, ad_id in self._proposal_slot_to_ad_id.items() } out: List[Dict[str, Any]] = [] for ad in self._episode.ads: v = verdicts.get(ad.ad_id) entry = { "ad_id": ad.ad_id, "ad_copy": ad.ad_copy, "category": ad.category, "status": (v["verdict"] if v else "pending"), "is_my_proposal": ad.ad_id in proposal_ad_ids, } if ad.ad_id in slot_by_ad_id: entry["slot_index"] = slot_by_ad_id[ad.ad_id] out.append(entry) return out def _build_verdict_history(self) -> List[Dict[str, Any]]: proposal_ad_ids = set(self._proposal_slot_to_ad_id.values()) history: List[Dict[str, Any]] = [] for entry in self._investigator_log: if entry.get("action_type") != "verdict": continue history.append( { "ad_id": entry.get("ad_id"), "verdict": entry.get("verdict"), "confidence": entry.get("confidence"), "rationale": entry.get("rationale"), "was_my_proposal": entry.get("ad_id") in proposal_ad_ids, } ) return history def _serialize_fraudster_action( self, action: FraudsterAction, reward: float ) -> Dict[str, Any]: payload: Dict[str, Any] = { "ts": time.time(), "phase": self._phase, "round_number": self._round_number, "action_type": action.action_type, "ad_copy": action.ad_copy, "category": action.category, "landing_page_blurb": action.landing_page_blurb, "targeting_summary": action.targeting_summary, "slot_index": action.slot_index, "new_ad_copy": action.new_ad_copy, "new_landing_page_blurb": action.new_landing_page_blurb, "rationale": action.rationale, "reward": reward, "ad_id": None, } # Enrich queue actions with the env-resolved ad context so the # auditor + downstream graders can key flags onto a real ad_id and # see the AD'S CURRENT STATE (not just the LLM's payload, which # for ``modify_pending_ad`` only carries the *delta* fields). if action.action_type == "propose_ad" and self._last_proposed_ad_id is not None: payload["ad_id"] = self._last_proposed_ad_id payload["slot_index"] = self._last_proposed_slot self._last_proposed_ad_id = None self._last_proposed_slot = None elif ( action.action_type == "modify_pending_ad" and action.slot_index is not None and action.slot_index in self._proposal_slot_to_ad_id ): ad_id = self._proposal_slot_to_ad_id[action.slot_index] payload["ad_id"] = ad_id ad = self._find_episode_ad(ad_id) if ad is not None: # Always inject the ad's CURRENT state — the modify only # carries deltas, and post-modify the ad's authoritative # ``ad_copy`` / ``targeting_summary`` live on the # ``Ad`` object the env mutated in # ``_fraudster_modify_pending_ad``. payload.setdefault("category", ad.category) if not payload.get("ad_copy"): payload["ad_copy"] = action.new_ad_copy or ad.ad_copy if not payload.get("targeting_summary"): payload["targeting_summary"] = ad.targeting_summary if ( not payload.get("landing_page_blurb") and self._episode is not None ): lp = self._episode.landing_pages.get(ad_id) if lp is not None: payload["landing_page_blurb"] = ( action.new_landing_page_blurb or lp.content_summary ) return payload def _find_episode_ad(self, ad_id: str) -> Optional[Ad]: if self._episode is None: return None for ad in self._episode.ads: if ad.ad_id == ad_id: return ad return None def _serialize_investigator_action( self, action: AdReviewAction, obs: AdReviewObservation ) -> Dict[str, Any]: return { "ts": time.time(), "phase": self._phase, "round_number": self._round_number, "action_type": action.action_type, "ad_id": action.ad_id, "investigation_target": action.investigation_target, "verdict": action.verdict, "confidence": action.confidence, "rationale": action.rationale, "linked_ad_id": action.linked_ad_id, "link_reason": action.link_reason, "reward": float(obs.reward or 0.0), "findings_excerpt": (obs.feedback or "")[:500], } # ------------------------------------------------------------------ # Introspection helpers for the driver / clients # ------------------------------------------------------------------ @property def phase(self) -> Phase: return self._phase @property def done(self) -> bool: return self._done @property def match_id(self) -> str: return self._match_id @property def episode(self) -> Optional[GeneratedEpisode]: return self._episode @property def registry(self) -> Optional[InvestigationToolRegistry]: return self._registry @property def investigator(self) -> InvestigatorEnvironment: return self._investigator def grader_score(self) -> Optional[float]: return self._grader_score