"""
CorrectionPatch — Correction record with dual coordinate spaces.
PatchSet        — Deterministic container with greedy overlap resolution.

TERMINOLOGY:
    ORIGINAL_TEXT = user's raw input (immutable)
    CURRENT_TEXT  = pipeline's working copy (mutated by each stage)

COORDINATE OWNERSHIP:
    start_original / end_original → PatchSet overlap resolution + API response
    start_current  / end_current  → StageLocker + pipeline internals
"""

import uuid
import logging
from dataclasses import dataclass, field, replace

logger = logging.getLogger(__name__)

PRIORITY = {'autocomplete': 0, 'punctuation': 1, 'spelling': 2, 'grammar': 3}

# Punctuation characters (Latin + Arabic) that may be folded into a grammar
# or spelling correction instead of being emitted as a separate patch.
_PUNCT_CHARS = frozenset('.,،؛;:!؟?')


def _is_all_punct(text: str) -> bool:
    """Return True when every character of *text* is in ``_PUNCT_CHARS``.

    An empty string yields True; callers guard against that with an explicit
    length check before relying on the result.
    """
    return all(ch in _PUNCT_CHARS for ch in text)


def _fold_punctuation(claimed_replacement: str,
                      punct_original: str,
                      punct_replacement: str):
    """Decide how a punctuation patch folds into an already-claimed correction.

    Args:
        claimed_replacement: Replacement text of the grammar/spelling patch
            that already owns the overlapping range.
        punct_original: ``original`` text of the punctuation patch.
        punct_replacement: ``replacement`` text of the punctuation patch.

    Returns:
        ``None`` when the punctuation patch cannot be folded in. Otherwise a
        ``(new_replacement, action)`` tuple, where ``action`` is the verb used
        in the log line, or ``None`` when the claimed replacement already ends
        with the punctuation (the patch is absorbed without any change).
    """
    # Scenario A: the punctuation replacement is the claimed replacement plus
    # trailing punctuation only.
    if (len(punct_replacement) > len(claimed_replacement)
            and punct_replacement.startswith(claimed_replacement)
            and _is_all_punct(punct_replacement[len(claimed_replacement):])):
        return punct_replacement, 'Merged trailing'

    # Scenario B: the punctuation patch only appends punctuation to its own
    # original text (e.g. original='المدرسة', replacement='المدرسة.') while the
    # claimed replacement is different text (e.g. a split like 'في المدرسة').
    if (len(punct_replacement) > len(punct_original)
            and punct_replacement.startswith(punct_original)
            and _is_all_punct(punct_replacement[len(punct_original):])):
        added_punct = punct_replacement[len(punct_original):]
        if claimed_replacement.endswith(added_punct):
            # Already present: absorb the patch without duplicating the mark.
            return claimed_replacement, None
        return claimed_replacement + added_punct, 'Appended trailing'

    # Leading punctuation: the punctuation replacement is the claimed
    # replacement with punctuation prepended. The prefix length is computed
    # explicitly because a ``[:-len(x)]`` slice collapses to ``[:0]`` when the
    # claimed replacement is empty, which would make the check pass vacuously.
    lead_len = len(punct_replacement) - len(claimed_replacement)
    if (lead_len > 0
            and punct_replacement.endswith(claimed_replacement)
            and _is_all_punct(punct_replacement[:lead_len])):
        return punct_replacement, 'Merged leading'

    return None


@dataclass
class CorrectionPatch:
    """
    Correction suggestion with dual coordinate spaces.

    The dataclass is not frozen, so immutability is a convention rather than
    something enforced: PatchSet.resolve_overlaps() works on copies when it
    needs a modified replacement.

    ORIGINAL coords (start_original, end_original):
        → Used by PatchSet.resolve_overlaps() for conflict resolution
        → Exported to frontend via to_dict() as 'start'/'end'
        → NEVER used for StageLocker or pipeline mutation

    CURRENT coords (start_current, end_current):
        → Used by StageLocker.lock() / is_locked()
        → Pipeline-internal range checking
        → NEVER sent to frontend
    """
    stage: str
    start_original: int
    end_original: int
    start_current: int
    end_current: int
    original: str
    replacement: str
    priority: int
    confidence: float = 1.0
    locked: bool = True
    alternatives: list = field(default_factory=list)
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    def to_dict(self) -> dict:
        """
        Serialize for API response.

        Exports ORIGINAL_TEXT coordinates ONLY, as 'start' and 'end'.
        CURRENT_TEXT coordinates are pipeline-internal and never exposed.
        The replacement is exported as 'correction' and the stage as 'type'.
        """
        return {
            'id': self.id,
            'start': self.start_original,
            'end': self.end_original,
            'original': self.original,
            'correction': self.replacement,
            'type': self.stage,
            'priority': self.priority,
            'confidence': self.confidence,
            'locked': self.locked,
            'alternatives': self.alternatives,
        }


class PatchSet:
    """
    Deterministic overlap resolution using greedy first-fit strategy.

    Resolution order: priority DESC → confidence DESC → start ASC → id ASC
    The id tiebreaker guarantees identical ordering for identical inputs
    (ids included; default ids are random UUIDs).

    Strategy: Greedy — first non-overlapping patch wins its range.
    One range = one owner. No stacking.

    This is deterministic and sufficient for ≤3 pipeline stages.

    # FUTURE: If pipeline grows beyond 5 stages or requires minimal-loss
    # coverage optimization, consider weighted interval scheduling:
    #   - Model as weighted job scheduling problem
    #   - Use dynamic programming on sorted intervals
    #   - Maximize sum(priority * confidence) of selected non-overlapping patches
    # Not needed now — greedy is correct for the current architecture.
    """

    def __init__(self):
        self.patches: list = []

    def add(self, patch: CorrectionPatch):
        """Append *patch* to the set; conflicts are resolved lazily."""
        self.patches.append(patch)

    def resolve_overlaps(self) -> list:
        """
        Resolve conflicts so every ORIGINAL range has a single owner.

        Patches are visited in resolution order (priority DESC, confidence
        DESC, start ASC, id ASC) and compared, using ORIGINAL coordinates,
        against the patches accepted so far:

        1. FIX-36 / punctuation merge: a 'punctuation' patch that overlaps an
           accepted 'grammar' or 'spelling' patch and only adds trailing or
           leading punctuation is folded into that patch's replacement. The
           punctuation patch itself is not returned.
        2. Strict non-overlap: otherwise, any positive-width overlap with an
           accepted patch drops the current patch. Earlier relaxed rules
           (partial-overlap threshold, coexisting stages) are not applied,
           because overlapping patches cannot be applied sequentially.
        3. Anything else is accepted and claims its range.

        Patches stored in ``self.patches`` are never mutated. When a merge
        changes a replacement, the returned list holds a copy (same id) with
        the merged text, so repeated calls give identical results.

        Returns:
            Accepted patches, in resolution order.
        """
        ordered = sorted(
            self.patches,
            key=lambda p: (-p.priority, -p.confidence, p.start_original, p.id),
        )

        claimed_ranges = []  # (start, end, stage, index into `resolved`)
        resolved = []

        for patch in ordered:
            consumed = False  # True once the patch is merged or dropped

            for cs, ce, claimed_stage, res_idx in claimed_ranges:
                if not (patch.start_original < ce and patch.end_original > cs):
                    continue

                if (patch.stage == 'punctuation'
                        and claimed_stage in ('grammar', 'spelling')):
                    owner = resolved[res_idx]
                    folded = _fold_punctuation(
                        owner.replacement, patch.original, patch.replacement
                    )
                    if folded is not None:
                        new_replacement, action = folded
                        if action is not None:
                            # Copy instead of mutating the caller's patch.
                            owner = replace(owner, replacement=new_replacement)
                            resolved[res_idx] = owner
                            logger.info(
                                f"[OVERLAP] {action} punctuation into {claimed_stage} "
                                f"[{cs}:{ce}]: '{owner.original}' → "
                                f"'{owner.replacement}'"
                            )
                        consumed = True
                        break

                # Zero-width ranges can pass the test above while sharing no
                # characters; only a positive-width overlap is a conflict.
                overlap_width = (min(patch.end_original, ce)
                                 - max(patch.start_original, cs))
                if overlap_width > 0:
                    logger.info(
                        f"[OVERLAP] Dropped {patch.stage} [{patch.start_original}:{patch.end_original}] "
                        f"'{patch.original}' — conflicts with higher-priority span"
                    )
                    consumed = True
                    break

            if not consumed:
                claimed_ranges.append(
                    (patch.start_original, patch.end_original, patch.stage, len(resolved))
                )
                resolved.append(patch)

        dropped = len(self.patches) - len(resolved)
        if dropped > 0:
            logger.info(f"[OVERLAP] Resolved {dropped} overlapping suggestions")

        return resolved

    def to_list(self) -> list:
        """Serialize resolved patches for API response."""
        return [p.to_dict() for p in self.resolve_overlaps()]