# bayan-api/tests/test_offset_mapper.py
# Commit 6319518 (youssefreda9): "Phase 11: Telemetry instrumentation + OffsetMapper tests"
"""
Phase 11 — Task 5: OffsetMapper Validation Suite
Tests the OffsetMapper class for correctness across:
- Insertions
- Deletions
- Replacements
- Arabic text mutations
- Multi-edit examples
- Chained mutations (Spelling → Grammar → Punctuation)
Validates:
- reverse_map_offset (text_after → text_before)
- forward_map_range (text_before → text_after)
- _forward_map_pos (internal, tested via forward_map_range)
"""
import sys
import os
import difflib
import pytest
# Put ../src (relative to this file) at the front of sys.path so that project
# modules can be imported by name.
# NOTE(review): nothing visible in this file imports from src — OffsetMapper
# is re-declared below as a standalone copy so that app.py (and with it
# Flask/torch/transformers) is never imported. This line looks vestigial;
# confirm before removing.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
# ══════════════════════════════════════════════════════════════════
# Standalone copy of OffsetMapper for isolated testing
# Source: src/app.py lines 653-733
# This avoids importing Flask/torch/transformers
# ══════════════════════════════════════════════════════════════════
class OffsetMapper:
    """Translate character offsets between two versions of a text.

    Standalone mirror of the OffsetMapper in src/app.py, kept in this file so
    the tests can run without importing Flask/torch/transformers.

    The mapping is driven by ``difflib.SequenceMatcher`` opcodes, stored as
    ``(i1, i2, j1, j2)`` tuples meaning ``text_before[i1:i2]`` corresponds to
    ``text_after[j1:j2]``.

    NOTE: ``_forward_map_pos`` interpolates with integer arithmetic instead of
    ``int(<float>)``. The float form truncated values such as
    ``1 / 49 * 49 == 0.9999999999999999`` down to 0, shifting offsets inside
    long unchanged blocks by one. The same change has to be made in
    src/app.py for this class to remain a faithful mirror.
    """

    def __init__(self, text_before, text_after):
        """Store both texts and compute the opcode table once."""
        self._text_before = text_before
        self._text_after = text_after
        self._opcodes = []
        self._build()

    def _build(self):
        """Fill ``self._opcodes`` with ``(i1, i2, j1, j2)``; the tag is dropped."""
        matcher = difflib.SequenceMatcher(None, self._text_before, self._text_after)
        for _tag, i1, i2, j1, j2 in matcher.get_opcodes():
            self._opcodes.append((i1, i2, j1, j2))

    def reverse_map_offset(self, pos_in_after):
        """Map an offset in ``text_after`` back to an offset in ``text_before``.

        Opcodes are scanned in order and the first one whose after-range
        ``[j1, j2]`` contains the position (both ends inclusive) decides the
        result, so an offset on a boundary shared by two opcodes is resolved
        by the earlier one. An opcode with an empty after-range yields ``i1``;
        otherwise the offset is interpolated linearly and rounded with
        ``round()``. Offsets matched by no opcode map to ``len(text_before)``.
        """
        for i1, i2, j1, j2 in self._opcodes:
            if not (j1 <= pos_in_after <= j2):
                continue
            if j2 == j1:
                return i1
            ratio = (pos_in_after - j1) / (j2 - j1)
            return round(i1 + ratio * (i2 - i1))
        return len(self._text_before)

    def forward_map_range(self, start_in_before, end_in_before):
        """Map a ``(start, end)`` range from ``text_before`` to ``text_after``.

        Both ends are mapped independently; the end is then clamped so the
        returned range is never inverted.
        """
        new_start = self._forward_map_pos(start_in_before)
        new_end = self._forward_map_pos(end_in_before)
        return new_start, max(new_start, new_end)

    def _forward_map_pos(self, pos):
        """Map one offset in ``text_before`` to an offset in ``text_after``.

        Uses the first opcode whose before-range ``[i1, i2]`` contains ``pos``
        (both ends inclusive). An opcode with an empty before-range yields
        ``j1``; otherwise the offset is interpolated linearly and floored.
        Offsets matched by no opcode are extrapolated from the end of the last
        opcode, or returned unchanged when there are no opcodes at all.
        """
        for i1, i2, j1, j2 in self._opcodes:
            if not (i1 <= pos <= i2):
                continue
            before_len = i2 - i1
            if before_len == 0:
                return j1
            # Exact integer floor: avoids float error such as
            # int(1 / 49 * 49) == 0 inside a 49-character unchanged block.
            return j1 + (pos - i1) * (j2 - j1) // before_len
        if self._opcodes:
            _, last_i2, _, last_j2 = self._opcodes[-1]
            return last_j2 + (pos - last_i2)
        return pos
# ══════════════════════════════════════════════════════════════════
# Local stand-in for StageLocker used by the chained tests (nothing is imported here)
# ══════════════════════════════════════════════════════════════════
class StageLockerStub:
    """Bare-bones stand-in for StageLocker, sufficient for the chained tests.

    Locked spans live in ``locked_spans`` as ``(start, end, owner)`` tuples,
    in the order they were locked.
    """

    def __init__(self):
        self.locked_spans = []

    def lock(self, start, end, owner):
        """Record the span ``start``..``end`` as owned by ``owner``."""
        span = (start, end, owner)
        self.locked_spans.append(span)

    def is_locked(self, start, end):
        """Return True when ``start``..``end`` overlaps any locked span."""
        return any(
            start < span_end and end > span_start
            for span_start, span_end, _owner in self.locked_spans
        )

    def update_via_mapper(self, mapper):
        """Re-express every span through ``mapper.forward_map_range``.

        Spans that come back empty or inverted are dropped.
        """
        survivors = []
        for span_start, span_end, owner in self.locked_spans:
            moved_start, moved_end = mapper.forward_map_range(span_start, span_end)
            if not moved_end > moved_start:
                continue
            survivors.append((moved_start, moved_end, owner))
        self.locked_spans = survivors
# ══════════════════════════════════════════════════════════════════
# TEST SUITE
# ══════════════════════════════════════════════════════════════════
class TestOffsetMapperIdentity:
    """Mapping a text onto itself must leave every offset untouched."""

    def test_identity_ascii(self):
        mapper = OffsetMapper("hello world", "hello world")
        # Start of text, an interior offset, and end of text.
        for offset in (0, 5, 11):
            assert mapper.reverse_map_offset(offset) == offset

    def test_identity_arabic(self):
        sentence = "مرحبا بالعالم"
        mapper = OffsetMapper(sentence, sentence)
        end = len(sentence)
        assert mapper.reverse_map_offset(0) == 0
        assert mapper.reverse_map_offset(end) == end

    def test_identity_forward(self):
        mapper = OffsetMapper("hello world", "hello world")
        # Each word's range must come back unchanged.
        for span in ((0, 5), (6, 11)):
            assert mapper.forward_map_range(*span) == span
class TestOffsetMapperInsertions:
    """A single character inserted at the start, middle, or end."""

    def test_insert_beginning(self):
        mapper = OffsetMapper("abc", "Xabc")
        expected = (
            (1, 0),  # 'a' sits at 1 after the insert and at 0 before it
            (4, 3),  # end of "Xabc" -> end of "abc"
        )
        for after_pos, before_pos in expected:
            assert mapper.reverse_map_offset(after_pos) == before_pos

    def test_insert_middle(self):
        mapper = OffsetMapper("abc", "aXbc")
        # 'a' does not move; the inserted 'X' pushes 'b' to 2 and 'c' to 3.
        expected = (
            (0, 0),  # 'a'
            (2, 1),  # 'b'
            (3, 2),  # 'c'
        )
        for after_pos, before_pos in expected:
            assert mapper.reverse_map_offset(after_pos) == before_pos

    def test_insert_end(self):
        mapper = OffsetMapper("abc", "abcX")
        # Offset 3 (the appended 'X') resolves to the end of the original.
        for offset in (0, 2, 3):
            assert mapper.reverse_map_offset(offset) == offset

    def test_insert_forward(self):
        mapper = OffsetMapper("abc", "aXbc")
        # Pinned behaviour: 'b' sits at offset 1 before and offset 2 after,
        # yet the mapped start stays at 1. Opcode ranges are matched with both
        # ends inclusive, so offset 1 is resolved by the first opcode whose
        # before-range ends at 1, not by the block that actually holds 'b'.
        # The end offset 2 falls inside before[1:3] -> after[2:4] and maps to 3.
        start, end = mapper.forward_map_range(1, 2)
        assert start == 1
        assert end == 3
class TestOffsetMapperDeletions:
    """A single character removed from the start, middle, or end."""

    def test_delete_beginning(self):
        mapper = OffsetMapper("Xabc", "abc")
        # Opcodes: delete (0,1,0,0), equal (1,4,0,3).
        # Pinned behaviour: offset 0 in "abc" is claimed by the delete opcode,
        # whose empty after-range [0, 0] is checked first, so it maps to 0
        # (the removed 'X') rather than 1 (the 'a').
        # Impact: possible off-by-one when a correction deletes characters at
        # the beginning of a span.
        assert mapper.reverse_map_offset(0) == 0
        # 'c' maps exactly: offset 2 -> offset 3.
        assert mapper.reverse_map_offset(2) == 3

    def test_delete_middle(self):
        mapper = OffsetMapper("abcd", "acd")
        # Opcodes: equal (0,1,0,1), delete (1,2,1,1), equal (2,4,1,3).
        # Pinned behaviour: offset 1 in "acd" ('c') is matched by the first
        # opcode whose after-range contains it with inclusive ends, which is
        # the leading equal block ending at 1. It therefore maps to 1 (the
        # deleted 'b') rather than 2 (the 'c').
        # Impact: the first character after a deletion maps one position early.
        expected = (
            (0, 0),  # 'a'
            (1, 1),  # boundary case described above
            (2, 3),  # 'd' maps exactly
        )
        for after_pos, before_pos in expected:
            assert mapper.reverse_map_offset(after_pos) == before_pos

    def test_delete_end(self):
        mapper = OffsetMapper("abcX", "abc")
        for offset in (0, 2):
            assert mapper.reverse_map_offset(offset) == offset

    def test_delete_forward(self):
        mapper = OffsetMapper("abcd", "acd")
        # Range (2, 3) in "abcd" should land on (1, 2) in "acd".
        start, end = mapper.forward_map_range(2, 3)
        assert start == 1
        assert end == 2
class TestOffsetMapperReplacements:
    """One run of characters swapped for another of equal or different length."""

    def test_replace_same_length(self):
        mapper = OffsetMapper("abc", "aXc")
        # 'X' takes the place of 'b', so no offset moves.
        for offset in (0, 1, 2):
            assert mapper.reverse_map_offset(offset) == offset

    def test_replace_longer(self):
        mapper = OffsetMapper("abc", "aXYZc")
        # One character ('b') becomes three ('XYZ').
        expected = (
            (0, 0),  # 'a'
            (4, 2),  # 'c' follows "XYZ" -> offset 2 in the original
        )
        for after_pos, before_pos in expected:
            assert mapper.reverse_map_offset(after_pos) == before_pos

    def test_replace_shorter(self):
        mapper = OffsetMapper("aXYZc", "abc")
        # Three characters ('XYZ') become one ('b').
        expected = (
            (0, 0),  # 'a'
            (1, 1),  # 'b' -> start of "XYZ"
            (2, 4),  # 'c'
        )
        for after_pos, before_pos in expected:
            assert mapper.reverse_map_offset(after_pos) == before_pos

    def test_replace_forward(self):
        mapper = OffsetMapper("abc", "aXYZc")
        # 'b' at (1, 2) in the original covers "XYZ" at (1, 4) afterwards.
        start, end = mapper.forward_map_range(1, 2)
        assert start == 1
        assert end == 4
class TestOffsetMapperArabic:
    """Typical Arabic corrections: hamza, ta marbuta, word split, tanween, punctuation."""

    def test_hamza_correction(self):
        original = "الانسان"
        corrected = "الإنسان"
        mapper = OffsetMapper(original, corrected)
        # The third letter is swapped in place, so both lengths are equal.
        assert mapper.reverse_map_offset(0) == 0
        assert mapper.reverse_map_offset(len(corrected)) == len(original)

    def test_ta_marbuta(self):
        original = "المدرسه"
        corrected = "المدرسة"
        mapper = OffsetMapper(original, corrected)
        # Only the final letter differs; the length is unchanged.
        last_after = len(corrected) - 1
        last_before = len(original) - 1
        assert mapper.reverse_map_offset(0) == 0
        assert mapper.reverse_map_offset(last_after) == last_before

    def test_word_split(self):
        original = "فيالمدرسة"
        corrected = "في المدرسة"
        mapper = OffsetMapper(original, corrected)
        # A space is inserted after the first word, so the result is one
        # character longer; the two ends must still line up.
        assert mapper.reverse_map_offset(len(corrected)) == len(original)

    def test_tanween_removal(self):
        original = "جداً"  # 4 characters, ending in the tanween mark
        corrected = "جدا"  # 3 characters
        mapper = OffsetMapper(original, corrected)
        assert mapper.reverse_map_offset(0) == 0
        # Pinned behaviour: opcodes are equal before[0:3] -> after[0:3]
        # followed by delete before[3:4]. Offset 3 (end of the corrected text)
        # is matched by the equal block's inclusive end and maps to 3, not to
        # len(original) == 4, so the end of the corrected word does not
        # recover the end of the original word.
        assert mapper.reverse_map_offset(len(corrected)) == 3

    def test_punct_addition(self):
        original = "مرحبا"
        corrected = "مرحبا."
        mapper = OffsetMapper(original, corrected)
        # A period is appended; the end of the original word maps to itself.
        word_end = len(original)
        assert mapper.reverse_map_offset(0) == 0
        assert mapper.reverse_map_offset(word_end) == word_end
class TestOffsetMapperMultiEdit:
    """Several separate edits within one text."""

    def test_two_replacements(self):
        # 'b' -> 'X' and 'd' -> 'Y'; every offset keeps its position.
        mapper = OffsetMapper("abcd", "aXcY")
        for offset in (0, 1, 2, 3):
            assert mapper.reverse_map_offset(offset) == offset

    def test_insert_and_delete(self):
        # A single same-length replacement: 'b' -> 'X'.
        mapper = OffsetMapper("abcde", "aXcde")
        for offset in (0, 1, 4):
            assert mapper.reverse_map_offset(offset) == offset

    def test_arabic_multi_edit(self):
        # Three words are corrected: the first, the third and the last.
        original = "الانسان يذهب الى المدرسه"
        corrected = "الإنسان يذهب إلى المدرسة"
        mapper = OffsetMapper(original, corrected)
        # Both ends of the text must line up.
        assert mapper.reverse_map_offset(0) == 0
        mapped_end = mapper.reverse_map_offset(len(corrected))
        assert mapped_end == len(original)
class TestOffsetMapperForwardReverse:
    """Round-trip checks: mapping a range there and back should land near the start."""

    def _check_roundtrip_forward(self, before, after, start, end):
        """Map ``(start, end)`` before -> after -> before and allow 1 char of drift.

        The return leg uses a second mapper built with the texts swapped and
        its ``forward_map_range``, rather than ``reverse_map_offset``.
        """
        m = OffsetMapper(before, after)
        fwd_s, fwd_e = m.forward_map_range(start, end)
        m_rev = OffsetMapper(after, before)
        rev_s, rev_e = m_rev.forward_map_range(fwd_s, fwd_e)
        # Failure messages show original -> forward-mapped -> round-tripped,
        # with separators so the three numbers stay distinguishable.
        assert abs(rev_s - start) <= 1, f"Start drift: {start} -> {fwd_s} -> {rev_s}"
        assert abs(rev_e - end) <= 1, f"End drift: {end} -> {fwd_e} -> {rev_e}"

    def test_roundtrip_identity(self):
        self._check_roundtrip_forward("hello world", "hello world", 0, 5)

    def test_roundtrip_insertion(self):
        self._check_roundtrip_forward("abc", "aXbc", 0, 1)

    def test_roundtrip_deletion(self):
        self._check_roundtrip_forward("abcd", "acd", 0, 1)

    def test_roundtrip_arabic(self):
        self._check_roundtrip_forward("الانسان", "الإنسان", 0, 2)
class TestOffsetMapperChained:
    """Chained mutations simulating Spelling -> Grammar -> Punctuation."""

    def test_three_stage_chain(self):
        """Spelling fixes a hamza, grammar is a no-op, punctuation appends a period."""
        original = "الانسان يذهب"
        # Stage 1: spelling corrects the first word.
        after_spelling = "الإنسان يذهب"
        m1 = OffsetMapper(original, after_spelling)
        # Stage 2: grammar leaves the text unchanged in this scenario.
        after_grammar = "الإنسان يذهب"
        m2 = OffsetMapper(after_spelling, after_grammar)
        # Stage 3: punctuation appends a period.
        after_punct = "الإنسان يذهب."
        m3 = OffsetMapper(after_grammar, after_punct)
        # Walk the period's offset back through the stages: m3 -> m2 -> m1.
        pos_final = len(after_punct) - 1
        pos_after_m3 = m3.reverse_map_offset(pos_final)
        pos_after_m2 = m2.reverse_map_offset(pos_after_m3)
        pos_original = m1.reverse_map_offset(pos_after_m2)
        # The appended period maps to the end of the original text.
        assert pos_original == len(original)

    def test_stagelocker_with_mapper(self):
        """Verify StageLocker spans shift correctly through mutations."""
        locker = StageLockerStub()
        # Text as it stands once spelling has corrected the second word.
        after_spelling = "في الإنسان كبير"
        # Spelling locks the corrected word: offsets 3..10 in after_spelling
        # coordinates.
        locker.lock(3, 10, 'spelling')
        # Grammar must see the corrected word as taken and the last word as free.
        assert locker.is_locked(3, 10)
        assert not locker.is_locked(11, 16)
        # Grammar appends one character to the last word.
        after_grammar = "في الإنسان كبيرة"
        m2 = OffsetMapper(after_spelling, after_grammar)
        locker.update_via_mapper(m2)
        # The spelling lock must survive and still cover roughly the same word.
        spelling_spans = [
            (ls, le) for ls, le, owner in locker.locked_spans if owner == 'spelling'
        ]
        for ls, le in spelling_spans:
            assert 2 <= ls <= 4
            assert 9 <= le <= 11
        assert spelling_spans, "Spelling lock was lost during grammar mutation"

    def test_chained_offset_accuracy(self):
        """Full pipeline: verify ORIGINAL coordinates are recoverable."""
        original = "الانسان يذهب الى المدرسه"
        after_spell = "الإنسان يذهب إلى المدرسة"
        after_grammar = "الإنسان يذهب إلى المدرسة"  # no grammar change
        after_punct = "الإنسان يذهب إلى المدرسة."  # period added
        m1 = OffsetMapper(original, after_spell)
        m2 = OffsetMapper(after_spell, after_grammar)
        m3 = OffsetMapper(after_grammar, after_punct)
        # Locate the start of the last word in the final text.
        idx = after_punct.index("المدرسة")
        assert idx > 0
        # Reverse chain: m3 -> m2 -> m1.
        p3 = m3.reverse_map_offset(idx)
        p2 = m2.reverse_map_offset(p3)
        p1 = m1.reverse_map_offset(p2)
        # It should land within one character of the uncorrected word's start.
        orig_idx = original.index("المدرسه")
        assert abs(p1 - orig_idx) <= 1, f"Expected ~{orig_idx}, got {p1}"
class TestOffsetMapperEdgeCases:
    """Degenerate inputs and out-of-range offsets."""

    def test_empty_strings(self):
        mapper = OffsetMapper("", "")
        assert mapper.reverse_map_offset(0) == 0

    def test_empty_to_text(self):
        mapper = OffsetMapper("", "hello")
        # Every offset in the new text collapses onto the empty original.
        for after_pos in (0, 5):
            assert mapper.reverse_map_offset(after_pos) == 0

    def test_text_to_empty(self):
        mapper = OffsetMapper("hello", "")
        assert mapper.reverse_map_offset(0) == 0

    def test_single_char_replace(self):
        mapper = OffsetMapper("a", "b")
        assert mapper.reverse_map_offset(0) == 0

    def test_monotonicity_guard(self):
        """forward_map_range must never hand back an inverted range."""
        mapper = OffsetMapper("abcdef", "aXYZf")
        s, e = mapper.forward_map_range(1, 5)
        assert s <= e, f"Inverted range: ({s}, {e})"

    def test_position_beyond_text(self):
        """An offset past the end of the new text maps to the end of the original."""
        mapper = OffsetMapper("abc", "aXbc")
        mapped = mapper.reverse_map_offset(100)
        assert mapped == len("abc")
# ══════════════════════════════════════════════════════════════════
# COVERAGE SUMMARY
# ══════════════════════════════════════════════════════════════════
if __name__ == '__main__':
    # Hand pytest's exit status to the shell so that running this file
    # directly reports failures (previously the status was discarded and the
    # script always exited 0).
    sys.exit(pytest.main([__file__, '-v', '--tb=short']))