Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -299,9 +299,71 @@ def harvest_backup_terms(text: str, limit: int = 400) -> List[str]:
|
|
| 299 |
top = [w for w,_ in sorted(stats.items(), key=lambda kv: -kv[1])]
|
| 300 |
return top[:limit]
|
| 301 |
|
| 302 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank: str, backup_terms: List[str], k: int = 3) -> List[str]:
|
| 304 |
target = correct.strip()
|
|
|
|
|
|
|
| 305 |
# 1) مصادر متعددة
|
| 306 |
neigh = nearest_terms(target, phrase_pool, k=48)
|
| 307 |
mlm = mlm_fill(sentence_with_blank, target, k=24)
|
|
@@ -337,19 +399,30 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank:
|
|
| 337 |
if not filtered:
|
| 338 |
filtered = raw_pool[:max(24, k*6)]
|
| 339 |
|
| 340 |
-
# 4)
|
| 341 |
-
|
| 342 |
shaped = []
|
| 343 |
for w in filtered:
|
| 344 |
-
|
| 345 |
-
if
|
| 346 |
-
|
| 347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
|
| 349 |
# 5) ترتيب بالانسجام
|
| 350 |
ranked = rank_by_sentence_coherence(sentence_with_blank, target, shaped, topk=max(k, 12))
|
| 351 |
|
| 352 |
-
# 6) خذ أفضل k؛ وإن لم يكفِ، أكمل من shaped ثم
|
| 353 |
out = []
|
| 354 |
for src in [ranked, shaped, filtered, raw_pool, backup_terms]:
|
| 355 |
for w in src:
|
|
@@ -360,7 +433,6 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank:
|
|
| 360 |
|
| 361 |
# ضمان العدد بدون أي placeholders
|
| 362 |
if len(out) < k:
|
| 363 |
-
# آخر حل: كرّر أقرب عناصر (أفضل من فراغ)
|
| 364 |
while len(out) < k and ranked:
|
| 365 |
out.append(ranked[len(out) % len(ranked)])
|
| 366 |
|
|
@@ -407,7 +479,7 @@ def make_mcqs(text:str, n:int=6)->List[MCQ]:
|
|
| 407 |
|
| 408 |
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
|
| 409 |
|
| 410 |
-
# مشتّتات ذكية تضمن دائمًا ≥3
|
| 411 |
pool = [x for x in keyphrases if x != kp]
|
| 412 |
distracts = smart_distractors(kp, pool, q, backup_terms, k=3)
|
| 413 |
|
|
@@ -429,7 +501,7 @@ def to_records(items:List[MCQ])->List[dict]:
|
|
| 429 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 430 |
txt=(it.choices[i] if i<len(it.choices) else "—").strip()
|
| 431 |
txt=txt.replace(",", "،").replace("?", "؟").replace(";", "؛")
|
| 432 |
-
# منع أي Placeholder يظهر
|
| 433 |
if txt == "—" or not txt:
|
| 434 |
txt = "خيار"
|
| 435 |
opts.append({"id":lbl,"text":txt or "خيار","is_correct":(i==it.answer_index)})
|
|
@@ -512,7 +584,7 @@ textarea{min-height:120px}
|
|
| 512 |
.q-header{display:flex;gap:10px;align-items:center;justify-content:space-between;margin-bottom:6px}
|
| 513 |
.q-title{color:#eaeaf2;font-weight:800}
|
| 514 |
.q-badge{padding:8px 12px;border-radius:10px;font-weight:700}
|
| 515 |
-
.q-badge.ok{background:#
|
| 516 |
.q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
|
| 517 |
|
| 518 |
.q-text{color:var(--text);font-size:1.06rem;line-height:1.8;margin:8px 0 12px}
|
|
|
|
| 299 |
top = [w for w,_ in sorted(stats.items(), key=lambda kv: -kv[1])]
|
| 300 |
return top[:limit]
|
| 301 |
|
| 302 |
+
# ================== (NEW) موازنة الطول والتطويل ==================
|
| 303 |
+
|
| 304 |
+
# Small module-level cache for the current correct answer phrase.
# smart_distractors stores it under the "correct" key and try_mlm_expand reads
# it back so that an expanded distractor is never identical to the answer.
ref_phrase_cache = {}
|
| 306 |
+
|
| 307 |
+
def word_len(s: str) -> int:
    """Return the number of whitespace-separated words in *s*.

    ``str.split()`` without a separator splits on runs of whitespace and never
    yields empty strings, so leading, trailing, or repeated whitespace does not
    produce phantom words and no regex is required.
    """
    return len(s.split())
|
| 309 |
+
|
| 310 |
+
def within_ratio(cand: str, target_len: int, tol: float = 0.2) -> bool:
    """Tell whether the word count of *cand* is close enough to *target_len*.

    Args:
        cand: Candidate phrase whose words are counted with ``word_len``.
        target_len: Reference number of words.
        tol: Allowed relative deviation; the accepted range is
            ``target_len * (1 - tol)`` to ``target_len * (1 + tol)`` inclusive.

    Returns:
        True when the candidate's word count falls inside that range.
    """
    lower_bound = target_len * (1 - tol)
    upper_bound = target_len * (1 + tol)
    n_words = word_len(cand)
    if n_words < lower_bound:
        return False
    return n_words <= upper_bound
|
| 313 |
+
|
| 314 |
+
# Generic Arabic filler words used to lengthen a distractor when the MLM is
# unavailable or fails to predict a suitable word.
# Words that fallback_expand places before the candidate.
GENERIC_PREFIXES = ["تقنيات", "مجال", "أنظمة", "تطبيقات", "مفاهيم", "ممارسات", "نماذج", "آليات"]
# Words that fallback_expand places after the candidate.
GENERIC_SUFFIXES = ["الذكية", "التعليمية", "الحديثة", "المتقدمة", "المبتكرة", "الرقمية"]
|
| 317 |
+
|
| 318 |
+
def shape_phrase_like(ref: str, cand: str) -> str:
    """Align the definiteness of *cand* with that of *ref*.

    Thin wrapper that delegates to ``with_same_definiteness`` so the candidate
    has the same overall surface shape as the reference phrase.
    NOTE(review): presumably this adds/removes the Arabic definite article;
    confirm against ``with_same_definiteness``.
    """
    reshaped = with_same_definiteness(ref, cand)
    return reshaped
|
| 321 |
+
|
| 322 |
+
def try_mlm_expand(cand: str, sentence_with_blank: str, target_len: int) -> Optional[str]:
    """Lengthen *cand* by one model-predicted word placed before or after it.

    The blank (``_____``) in *sentence_with_blank* is replaced once with
    ``<mask> cand`` and once with ``cand <mask>``; the fill-mask model's top
    predictions for each variant are tried in order.

    Args:
        cand: Distractor phrase to expand.
        sentence_with_blank: Question sentence containing the ``_____`` blank.
        target_len: Word count of the correct answer that the expanded phrase
            should approximate (checked with ``within_ratio``).

    Returns:
        The first expanded phrase whose length is within ratio and which does
        not normalise to the cached correct answer, or ``None`` when no masker
        is available or no prediction qualifies.
    """
    masker = get_masker()
    if not masker:
        return None

    mask_token = masker.tokenizer.mask_token
    # Each trial carries an explicit flag saying whether the mask precedes the
    # candidate.  Inferring this from "sentence starts with the mask token" is
    # wrong whenever the blank is not the first thing in the sentence.
    # Only the first blank is replaced: several masks in one input change the
    # shape of the pipeline output.
    trials = [
        (sentence_with_blank.replace("_____", f"{mask_token} {cand}", 1), True),
        (sentence_with_blank.replace("_____", f"{cand} {mask_token}", 1), False),
    ]
    # Normalised correct answer, stored by smart_distractors in the cache.
    correct_norm = norm_ar(ref_phrase_cache.get("correct", ""))

    for masked_sent, mask_first in trials:
        try:
            outs = masker(masked_sent, top_k=8)
        except Exception:
            # Best effort: a failing model call just means this variant
            # yields no expansion; the caller has a template fallback.
            continue
        for o in outs:
            tok = o["token_str"].strip()
            # Skip empty predictions and ones made only of punctuation,
            # symbols, digits or underscores.
            if not tok or re2.match(r"^[\p{P}\p{S}\d_]+$", tok):
                continue
            phrase = f"{tok} {cand}" if mask_first else f"{cand} {tok}"
            # Reject anything that collapses into the correct answer after
            # normalisation.
            if within_ratio(phrase, target_len) and norm_ar(phrase) != correct_norm:
                return phrase
    return None
|
| 348 |
+
|
| 349 |
+
def fallback_expand(cand: str, target_len: int) -> str:
    """Lengthen *cand* with generic template words when the MLM gave nothing.

    Tries each entry of ``GENERIC_PREFIXES`` in front of the candidate, then
    each entry of ``GENERIC_SUFFIXES`` after it, and returns the first phrase
    accepted by ``within_ratio``.  If none fits, a random prefix and a random
    suffix are both attached.

    Args:
        cand: Distractor phrase to expand.
        target_len: Word count of the correct answer to approximate.

    Returns:
        The expanded phrase, or *cand* unchanged when it already has at least
        ``target_len`` words.
    """
    # This helper can only add words.  A candidate that is already as long as
    # (or longer than) the target would only drift further from it.
    if word_len(cand) >= target_len:
        return cand

    for prefix in GENERIC_PREFIXES:
        phrase = f"{prefix} {cand}"
        if within_ratio(phrase, target_len):
            return phrase
    for suffix in GENERIC_SUFFIXES:
        phrase = f"{cand} {suffix}"
        if within_ratio(phrase, target_len):
            return phrase
    # Still too short with a single extra word: pad on both sides.
    return f"{random.choice(GENERIC_PREFIXES)} {cand} {random.choice(GENERIC_SUFFIXES)}"
|
| 361 |
+
|
| 362 |
+
# --- (J) مشتّتات ذكية تضمن دائمًا ≥3 خيارات فعلية + موازنة الطول ---
|
| 363 |
def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank: str, backup_terms: List[str], k: int = 3) -> List[str]:
|
| 364 |
target = correct.strip()
|
| 365 |
+
ref_phrase_cache["correct"] = target # لتجنّب مطابقة بعد التطويل
|
| 366 |
+
|
| 367 |
# 1) مصادر متعددة
|
| 368 |
neigh = nearest_terms(target, phrase_pool, k=48)
|
| 369 |
mlm = mlm_fill(sentence_with_blank, target, k=24)
|
|
|
|
| 399 |
if not filtered:
|
| 400 |
filtered = raw_pool[:max(24, k*6)]
|
| 401 |
|
| 402 |
+
# 4) موازنة الطول (أساسي): اجعل المشتّت قريب طولًا من الصحيحة
|
| 403 |
+
target_words = word_len(target)
|
| 404 |
shaped = []
|
| 405 |
for w in filtered:
|
| 406 |
+
cand = shape_phrase_like(target, w) # مواءمة "الـ"
|
| 407 |
+
if within_ratio(cand, target_words, tol=0.2):
|
| 408 |
+
shaped.append(cand)
|
| 409 |
+
continue
|
| 410 |
+
# جرّب توسيع بالـMLM
|
| 411 |
+
expanded = try_mlm_expand(cand, sentence_with_blank, target_words)
|
| 412 |
+
if expanded and within_ratio(expanded, target_words, tol=0.2):
|
| 413 |
+
shaped.append(expanded)
|
| 414 |
+
continue
|
| 415 |
+
# fallback بقوالب عامة
|
| 416 |
+
fb = fallback_expand(cand, target_words)
|
| 417 |
+
shaped.append(fb)
|
| 418 |
+
|
| 419 |
+
# إزالة أي تطويل خرج متطابقًا مع الصحيحة بعد التطبيع
|
| 420 |
+
shaped = [s for s in shaped if norm_ar(s) != norm_ar(target)]
|
| 421 |
|
| 422 |
# 5) ترتيب بالانسجام
|
| 423 |
ranked = rank_by_sentence_coherence(sentence_with_blank, target, shaped, topk=max(k, 12))
|
| 424 |
|
| 425 |
+
# 6) خذ أفضل k؛ وإن لم يكفِ، أكمل من shaped ثم filtered ثم raw_pool ثم backup_terms
|
| 426 |
out = []
|
| 427 |
for src in [ranked, shaped, filtered, raw_pool, backup_terms]:
|
| 428 |
for w in src:
|
|
|
|
| 433 |
|
| 434 |
# ضمان العدد بدون أي placeholders
|
| 435 |
if len(out) < k:
|
|
|
|
| 436 |
while len(out) < k and ranked:
|
| 437 |
out.append(ranked[len(out) % len(ranked)])
|
| 438 |
|
|
|
|
| 479 |
|
| 480 |
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
|
| 481 |
|
| 482 |
+
# مشتّتات ذكية تضمن دائمًا ≥3 ومتوازنة الطول
|
| 483 |
pool = [x for x in keyphrases if x != kp]
|
| 484 |
distracts = smart_distractors(kp, pool, q, backup_terms, k=3)
|
| 485 |
|
|
|
|
| 501 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 502 |
txt=(it.choices[i] if i<len(it.choices) else "—").strip()
|
| 503 |
txt=txt.replace(",", "،").replace("?", "؟").replace(";", "؛")
|
| 504 |
+
# منع أي Placeholder يظهر للمستخدم
|
| 505 |
if txt == "—" or not txt:
|
| 506 |
txt = "خيار"
|
| 507 |
opts.append({"id":lbl,"text":txt or "خيار","is_correct":(i==it.answer_index)})
|
|
|
|
| 584 |
.q-header{display:flex;gap:10px;align-items:center;justify-content:space-between;margin-bottom:6px}
|
| 585 |
.q-title{color:#eaeaf2;font-weight:800}
|
| 586 |
.q-badge{padding:8px 12px;border-radius:10px;font-weight:700}
|
| 587 |
+
.q-badge.ok{background:#0f2f22;color:#b6f4db;border:1px solid #145b44}
|
| 588 |
.q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
|
| 589 |
|
| 590 |
.q-text{color:var(--text);font-size:1.06rem;line-height:1.8;margin:8px 0 12px}
|