Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -398,12 +398,54 @@ def similarity_caps(difficulty: str):
|
|
| 398 |
if difficulty == "صعب":
|
| 399 |
return 0.95
|
| 400 |
return 0.92
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
|
| 402 |
# ====== مشتّتات ذكية مع الصعوبة وBM25 ======
|
| 403 |
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
|
| 404 |
all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
|
| 405 |
base = []
|
| 406 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
# (أ) جيران دلاليين
|
| 408 |
base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
|
| 409 |
|
|
@@ -478,36 +520,45 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 478 |
out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
|
| 479 |
return out[:k]
|
| 480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
|
| 482 |
-
|
|
|
|
|
|
|
|
|
|
| 483 |
try:
|
| 484 |
-
ex = yake.KeywordExtractor(lan='ar', n=3, top=
|
| 485 |
pairs = ex.extract_keywords(sentence)
|
| 486 |
except Exception:
|
| 487 |
pairs = []
|
| 488 |
-
# رتب حسب طول العبارة (أطول غالبًا أوضح) مع وزن خفيف لتكرارها بالنص
|
| 489 |
cands = []
|
| 490 |
for w, _ in pairs:
|
| 491 |
w = re2.sub(r"\s+", " ", w.strip())
|
| 492 |
-
if not w:
|
| 493 |
-
continue
|
| 494 |
-
if not good_kw(w) or not safe_keyword(w):
|
| 495 |
continue
|
| 496 |
-
if
|
| 497 |
continue
|
| 498 |
-
# لازم تظهر فعليًا ضمن الجملة نصيًا
|
| 499 |
if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
|
| 500 |
continue
|
| 501 |
-
# وزن بالتكرار العام لتمييز الأهم
|
| 502 |
freq_weight = global_text.count(w)
|
| 503 |
-
cands.append((w, len(w) + 0.
|
| 504 |
if not cands:
|
| 505 |
-
# fallback أبسط: التقط أطول “كلمة” معقولة
|
| 506 |
toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
|
|
|
|
| 507 |
toks.sort(key=len, reverse=True)
|
| 508 |
return toks[0] if toks else None
|
| 509 |
cands.sort(key=lambda x: -x[1])
|
| 510 |
-
return cands[0][0]
|
|
|
|
| 511 |
|
| 512 |
# ====== (4-أ) مُولِّد أسئلة "فراغ" ======
|
| 513 |
def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
|
|
@@ -536,6 +587,17 @@ def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
|
|
| 536 |
used_paras = set()
|
| 537 |
|
| 538 |
items: List[MCQ] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
|
| 540 |
# (أ) تمريرة: سؤال واحد من كل فقرة
|
| 541 |
for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
|
|
@@ -591,6 +653,31 @@ def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
|
|
| 591 |
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
|
| 592 |
pool = [x for x in keyphrases if x != kp] or keyphrases[:]
|
| 593 |
ch = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
|
| 595 |
uniq, seen = [], set()
|
| 596 |
for c in ch:
|
|
|
|
| 398 |
if difficulty == "صعب":
|
| 399 |
return 0.95
|
| 400 |
return 0.92
|
| 401 |
+
def typo_like_variants(answer: str, k: int = 4) -> List[str]:
    """Build look-alike distractors by applying small spelling edits to the answer.

    The edits, in the order they are tried:
      1. toggle the definite article "ال" (strip it if present, else prepend it);
      2. swap ي -> ى, then ى -> ي;
      3. swap ة -> ه, then ه -> ة;
      4. drop the middle character when the answer is longer than five characters.

    Args:
        answer: The correct answer; it is passed through ``norm_ar`` before editing.
        k: Maximum number of variants to return.

    Returns:
        At most ``k`` distinct variants in the order listed above. Empty strings
        and variants whose ``norm_ar`` form equals that of the answer are dropped.
    """
    base = norm_ar(answer)
    base_norm = norm_ar(base)

    candidates = [
        # definite / indefinite form
        base[2:] if base.startswith("ال") else "ال" + base,
        # ي / ى
        base.replace("ي", "ى"),
        base.replace("ى", "ي"),
        # ة / ه
        base.replace("ة", "ه"),
        base.replace("ه", "ة"),
    ]
    # Drop the middle character only when the word is long enough to stay readable.
    if len(base) > 5:
        mid = len(base) // 2
        candidates.append(base[:mid] + base[mid + 1:])

    # dict.fromkeys de-duplicates while keeping insertion order. A plain set was
    # used before, so which variants survived the [:k] cut varied between runs.
    unique = dict.fromkeys(candidates)
    out = [v for v in unique if v and norm_ar(v) != base_norm]
    return out[:k]
|
| 423 |
|
| 424 |
# ====== مشتّتات ذكية مع الصعوبة وBM25 ======
|
| 425 |
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
|
| 426 |
all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
|
| 427 |
base = []
|
| 428 |
|
| 429 |
+
# (0) مشتّتات شكلية أولاً (تعريف/تنكير، ي/ى، ة/ه، حذف حرف...)
|
| 430 |
+
for v in typo_like_variants(correct, k=4):
|
| 431 |
+
base.append(v)
|
| 432 |
+
|
| 433 |
+
# (أ) جيران دلاليين من العبارات
|
| 434 |
+
base.extend([w for w, _ in nearest_terms(correct, phrase_pool, k=24)])
|
| 435 |
+
|
| 436 |
+
# (ب) FILL-MASK على الجملة
|
| 437 |
+
for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=18):
|
| 438 |
+
if w not in base:
|
| 439 |
+
base.append(w)
|
| 440 |
+
|
| 441 |
+
# (ج) BM25 من النص كله
|
| 442 |
+
if all_sentences:
|
| 443 |
+
bm, corp = bm25_build(all_sentences)
|
| 444 |
+
for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
|
| 445 |
+
if w not in base:
|
| 446 |
+
base.append(w)
|
| 447 |
+
|
| 448 |
+
|
| 449 |
# (أ) جيران دلاليين
|
| 450 |
base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
|
| 451 |
|
|
|
|
| 520 |
out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
|
| 521 |
return out[:k]
|
| 522 |
|
| 523 |
+
def looks_like_title_fragment(s: str) -> bool:
    """Return True when ``s`` has a colon close to its start.

    "Close" means the first colon's index is below ``max(10, len(s) // 6)``.
    """
    colon_at = s.find(":")
    if colon_at < 0:
        return False
    limit = max(10, len(s) // 6)
    return colon_at < limit
|
| 525 |
+
|
| 526 |
+
def is_nouny_phrase(w: str) -> bool:
    """Heuristically decide whether ``w`` looks like a short noun phrase.

    A phrase is accepted when ``word_tokens(w)`` yields one to four tokens and
    its last word is not one of a small list of verbs/particles.

    Args:
        w: Candidate phrase.

    Returns:
        True if the phrase passes both checks, False otherwise.
    """
    toks = word_tokens(w)
    if not 1 <= len(toks) <= 4:
        return False
    # Match the stop-words as whole words only (optionally carrying a one-letter
    # proclitic و/ف/ل). The bare "...$" form also rejected any noun that merely
    # ends in "قد", such as "عقد" or "نقد".
    if re2.search(r"(?<!\p{L})[وفل]?(?:يفعل|تفعل|يشهد|تقوم|يمكن|قد|سوف)$", w):
        return False
    return True
|
| 532 |
+
|
| 533 |
def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
    """Choose the phrase inside ``sentence`` that should be blanked out.

    Args:
        sentence: The sentence to search. If it looks like a "title: body"
            fragment, only the text after the first colon is considered.
        global_text: The full text; occurrences of a candidate in it raise
            that candidate's score.

    Returns:
        The highest-scoring YAKE candidate (score = length + 0.7 * occurrences in
        ``global_text``), or, when no candidate survives filtering, the longest
        acceptable single word in the sentence, or ``None`` if there is none.
    """
    if looks_like_title_fragment(sentence):
        # Never blank out the heading part that precedes the colon.
        _head, colon, tail = sentence.partition(":")
        if colon:
            sentence = tail

    # Best effort: any extractor failure just means "no YAKE candidates".
    try:
        extractor = yake.KeywordExtractor(lan='ar', n=3, top=24)
        extracted = extractor.extract_keywords(sentence)
    except Exception:
        extracted = []

    scored = []
    for phrase, _yake_score in extracted:
        phrase = re2.sub(r"\s+", " ", phrase.strip())
        if not (phrase and good_kw(phrase) and safe_keyword(phrase)
                and is_nouny_phrase(phrase)):
            continue
        # The phrase must literally occur in the sentence as whole words.
        literal = rf"(?<!\p{{L}}){re2.escape(phrase)}(?!\p{{L}})"
        if re2.search(literal, sentence):
            occurrences = global_text.count(phrase)
            scored.append((phrase, len(phrase) + 0.7*occurrences))

    if scored:
        # max() returns the first of equally scored entries, as a stable sort would.
        return max(scored, key=lambda item: item[1])[0]

    # Fallback: the longest single word that passes the same filters.
    words = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
    words = [t for t in words if is_nouny_phrase(t)]
    return max(words, key=len) if words else None
|
| 561 |
+
|
| 562 |
|
| 563 |
# ====== (4-أ) مُولِّد أسئلة "فراغ" ======
|
| 564 |
def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
|
|
|
|
| 587 |
used_paras = set()
|
| 588 |
|
| 589 |
items: List[MCQ] = []
|
| 590 |
+
MAX_PER_PARA = 2 # كان فعلياً 1 ضمنياً، هيك بنسمح بحد أقصى 2
|
| 591 |
+
|
| 592 |
+
para_count = {}
|
| 593 |
+
...
|
| 594 |
+
# عند إضافة سؤال جديد:
|
| 595 |
+
pid = para_map.get(s, -1)
|
| 596 |
+
if para_count.get(pid, 0) >= MAX_PER_PARA:
|
| 597 |
+
continue
|
| 598 |
+
...
|
| 599 |
+
items.append(... )
|
| 600 |
+
para_count[pid] = para_count.get(pid, 0) + 1
|
| 601 |
|
| 602 |
# (أ) تمريرة: سؤال واحد من كل فقرة
|
| 603 |
for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
|
|
|
|
| 653 |
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
|
| 654 |
pool = [x for x in keyphrases if x != kp] or keyphrases[:]
|
| 655 |
ch = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
|
| 656 |
+
# بعد حساب ch = smart_distractors(...)+[kp]
|
| 657 |
+
choices = []
|
| 658 |
+
seen = set()
|
| 659 |
+
for c in ch:
|
| 660 |
+
c = (c or "").strip()
|
| 661 |
+
if not c or c == "…" or c in seen:
|
| 662 |
+
continue
|
| 663 |
+
if not choice_length_ok(c):
|
| 664 |
+
continue
|
| 665 |
+
if appears_as_long_fragment_in_sentence(c, s):
|
| 666 |
+
continue
|
| 667 |
+
if is_sub_or_super(c, kp) or jaccard(c, kp) >= 0.5:
|
| 668 |
+
continue
|
| 669 |
+
seen.add(c); choices.append(c)
|
| 670 |
+
# تأكيد 3 مشتّتات حقيقيّة
|
| 671 |
+
if kp not in choices:
|
| 672 |
+
choices.append(kp)
|
| 673 |
+
seen.add(kp)
|
| 674 |
+
# لو ما قدرنا نوصل 4 خيارات بدون "…" نحذف السؤال بدل تشويهه
|
| 675 |
+
if len(choices) < 4:
|
| 676 |
+
continue
|
| 677 |
+
choices = choices[:4]
|
| 678 |
+
random.shuffle(choices)
|
| 679 |
+
ans = choices.index(kp)
|
| 680 |
+
|
| 681 |
|
| 682 |
uniq, seen = [], set()
|
| 683 |
for c in ch:
|