Spaces:

Leen172
/

Question_generator

Sleeping

App Files Files Community

Leen172 committed on Nov 3

Commit

1839da1

verified ·

1 Parent(s): 5d84ed2

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -12

app.py CHANGED Viewed

@@ -398,12 +398,54 @@ def similarity_caps(difficulty: str):
     if difficulty == "صعب":
         return 0.95
     return 0.92
 # ====== مشتّتات ذكية مع الصعوبة وBM25 ======
 def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
                       all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
     base = []
     # (أ) جيران دلاليين
     base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
@@ -478,36 +520,45 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
         out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
     return out[:k]
 def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
-    """اختيار هدف مناسب للفراغ من نفس الجملة: YAKE على الجملة نفسها مع فلترة."""
     try:
-        ex = yake.KeywordExtractor(lan='ar', n=3, top=20)
         pairs = ex.extract_keywords(sentence)
     except Exception:
         pairs = []
-    # رتب حسب طول العبارة (أطول غالبًا أوضح) مع وزن خفيف لتكرارها بالنص
     cands = []
     for w, _ in pairs:
         w = re2.sub(r"\s+", " ", w.strip())
-        if not w:
-            continue
-        if not good_kw(w) or not safe_keyword(w):
             continue
-        if len(w) < 2 or len(w) > 40:
             continue
-        # لازم تظهر فعليًا ضمن الجملة نصيًا
         if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
             continue
-        # وزن بالتكرار العام لتمييز الأهم
         freq_weight = global_text.count(w)
-        cands.append((w, len(w) + 0.5*freq_weight))
     if not cands:
-        # fallback أبسط: التقط أطول “كلمة” معقولة
         toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
         toks.sort(key=len, reverse=True)
         return toks[0] if toks else None
     cands.sort(key=lambda x: -x[1])
-    return cands[0][0] if cands else None
 # ====== (4-أ) مُولِّد أسئلة "فراغ" ======
 def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
@@ -536,6 +587,17 @@ def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
     used_paras = set()
     items: List[MCQ] = []
     # (أ) تمريرة: سؤال واحد من كل فقرة
     for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
@@ -591,6 +653,31 @@ def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
             q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
             pool = [x for x in keyphrases if x != kp] or keyphrases[:]
             ch  = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
             uniq, seen = [], set()
             for c in ch:

     if difficulty == "صعب":
         return 0.95
     return 0.92
def typo_like_variants(answer: str, k: int = 4) -> List[str]:
    """Build surface-level ("typo-like") distractors from the correct answer.

    Candidates are derived from the normalized answer in a fixed order:
    toggling the definite article "ال", swapping ي/ى in both directions,
    swapping ة/ه in both directions, and, for answers longer than five
    characters, dropping the middle character.

    Args:
        answer: The correct answer text.
        k: Maximum number of variants to return.

    Returns:
        Up to ``k`` distinct, non-empty variants whose normalized form differs
        from the normalized answer, in the generation order described above.
        The order is deterministic, so the same answer always yields the same
        distractors.
    """
    a = norm_ar(answer)
    # An ordered list (not a set) keeps the ``[:k]`` cut reproducible: set
    # iteration order for strings changes between interpreter runs.
    candidates = []
    # Definite / indefinite toggle.
    if a.startswith("ال"):
        candidates.append(a[2:])
    else:
        candidates.append("ال" + a)
    # ي / ى swaps.
    candidates.append(a.replace("ي", "ى"))
    candidates.append(a.replace("ى", "ي"))
    # ة / ه swaps.
    candidates.append(a.replace("ة", "ه"))
    candidates.append(a.replace("ه", "ة"))
    # Drop the middle character when the answer is long enough.
    if len(a) > 5:
        mid = len(a) // 2
        candidates.append(a[:mid] + a[mid + 1:])

    # Final clean-up: drop empties, duplicates, and anything that normalizes
    # back to the answer itself (e.g. a swap that changed nothing).
    target = norm_ar(a)
    out = []
    seen = set()
    for v in candidates:
        if not v or v in seen:
            continue
        seen.add(v)
        if norm_ar(v) != target:
            out.append(v)
    return out[:k]
 # ====== مشتّتات ذكية مع الصعوبة وBM25 ======
 def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
                       all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
     base = []
+    # (0) مشتّتات شكلية أولاً (تعريف/تنكير، ي/ى، ة/ه، حذف حرف...)
+    for v in typo_like_variants(correct, k=4):
+        base.append(v)
+    # (أ) جيران دلاليين من العبارات
+    base.extend([w for w, _ in nearest_terms(correct, phrase_pool, k=24)])
+    # (ب) FILL-MASK على الجملة
+    for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=18):
+        if w not in base:
+            base.append(w)
+    # (ج) BM25 من النص كله
+    if all_sentences:
+        bm, corp = bm25_build(all_sentences)
+        for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
+            if w not in base:
+                base.append(w)
     # (أ) جيران دلاليين
     base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
         out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
     return out[:k]
def looks_like_title_fragment(s: str) -> bool:
    """Return True when the first ":" in *s* sits close to the start.

    "Close" means the colon's index is below the larger of 10 and one sixth
    of the string length. A string without a colon is never a title fragment.
    """
    colon_at = s.find(":")
    if colon_at < 0:
        return False
    threshold = max(10, len(s) // 6)
    return colon_at < threshold
def is_nouny_phrase(w: str) -> bool:
    """Cheap heuristic: could *w* be a noun phrase worth blanking out?

    Accepts phrases of 1-4 tokens (as split by ``word_tokens``) whose final
    word is not one of a few common verbs/particles. No part-of-speech
    analysis is performed.

    Args:
        w: Candidate phrase.

    Returns:
        True if the phrase passes both checks, False otherwise.
    """
    toks = word_tokens(w)
    if not 1 <= len(toks) <= 4:
        return False
    # The lookbehind makes the alternatives match only as a whole final word.
    # Without it, any noun that merely ends in those letters (e.g. "عقد",
    # "نقد" ending in "قد") was rejected as if it were the particle itself.
    if re2.search(r"(?<!\p{L})(?:يفعل|تفعل|يشهد|تقوم|يمكن|قد|سوف)$", w):
        return False
    return True
 def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
+    if looks_like_title_fragment(sentence):
+        # لا تفرّغ من شق العنوان قبل النقطتين
+        parts = sentence.split(":", 1)
+        sentence = parts[1] if len(parts) > 1 else sentence
     try:
+        ex = yake.KeywordExtractor(lan='ar', n=3, top=24)
         pairs = ex.extract_keywords(sentence)
     except Exception:
         pairs = []
     cands = []
     for w, _ in pairs:
         w = re2.sub(r"\s+", " ", w.strip())
+        if not w or not good_kw(w) or not safe_keyword(w):
             continue
+        if not is_nouny_phrase(w):
             continue
         if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
             continue
         freq_weight = global_text.count(w)
+        cands.append((w, len(w) + 0.7*freq_weight))
     if not cands:
         toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
+        toks = [t for t in toks if is_nouny_phrase(t)]
         toks.sort(key=len, reverse=True)
         return toks[0] if toks else None
     cands.sort(key=lambda x: -x[1])
+    return cands[0][0]
 # ====== (4-أ) مُولِّد أسئلة "فراغ" ======
 def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
     used_paras = set()
     items: List[MCQ] = []
+    MAX_PER_PARA = 2  # كان فعلياً 1 ضمنياً، هيك بنسمح بحد أقصى 2
+    para_count = {}
+    ...
+    # عند إضافة سؤال جديد:
+    pid = para_map.get(s, -1)
+    if para_count.get(pid, 0) >= MAX_PER_PARA:
+        continue
+    ...
+    items.append(... )
+    para_count[pid] = para_count.get(pid, 0) + 1
     # (أ) تمريرة: سؤال واحد من كل فقرة
     for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
             q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
             pool = [x for x in keyphrases if x != kp] or keyphrases[:]
             ch  = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
+            # بعد حساب ch = smart_distractors(...)+[kp]
+            choices = []
+            seen = set()
+            for c in ch:
+                c = (c or "").strip()
+                if not c or c == "…" or c in seen:
+                    continue
+                if not choice_length_ok(c):
+                    continue
+                if appears_as_long_fragment_in_sentence(c, s):
+                    continue
+                if is_sub_or_super(c, kp) or jaccard(c, kp) >= 0.5:
+                    continue
+                seen.add(c); choices.append(c)
+            # تأكيد 3 مشتّتات حقيقيّة
+            if kp not in choices:
+                choices.append(kp)
+                seen.add(kp)
+            # لو ما قدرنا نوصل 4 خيارات بدون "…" نحذف السؤال بدل تشويهه
+            if len(choices) < 4:
+                continue
+            choices = choices[:4]
+            random.shuffle(choices)
+            ans = choices.index(kp)
             uniq, seen = [], set()
             for c in ch: