Spaces:

Leen172
/

Question_generator

Sleeping

App Files Files Community

Leen172 committed on Nov 2

Commit

82f92ea

verified ·

1 Parent(s): 8285ac1

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -57

app.py CHANGED Viewed

@@ -132,9 +132,9 @@ def yake_keywords(t: str, k: int = 160) -> List[str]:
             pairs = []
         for w, _ in pairs:
             w = re2.sub(r"\s+", " ", w.strip())
-            if not w or w in seen:
                 continue
-            if re2.match(r"^[\p{P}\p{S}\d_]+$", w):
                 continue
             if 2 <= len(w) <= 40:
                 phrases.append(w)
@@ -144,6 +144,58 @@ def yake_keywords(t: str, k: int = 160) -> List[str]:
 def good_kw(kw:str)->bool:
     return kw and len(kw)>=2 and kw not in AR_STOP and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
 # ====== (2) جيران دلاليًا + (3) FILL-MASK كبديل ======
 _EMB = None
 def get_embedder():
@@ -211,79 +263,130 @@ def legacy_distractors(correct:str, pool:List[str], k:int=3)->List[str]:
         if abs(len(w)-L)<=3: cand.append(w)
     random.shuffle(cand)
     out=cand[:k]
-    while len(out)<k: out.append("—")
     return out
-def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3) -> List[str]:
-    # 1) جيران دلاليًا
-    neigh = nearest_terms(correct, phrase_pool, k=12)
-    neigh = [w for w,sim in neigh if w != correct][:k+4]
-    # 2) FILL-MASK على الجملة (بديل)
-    if len(neigh) < k:
-        mlm = mlm_distractors(sentence.replace(correct, "_____"), correct, k=10)
-        for w in mlm:
-            if w not in neigh and w != correct:
-                neigh.append(w)
-            if len(neigh) >= k+4:
-                break
-    # 3) فلترة خفيفة
-    out = []
-    L = len(correct)
-    for w in neigh:
-        if w in AR_STOP:
             continue
-        if abs(len(w) - L) > max(6, L//2):
             continue
         if norm_ar(w) == norm_ar(correct):
             continue
-        out.append(w)
-        if len(out) >= k:
             break
-    # 4) رجوع للخطة القديمة إذا ما كفى
     if len(out) < k:
-        extra = legacy_distractors(correct, phrase_pool, k=k-len(out))
-        out.extend(extra)
-    while len(out) < k:
-        out.append("—")
-    return out
 # ====== (4) مُولِّد أسئلة جديد بمحافظته على نفس الواجهة تمامًا ======
 def make_mcqs(text:str, n:int=6)->List[MCQ]:
     sents=split_sents(text)
-    if not sents:
         raise ValueError("النص قصير أو غير صالح.")
-    # عبارات مفتاحية 1–3 كلمات
     keyphrases = yake_keywords(text, k=160)
-    keyphrases = [kp for kp in keyphrases if good_kw(kp) and 2 <= len(kp) <= 40]
-    # ربط العبارة بجملة مناسبة (طول معقول ≥ 60) لضمان سياق واضح
     sent_for={}
     for s in sents:
-        if len(s) < 60:
             continue
         for kp in keyphrases:
-            if kp in sent_for:
                 continue
-            if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
                 sent_for[kp]=s
     if not sent_for:
-        # fallback: لو ما لقينا مطابقات جيدة، نرجع للمفردات العامة من النص
         tokens = [t for t in re2.findall(r"[\p{L}\p{N}_]+", text) if good_kw(t)]
         freq = [w for w,_ in sorted(((t, text.count(t)) for t in tokens), key=lambda x:-x[1])]
-        keyphrases = freq[:120]
         for s in sents:
-            if len(s) < 60:
                 continue
             for kp in keyphrases:
                 if kp in sent_for:
                     continue
-                if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
                     sent_for[kp]=s
             if len(sent_for)>=n*2:
                 break
@@ -296,7 +399,7 @@ def make_mcqs(text:str, n:int=6)->List[MCQ]:
     for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
         if len(items)>=n: break
         s=sent_for[kp]
-        if s in used_sents or kp in used_keys:
             continue
         # ابنِ سؤال الفراغ
@@ -305,23 +408,47 @@ def make_mcqs(text:str, n:int=6)->List[MCQ]:
         # مشتتات أذكى (مع رجوع تلقائي لو النماذج مش متاحة)
         pool = [x for x in keyphrases if x != kp]
         ch = smart_distractors(kp, pool, s, k=3) + [kp]
-        random.shuffle(ch); ans=ch.index(kp)
         items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
         used_sents.add(s); used_keys.add(kp)
-    if not items:
         raise RuntimeError("تعذّر توليد أسئلة.")
     return items
 def to_records(items:List[MCQ])->List[dict]:
     recs=[]
     for it in items:
         opts=[]
         for i,lbl in enumerate(["A","B","C","D"]):
-            txt=(it.choices[i] if i<len(it.choices) else "—").strip()
-            txt=txt.replace(",", "،").replace("?", "؟").replace(";", "؛")
-            opts.append({"id":lbl,"text":txt or "—","is_correct":(i==it.answer_index)})
         recs.append({"id":it.id,"question":it.question.strip(),"options":opts})
     return recs
@@ -333,7 +460,7 @@ def render_quiz_html(records: List[dict]) -> str:
         qtxt = rec["question"]
         cor  = next((o["id"] for o in rec["options"] if o["is_correct"]), "")
         opts_html=[]
-        for o in rec["options"]:
             lid, txt = o["id"], o["text"]
             opts_html.append(f"""
                 <label class="opt" data-letter="{lid}">
@@ -372,7 +499,7 @@ def build_quiz(text_area, file_path, n, model_id, zoom):
     recs    = to_records(items)
     return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), ""
-# ------------------ CSS ------------------
 CSS = """
 :root{
   --bg:#0e0e11; --panel:#15161a; --card:#1a1b20; --muted:#a7b0be;
@@ -404,7 +531,7 @@ textarea{min-height:120px}
 .q-badge.ok{background:#083a2a;color:#b6f4db;border:1px solid #145b44}
 .q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
-.q-text{color:var(--text);font-size:1.06rem;line-height:1.8;margin:8px 0 12px}
 .opts{display:flex;flex-direction:column;gap:8px}
 .opt{display:flex;gap:10px;align-items:center;background:#14161c;border:1px solid #2a2d3a;border-radius:12px;padding:10px;transition:background .15s,border-color .15s}
 .opt input{accent-color:var(--accent2)}
@@ -422,7 +549,7 @@ textarea{min-height:120px}
 .q-note.warn{color:#ffd1d6}
 """
-# ------------------ JS: ربط Submit بعد الرندر (مع Output مخفي لضمان التنفيذ) ------------------
 ATTACH_LISTENERS_JS = """
 () => {
   // اربط مرة واحدة فقط
@@ -448,7 +575,7 @@ ATTACH_LISTENERS_JS = """
     const chosenLabel = chosen.closest('.opt');
-    // حالة صحيحة: لوّن أخضر وأقفل السؤال كاملاً
     if (chosen.value === correct) {
       chosenLabel.classList.add('ok');
       if (badge){ badge.hidden=false; badge.className='q-badge ok'; badge.textContent='Correct!'; }
@@ -456,21 +583,33 @@ ATTACH_LISTENERS_JS = """
       card.querySelectorAll('input[type="radio"]').forEach(i => i.disabled = true);
       e.target.disabled = true;
       if (note) note.textContent = '';
       return;
     }
     // حالة خاطئة: لوّن أحمر فقط، ولا تعطل أي شيء — ليقدر يجرّب خيار آخر
-    chosenLabel.classList.add('err');         // اتركه أحمر
     if (badge){ badge.hidden=false; badge.className='q-badge err'; badge.textContent='Incorrect.'; }
     if (note) note.textContent = '';
-    // مهم: لا تعطّل الراديو ولا الزر
   });
   return 'wired-multi2';
 }
 """
-# ------------------ واجهة Gradio ------------------
 with gr.Blocks(title="Question Generator", css=CSS) as demo:
     gr.Markdown("<h2 class='top'>Question Generator</h2>")

             pairs = []
         for w, _ in pairs:
             w = re2.sub(r"\s+", " ", w.strip())
+            if not w or w in seen:
                 continue
+            if re2.match(r"^[\p{P}\p{S}\d_]+$", w):
                 continue
             if 2 <= len(w) <= 40:
                 phrases.append(w)
 def good_kw(kw:str)->bool:
     """Return True when *kw* is usable as a keyword candidate.

     A keyword is rejected when it is empty/None, shorter than two
     characters, listed in ``AR_STOP``, or made up solely of punctuation,
     symbols, digits and underscores.

     The result is always a real ``bool``. The previous ``and`` chain leaked
     ``""`` or ``None`` for falsy inputs, contradicting the annotation.
     """
     # Cheap checks first so the regex only runs on plausible candidates.
     if not kw or len(kw) < 2 or kw in AR_STOP:
         return False
     return not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
# ====== "Smartness" improvements: optional POS/NER with a fallback ======
# CAMeL Tools is an optional dependency. The analyzer and recognizer handles
# are always defined (as None when unavailable) so that no code path can hit
# a NameError, and the availability flag is raised only after BOTH models
# have loaded successfully.
_HAS_CAMEL = False
_AN = None
_NER = None
try:
    from camel_tools.tokenizers.word import simple_word_tokenize
    from camel_tools.morphology.analyzer import Analyzer
    from camel_tools.ner import NERecognizer
    _AN = Analyzer.builtin_analyzer()
    _NER = NERecognizer.pretrained()
    _HAS_CAMEL = True
except Exception:
    # Deliberate best-effort: a missing package or missing model data simply
    # disables the POS/NER refinements; callers fall back to heuristics.
    _HAS_CAMEL = False
    _AN = None
    _NER = None
NER_TAGS = {"PER","LOC","ORG","MISC"}  # proper-noun entity classes
def ar_pos(word: str) -> str:
    """Return a coarse part-of-speech tag for a single word.

    With CAMeL Tools available, every morphological analysis of *word* votes
    with its ``'pos'`` value and the most frequent one wins; ``"X"`` is
    returned when there are no analyses or the analyzer raises.

    Without CAMeL Tools a small regex heuristic is used: a fixed list of
    particles maps to ``"PART"``, an all-numeric token to ``"NUM"``, a few
    common noun endings to ``"NOUN"``, and anything else to ``"X"``.
    """
    if _HAS_CAMEL:
        try:
            analyses = _AN.analyze(word)
            if not analyses:
                return "X"
            from collections import Counter
            votes = Counter(entry.get('pos','X') for entry in analyses)
            # most_common keeps first-seen order among equally frequent tags.
            return votes.most_common(1)[0][0] if votes else "X"
        except Exception:
            return "X"

    # Simplified fallback: first matching rule decides the tag.
    rules = (
        (re2.match, r"^(في|على|الى|إلى|من|عن|حتى|ثم|بل|لكن|أو|و)$", "PART"),
        (re2.match, r"^[\p{N}]+$", "NUM"),
        (re2.search, r"(ة|ات|ون|ين|ان)$", "NOUN"),
    )
    for probe, pattern, tag in rules:
        if probe(pattern, word):
            return tag
    return "X"
def is_named_entity(token: str) -> bool:
    """Return True when any word of *token* is tagged as a named entity.

    Always False when CAMeL Tools is unavailable or the recognizer raises.

    *token* may be a multi-word key phrase, so it is split on whitespace
    before being handed to the recognizer, which takes a list of words.

    The recognizer's labels are BIO-prefixed (``B-LOC``, ``I-PERS``, ...,
    or ``O``), so the prefix is removed before comparing against
    ``NER_TAGS``. The person class is spelled ``PERS`` by the recognizer and
    is mapped to ``PER`` to agree with ``NER_TAGS``. Comparing the raw label
    directly, as before, could never match.
    """
    if not _HAS_CAMEL:
        return False
    words = token.split()
    if not words:
        return False
    try:
        tags = _NER.predict_sentence(words)
    except Exception:
        # Best-effort filter: a recognizer failure must not break generation.
        return False
    for tag in tags:
        # "B-PERS" -> "PERS", "I-LOC" -> "LOC", "O" -> "O", "PER" -> "PER"
        label = str(tag).rpartition("-")[2]
        if label == "PERS":
            label = "PER"
        if label in NER_TAGS:
            return True
    return False
def is_clean_sentence(s: str) -> bool:
    """Tell whether a sentence is a suitable host for a fill-in-the-blank item.

    A sentence qualifies when its length is between 70 and 220 characters
    (inclusive), it contains no URL-like text (``http://``, ``https://`` or
    ``www.``), and it has no run of two or more digits.
    """
    length = len(s)
    if length < 70 or length > 220:
        return False
    has_link = re2.search(r"https?://|www\.", s)
    if has_link:
        return False
    has_number_run = re2.search(r"\d{2,}", s)
    return not has_number_run
def safe_keyword(k: str) -> bool:
    """Return True when *k* may be blanked out as a question answer.

    The keyword must pass ``good_kw``, must not be recognised as a named
    entity, and must not be tagged as a pronoun or particle by ``ar_pos``.
    """
    if not good_kw(k) or is_named_entity(k):
        return False
    return ar_pos(k) not in {"PRON","PART"}
 # ====== (2) جيران دلاليًا + (3) FILL-MASK كبديل ======
 _EMB = None
 def get_embedder():
         if abs(len(w)-L)<=3: cand.append(w)
     random.shuffle(cand)
     out=cand[:k]
+    while len(out)<k: out.append("…")
     return out
# ====== Optional cross-encoder used for ranking ======
# None  -> loading has not been attempted yet
# False -> loading was attempted and failed (do not retry)
_CE = None
def get_cross_encoder():
    """Return the lazily loaded cross-encoder, or ``False`` if unavailable.

    The first call tries to import ``sentence_transformers`` and build the
    model; any failure is remembered as ``False`` so later calls return
    immediately instead of retrying.
    """
    global _CE
    if _CE is not None:
        return _CE
    try:
        from sentence_transformers import CrossEncoder
        loaded = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    except Exception:
        loaded = False
    _CE = loaded
    return _CE
def pos_compatible(a: str, b: str) -> bool:
    """Return True when *a* and *b* may share a part of speech.

    The unknown tag ``"X"`` on either side is treated as compatible with
    anything; otherwise the two ``ar_pos`` tags must be equal.
    """
    tag_a = ar_pos(a)
    tag_b = ar_pos(b)
    return tag_a == "X" or tag_b == "X" or tag_a == tag_b
def length_close(a: str, b: str) -> bool:
    """Return True when the length of *a* is within tolerance of *b*'s.

    The tolerance is half the length of *b* (integer division), but never
    less than 6 characters. It is measured relative to *b*, so the check is
    not symmetric in its arguments.
    """
    tolerance = len(b) // 2
    if tolerance < 6:
        tolerance = 6
    gap = abs(len(a) - len(b))
    return gap <= tolerance
def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
    """Order *candidates* by cross-encoder score, best first.

    Each candidate is substituted for the ``_____`` placeholder and scored
    as the pair ``(blanked sentence, filled sentence)``. The input list is
    returned unchanged when no cross-encoder is available, when there are no
    candidates, or when scoring fails for any reason.

    Why pairs: ``CrossEncoder.predict`` scores *pairs* of texts. Handing it
    a flat list of strings makes it treat the whole list as one single
    input, so the result cannot be matched to the candidates and ranking
    silently degraded to a no-op.
    """
    ce = get_cross_encoder()
    if not ce or not candidates:
        return candidates
    pairs = [(sentence_with_blank, sentence_with_blank.replace("_____", c)) for c in candidates]
    try:
        scores = [float(s) for s in ce.predict(pairs)]
    except Exception:
        # Best-effort: ranking is optional, keep the incoming order.
        return candidates
    if len(scores) != len(candidates):
        return candidates
    # sorted() is stable, so equally scored candidates keep their order.
    order = sorted(range(len(candidates)), key=lambda i: -scores[i])
    return [candidates[i] for i in order]
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3) -> List[str]:
    """Pick *k* wrong-answer options for the blank left by *correct*.

    Pipeline:
      1. gather candidates from ``nearest_terms`` (semantic neighbours in
         *phrase_pool*) and from ``mlm_distractors`` (fill-mask suggestions
         for the blanked sentence);
      2. drop blanks, duplicates, the answer itself, named entities,
         POS-incompatible words, words of very different length, and words
         equal to the answer after ``norm_ar`` normalisation;
      3. optionally re-rank with the cross-encoder and keep the top ``2*k``;
      4. optionally drop candidates whose embedding similarity to the answer
         is >= 0.92, but only if at least *k* candidates survive;
      5. top up from *phrase_pool*, then from ``legacy_distractors``, and
         finally pad with the ``"…"`` placeholder.

    Returns a list of exactly *k* strings; real words in it are unique.
    """
    # Every occurrence of the answer text is blanked, as str.replace does.
    blanked = sentence.replace(correct, "_____")

    # 1) Semantic neighbours, then fill-mask suggestions not already present.
    base = [w for w, _ in nearest_terms(correct, phrase_pool, k=20)]
    for w in mlm_distractors(blanked, correct, k=15):
        if w not in base:
            base.append(w)

    # 2) POS/NER, length and normalisation filters, with de-duplication on
    #    the stripped form so the same word cannot be offered twice.
    correct_norm = norm_ar(correct)
    clean = []
    seen = set()
    for w in base:
        w = w.strip()
        if not w or w == correct or w in seen:
            continue
        seen.add(w)
        if is_named_entity(w):
            continue
        if not pos_compatible(w, correct):
            continue
        if not length_close(w, correct):
            continue
        if norm_ar(w) == correct_norm:
            continue
        clean.append(w)

    # 3) Optional cross-encoder ranking; keep a margin for the next filter.
    clean = rank_by_ce(blanked, clean)[:max(k*2, k)]

    # 4) Remove candidates that are nearly synonymous with the answer.
    try:
        emb = get_embedder()
        if emb and clean:
            vecs = emb.encode([correct] + clean, normalize_embeddings=True)
            # Vectors are normalised, so the dot product is cosine similarity.
            sims = vecs[1:] @ vecs[0]
            filtered = [w for w, s in zip(clean, sims) if s < 0.92]
            if len(filtered) >= k:
                clean = filtered
    except Exception:
        # Deliberate best-effort: the similarity filter is optional.
        pass

    out = clean[:k]

    # 5a) Top up with length-compatible pool phrases not already chosen.
    if len(out) < k:
        for w in phrase_pool:
            if len(out) >= k:
                break
            if w != correct and w not in out and length_close(w, correct):
                out.append(w)

    # 5b) Last resort: legacy picker, skipping words already chosen, then
    #     pad with the same placeholder the legacy picker uses.
    if len(out) < k:
        for w in legacy_distractors(correct, phrase_pool, k=k-len(out)):
            if w == "…" or w not in out:
                out.append(w)
        while len(out) < k:
            out.append("…")
    return out[:k]
 # ====== (4) مُولِّد أسئلة جديد بمحافظته على نفس الواجهة تمامًا ======
 def make_mcqs(text:str, n:int=6)->List[MCQ]:
     sents=split_sents(text)
+    if not sents:
         raise ValueError("النص قصير أو غير صالح.")
+    # عبارات مفتاحية 1–3 كلمات + فلترة أذكى
     keyphrases = yake_keywords(text, k=160)
+    keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]
+    # ربط العبارة بجملة مناسبة (نظيفة، ظهور وحيد للعبارة)
     sent_for={}
     for s in sents:
+        if not is_clean_sentence(s):
             continue
         for kp in keyphrases:
+            if kp in sent_for:
                 continue
+            hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
+            if len(hits) == 1:
                 sent_for[kp]=s
+        if len(sent_for)>=n*3:
+            break
     if not sent_for:
+        # fallback: لو ما لقينا مطابقات جيدة، نرجع للمفردات من النص
         tokens = [t for t in re2.findall(r"[\p{L}\p{N}_]+", text) if good_kw(t)]
         freq = [w for w,_ in sorted(((t, text.count(t)) for t in tokens), key=lambda x:-x[1])]
+        keyphrases = [w for w in freq if safe_keyword(w)][:120]
         for s in sents:
+            if not is_clean_sentence(s):
                 continue
             for kp in keyphrases:
                 if kp in sent_for:
                     continue
+                hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
+                if len(hits) == 1:
                     sent_for[kp]=s
             if len(sent_for)>=n*2:
                 break
     for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
         if len(items)>=n: break
         s=sent_for[kp]
+        if s in used_sents or kp in used_keys:
             continue
         # ابنِ سؤال الفراغ
         # مشتتات أذكى (مع رجوع تلقائي لو النماذج مش متاحة)
         pool = [x for x in keyphrases if x != kp]
         ch = smart_distractors(kp, pool, s, k=3) + [kp]
+        # تنظيف سريع وخلوّ من التكرار
+        clean_choices=[]
+        seen=set()
+        for c in ch:
+            c = c.strip()
+            if not c: continue
+            if c in seen: continue
+            seen.add(c)
+            clean_choices.append(c)
+        ch = clean_choices[:4]
+        # تأكيد وجود 4 خيارات
+        while len(ch)<4:
+            ch.append("…")
+        random.shuffle(ch); ans=ch.index(kp) if kp in ch else 3
         items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
         used_sents.add(s); used_keys.add(kp)
+    if not items:
         raise RuntimeError("تعذّر توليد أسئلة.")
     return items
def clean_option_text(t: str) -> str:
    """Normalise one answer option for display.

    Steps, in order: trim surrounding whitespace, remove everything matched
    by ``AR_DIAC``, collapse whitespace runs to a single space, and strip
    leading/trailing punctuation, symbols, underscores and hyphens.
    A falsy input or an empty result yields the placeholder ``"…"``.
    """
    text = t if t else ""
    text = text.strip()
    substitutions = (
        (AR_DIAC, ""),
        (r"\s+", " "),
        (r"^[\p{P}\p{S}_-]+|[\p{P}\p{S}_-]+$", ""),
    )
    for pattern, replacement in substitutions:
        text = re2.sub(pattern, replacement, text)
    return text if text else "…"
 def to_records(items:List[MCQ])->List[dict]:
     """Convert MCQ items into plain dict records with four labelled options.

     Each record has ``id``, ``question`` (stripped) and ``options``: four
     dicts labelled A-D with ``id``, ``text`` and ``is_correct``. Missing
     choices are filled with the ``"…"`` placeholder, Latin ``, ? ;`` are
     swapped for their Arabic counterparts, and every text goes through
     ``clean_option_text``.

     Option texts are made unique within a question. The correct answer's
     text is reserved first, so when a distractor collides with it after
     cleaning, the distractor is replaced by a ``…N`` placeholder. Before,
     whichever option came later was replaced, which could overwrite the
     correct answer itself.
     """
     labels = ("A", "B", "C", "D")
     recs = []
     for it in items:
         # Pass 1: clean all four option texts.
         texts = []
         for i in range(len(labels)):
             raw = it.choices[i] if i < len(it.choices) else "…"
             texts.append(clean_option_text(raw.replace(",", "،").replace("?", "؟").replace(";", "؛")))
         # Reserve the correct answer's text so it is never the one replaced.
         used = set()
         if 0 <= it.answer_index < len(texts):
             used.add(texts[it.answer_index])
         # Pass 2: emit options, replacing duplicate distractors.
         opts = []
         for i, (lbl, txt) in enumerate(zip(labels, texts)):
             is_correct = (i == it.answer_index)
             if not is_correct:
                 if txt in used:
                     txt = f"…{i+1}"
                 used.add(txt)
             opts.append({"id":lbl,"text":txt,"is_correct":is_correct})
         recs.append({"id":it.id,"question":it.question.strip(),"options":opts})
     return recs
         qtxt = rec["question"]
         cor  = next((o["id"] for o in rec["options"] if o["is_correct"]), "")
         opts_html=[]
+        for o in rec["options"]]:
             lid, txt = o["id"], o["text"]
             opts_html.append(f"""
                 <label class="opt" data-letter="{lid}">
     recs    = to_records(items)
     return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), ""
+# ------------------ CSS (كما هو) ------------------
 CSS = """
 :root{
   --bg:#0e0e11; --panel:#15161a; --card:#1a1b20; --muted:#a7b0be;
 .q-badge.ok{background:#083a2a;color:#b6f4db;border:1px solid #145b44}
 .q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
+.q-text{color:#eaeaf2;font-size:1.06rem;line-height:1.8;margin:8px 0 12px}
 .opts{display:flex;flex-direction:column;gap:8px}
 .opt{display:flex;gap:10px;align-items:center;background:#14161c;border:1px solid #2a2d3a;border-radius:12px;padding:10px;transition:background .15s,border-color .15s}
 .opt input{accent-color:var(--accent2)}
 .q-note.warn{color:#ffd1d6}
 """
+# ------------------ JS: ربط Submit بعد الرندر (مع تحسين إبراز الصحيحة) ------------------
 ATTACH_LISTENERS_JS = """
 () => {
   // اربط مرة واحدة فقط
     const chosenLabel = chosen.closest('.opt');
+    // حالة صحيحة: لوّن أخضر وأقفل السؤال كاملاً + إبراز الكلمة الصحيحة داخل الجملة
     if (chosen.value === correct) {
       chosenLabel.classList.add('ok');
       if (badge){ badge.hidden=false; badge.className='q-badge ok'; badge.textContent='Correct!'; }
       card.querySelectorAll('input[type="radio"]').forEach(i => i.disabled = true);
       e.target.disabled = true;
       if (note) note.textContent = '';
+      // إبراز الجواب الصحيح ضمن الجملة الحالية دون تغيير البنية
+      const qNode = card.querySelector('.q-text');
+      if (qNode){
+        const full = qNode.textContent || '';
+        const correctText = [...card.querySelectorAll('.opt')].find(o =>
+          o.querySelector('input').value === correct
+        )?.querySelector('.opt-text')?.textContent || '';
+        if (full && correctText){
+          const highlighted = full.replace('_____', `<mark style="background:#2dd4bf22;border:1px solid #2dd4bf55;border-radius:6px;padding:0 4px">${correctText}</mark>`);
+          qNode.innerHTML = highlighted;
+        }
+      }
       return;
     }
     // حالة خاطئة: لوّن أحمر فقط، ولا تعطل أي شيء — ليقدر يجرّب خيار آخر
+    chosenLabel.classList.add('err');
     if (badge){ badge.hidden=false; badge.className='q-badge err'; badge.textContent='Incorrect.'; }
     if (note) note.textContent = '';
   });
   return 'wired-multi2';
 }
 """
+# ------------------ واجهة Gradio (بدون تغيير بنية الواجهات) ------------------
 with gr.Blocks(title="Question Generator", css=CSS) as demo:
     gr.Markdown("<h2 class='top'>Question Generator</h2>")