Spaces:

Leen172
/

Question_generator

Sleeping

App Files Files Community

Leen172 committed on Nov 2

Commit

3ca8fa1

verified ·

1 Parent(s): 4988947

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -47

app.py CHANGED Viewed

@@ -420,71 +420,125 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
     if len(out) < k:
         out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
     return out[:k]
 # ====== (4-أ) مُولِّد أسئلة "فراغ" ======
 def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:  # pre-commit version: lines marked "-" were removed by this commit
     all_sents = split_sents(text)
     sents = pick_clean_sentences(all_sents, difficulty)
     if not sents:
-        raise ValueError("النص قصير أو غير صالح.")  # no usable sentences: abort instead of falling back
-    keyphrases = yake_keywords(text, k=200)
     keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]
-    # Map each phrase to a suitable sentence (phrase must occur exactly once in it)
-    sent_for={}
     for s in sents:
         for kp in keyphrases:
-            if kp in sent_for: continue
-            hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)  # lookarounds reject matches glued to another letter
-            if len(hits) == 1:
-                sent_for[kp]=s
-        if len(sent_for)>=n*3:  # stop scanning once 3x the requested number of candidates is collected
             break
-    if not sent_for:  # YAKE phrases matched nothing: retry with plain tokens ordered by frequency in the text
-        tokens = [t for t in re2.findall(r"[\p{L}\p{N}_]+", text) if good_kw(t)]
-        freq = [w for w,_ in sorted(((t, text.count(t)) for t in tokens), key=lambda x:-x[1])]
-        keyphrases = [w for w in freq if safe_keyword(w)][:150]
-        for s in sents:
-            for kp in keyphrases:
-                if kp in sent_for: continue
-                hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
-                if len(hits) == 1: sent_for[kp]=s
-            if len(sent_for)>=n*2:
-                break
-    if not sent_for:
-        raise RuntimeError("تعذّر توليد أسئلة من هذا النص.")
-    # Prioritize longer phrases (treated as more informative)
-    items=[]; used_sents=set(); used_keys=set()
-    for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):  # longest first, ties in string order
-        if len(items)>=n: break
-        s=sent_for[kp]
-        if s in used_sents or kp in used_keys: continue  # at most one question per sentence and per phrase
-        q=re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)  # blank out the first standalone occurrence
         pool = [x for x in keyphrases if x != kp]
         ch = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
-        # Clean up, de-duplicate, and guarantee four choices
-        clean_choices=[]; seen=set()
         for c in ch:
             c = c.strip()
-            if not c or c in seen: continue
-            seen.add(c); clean_choices.append(c)
-        ch = clean_choices[:4]
-        while len(ch)<4: ch.append("…")  # pad with placeholder options when fewer than four remain
-        random.shuffle(ch); ans=ch.index(kp) if kp in ch else 3  # NOTE(review): the index-3 fallback is taken after the shuffle, so it may not point at the correct answer — verify
-        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
-        used_sents.add(s); used_keys.add(kp)
     if not items:
         raise RuntimeError("تعذّر توليد أسئلة.")
-    return items
 # ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (mT5) ======
 _MT5 = {"tok": None, "model": None, "ok": False}
@@ -557,18 +611,36 @@ def gen_one_comp_q(sentence: str, tok, model, max_new_tokens=128) -> Optional[MC
 def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
     tok, model, ok = get_mt5()
     if not ok:
         return make_mcqs(text, n, difficulty=difficulty)
     sents_all = split_sents(text)
     sents = pick_clean_sentences(sents_all, difficulty)
     if not sents:
         return make_mcqs(text, n, difficulty=difficulty)
-    random.shuffle(sents)
     items: List[MCQ] = []
     tried = 0
-    for s in sents:
         if len(items) >= n: break
         mcq = gen_one_comp_q(s, tok, model)
         tried += 1
@@ -583,15 +655,15 @@ def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> Lis
                     seen.add(c); clean.append(c)
             clean = (clean + ["…","…","…","…"])[:4]
             ai = mcq.answer_index if isinstance(mcq.answer_index,int) and 0<=mcq.answer_index<4 else 0
             items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=clean, answer_index=ai))
-        if tried >= n * 7:
             break
     if not items:
         return make_mcqs(text, n, difficulty=difficulty)
     return items[:n]
 # ------------------ تحويل إلى سجلات العرض ------------------
 def clean_option_text(t: str) -> str:
     t = (t or "").strip()
@@ -656,16 +728,25 @@ def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
     raw = text_area if text_area else file_to_text(file_path, model_id=model_id, zoom=float(zoom))[0]
     cleaned = postprocess(raw)
     try:
         if mode == "فهم مباشر":
-            items = make_comp_mcqs(cleaned, n=int(n), difficulty=difficulty)
         else:
             items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
     except Exception:
         items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
     recs = to_records(items)
-    return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), ""
 # ------------------ CSS ------------------
 CSS = """

     if len(out) < k:
         out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
     return out[:k]
+def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
+    """Pick the phrase inside ``sentence`` that should become the blank.
+
+    Runs YAKE on the sentence alone, filters the candidates, and returns the
+    highest-scoring one (score = phrase length + 0.5 * number of occurrences
+    in ``global_text``). When no YAKE candidate survives the filters, falls
+    back to the longest letters-only token of the sentence that passes
+    ``good_kw`` and ``safe_keyword``. Returns ``None`` when nothing is found.
+    """
+    try:
+        ex = yake.KeywordExtractor(lan='ar', n=3, top=20)
+        pairs = ex.extract_keywords(sentence)
+    except Exception:  # best-effort: any extractor failure is treated as "no YAKE candidates"
+        pairs = []
+    # Rank by phrase length (longer is usually clearer) with a light weight for its frequency in the text
+    cands = []
+    for w, _ in pairs:  # the second element of each pair is ignored
+        w = re2.sub(r"\s+", " ", w.strip())  # collapse internal whitespace runs to single spaces
+        if not w:
+            continue
+        if not good_kw(w) or not safe_keyword(w):
+            continue
+        if len(w) < 2 or len(w) > 40:
+            continue
+        # The phrase must literally appear in the sentence text
+        if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):  # lookarounds reject matches glued to another letter
+            continue
+        # Weight by global frequency to favor the more important phrases
+        freq_weight = global_text.count(w)
+        cands.append((w, len(w) + 0.5*freq_weight))
+    if not cands:
+        # Simpler fallback: pick the longest reasonable "word"
+        toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
+        toks.sort(key=len, reverse=True)
+        return toks[0] if toks else None
+    cands.sort(key=lambda x: -x[1])  # highest score first
+    return cands[0][0] if cands else None  # cands is non-empty here; the else branch is only a guard
 # ====== (4-أ) مُولِّد أسئلة "فراغ" ======
 def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:  # build up to n fill-in-the-blank MCQs from text; raises RuntimeError when none can be built
     all_sents = split_sents(text)
     sents = pick_clean_sentences(all_sents, difficulty)
     if not sents:
+        # No "clean" sentences were selected: fall back to every available sentence
+        sents = all_sents[:]
+    keyphrases = yake_keywords(text, k=240)
     keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]
+    # Map each phrase to a suitable sentence (no longer restricted to a single occurrence)
+    sent_for = {}
     for s in sents:
         for kp in keyphrases:
+            if kp in sent_for:
+                continue
+            if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):  # first sentence containing kp as a standalone phrase wins
+                sent_for[kp] = s
+        if len(sent_for) >= n * 4:  # collect more than needed
             break
+    items: List[MCQ] = []
+    used_pairs = set()  # (sentence, keyword)
+    # (a) Consume the available matches first
+    for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):  # longest phrases first, ties in string order
+        if len(items) >= n: break
+        s = sent_for[kp]
+        pair = (s, kp)
+        if pair in used_pairs:  # NOTE(review): sent_for keys are unique, so this cannot trigger in this loop; one sentence may back several questions
+            continue
+        q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)  # blank out only the first standalone occurrence
         pool = [x for x in keyphrases if x != kp]
         ch = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
+        # Clean up and fix the choices at exactly 4
+        uniq, seen = [], set()
         for c in ch:
             c = c.strip()
+            if not c or c in seen:
+                continue
+            seen.add(c); uniq.append(c)
+        while len(uniq) < 4: uniq.append("…")  # pad with placeholder options when fewer than four remain
+        uniq = uniq[:4]
+        random.shuffle(uniq)
+        ans = uniq.index(kp) if kp in uniq else 3  # NOTE(review): choices are stripped but kp is not; if kp is missing, index 3 is an arbitrary shuffled option — verify
+        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=uniq, answer_index=ans))
+        used_pairs.add(pair)
+    # (b) If still short of n, use the "from the same sentence" fallback
+    si = 0
+    while len(items) < n and si < len(sents):
+        s = sents[si]; si += 1
+        # Pick a suitable target from the sentence itself
+        kp = best_keyword_in_sentence(s, text)
+        if not kp:
+            continue
+        pair = (s, kp)
+        if pair in used_pairs:  # skip a (sentence, keyword) pair already turned into a question in step (a)
+            continue
+        # Build the blank question
+        if not re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
+            # Target not found verbatim in s: skip it. NOTE(review): the original note mentions trying a simplified form, but no retry is implemented here
+            continue
+        q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
+        pool = [x for x in keyphrases if x != kp] or keyphrases[:]  # if removing kp empties the pool, fall back to a copy of the full list
+        ch = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
+        uniq, seen = [], set()
+        for c in ch:
+            c = c.strip()
+            if not c or c in seen:
+                continue
+            seen.add(c); uniq.append(c)
+        while len(uniq) < 4: uniq.append("…")
+        uniq = uniq[:4]
+        random.shuffle(uniq)
+        ans = uniq.index(kp) if kp in uniq else 3  # same answer-index fallback as in step (a)
+        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=uniq, answer_index=ans))
+        used_pairs.add(pair)
     if not items:
         raise RuntimeError("تعذّر توليد أسئلة.")
+    return items[:n]
 # ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (mT5) ======
 _MT5 = {"tok": None, "model": None, "ok": False}
 def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
     tok, model, ok = get_mt5()
     if not ok:
+        # عدم توفر mT5 → ارجعي لأسئلة الفراغ
         return make_mcqs(text, n, difficulty=difficulty)
     sents_all = split_sents(text)
     sents = pick_clean_sentences(sents_all, difficulty)
+    if not sents:
+        sents = sents_all[:]
     if not sents:
         return make_mcqs(text, n, difficulty=difficulty)
+    # جرّبي أولًا على جمل مفردة، ثم على “مقاطع” (دمج 2–3 جمل) إذا لزم
+    def make_chunks(sents, max_len=260):
+        chunks = []
+        i = 0
+        while i < len(sents):
+            cur = sents[i]
+            j = i + 1
+            while j < len(sents) and len(cur) + 1 + len(sents[j]) <= max_len:
+                cur = cur + " " + sents[j]
+                j += 1
+            chunks.append(cur)
+            i = j
+        return chunks
+    candidates = sents[:] + make_chunks(sents, max_len=220)
+    random.shuffle(candidates)
     items: List[MCQ] = []
     tried = 0
+    for s in candidates:
         if len(items) >= n: break
         mcq = gen_one_comp_q(s, tok, model)
         tried += 1
                     seen.add(c); clean.append(c)
             clean = (clean + ["…","…","…","…"])[:4]
             ai = mcq.answer_index if isinstance(mcq.answer_index,int) and 0<=mcq.answer_index<4 else 0
             items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=clean, answer_index=ai))
+        if tried >= n * 12:
             break
     if not items:
         return make_mcqs(text, n, difficulty=difficulty)
     return items[:n]
 # ------------------ تحويل إلى سجلات العرض ------------------
 def clean_option_text(t: str) -> str:
     t = (t or "").strip()
     raw = text_area if text_area else file_to_text(file_path, model_id=model_id, zoom=float(zoom))[0]
     cleaned = postprocess(raw)
+    used_mode = mode
     try:
         if mode == "فهم مباشر":
+            tok, model, ok = get_mt5()
+            if ok:
+                items = make_comp_mcqs(cleaned, n=int(n), difficulty=difficulty)
+            else:
+                items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
+                used_mode = "فراغ (fallback)"
         else:
             items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
     except Exception:
         items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
+        used_mode = "فراغ (fallback)"
     recs = to_records(items)
+    warn = f"نمط مُستخدَم: **{used_mode}** — عدد الأسئلة: {len(items)}"
+    return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), warn
 # ------------------ CSS ------------------
 CSS = """