Update app.py
app.py
CHANGED
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # صفحتان ثابتتان + Submit لكل سؤال يعمل فعليًا + منع تغيّر أبعاد صفحة الإدخال
 # + طور اختياري لأسئلة فهم مباشر باستخدام mT5 (تحميل كسول + fallback)
+# + تحكم بدرجة الصعوبة + مشتّتات أقوى (BM25) + فلاتر POS/NER وCross-Encoder اختياريين
 
 import os, json, uuid, random, unicodedata
 from dataclasses import dataclass
@@ -19,17 +20,29 @@ random.seed(42)
 DEFAULT_NUM_QUESTIONS = 6
 DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
 DEFAULT_TROCR_ZOOM = 2.6
-QUESTION_MODES = ["فراغ", "فهم مباشر"]
+QUESTION_MODES = ["فراغ", "فهم مباشر"]
+DIFFICULTY_MODES = ["سهل", "متوسط", "صعب"]
+
+# NEW: BM25 (اختياري)
+try:
+    from rank_bm25 import BM25Okapi
+    _HAS_BM25 = True
+except Exception:
+    _HAS_BM25 = False
 
 # ------------------ OCR (تحميل كسول) ------------------
 _OCR = {}
 def get_ocr(model_id: str):
-
-
-
-
-
-
+    try:
+        from transformers import pipeline
+        import torch
+        dev = 0 if torch.cuda.is_available() else -1
+        if model_id not in _OCR:
+            _OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
+        return _OCR[model_id]
+    except Exception:
+        # ارجعي دالة وهمية تعيد نصًا فارغًا بدل التعطّل
+        return lambda im: [{"generated_text": ""}]
 
 # ------------------ PDF/TXT → نص ------------------
 def extract_text_with_pypdf(path: str) -> str:
@@ -123,7 +136,7 @@ def split_sents(t:str)->List[str]:
     return [x for x in s if len(x)>=25]
 
 # ====== (1) عبارات مفتاحية أذكى: n=3 ثم 2 ثم 1، مع فلترة ======
-def yake_keywords(t: str, k: int =
+def yake_keywords(t: str, k: int = 200) -> List[str]:
     phrases = []
     seen = set()
     for n in [3, 2, 1]:
@@ -208,7 +221,7 @@ def get_embedder():
        _EMB = False
    return _EMB
 
-def nearest_terms(target: str, pool: List[str], k: int = 12) -> List[Tuple[str, float]]:
+def nearest_terms(target: str, pool: List[str], k: int = 24) -> List[Tuple[str, float]]:
    emb = get_embedder()
    if not emb:
        return []
@@ -218,7 +231,7 @@ def nearest_terms(target: str, pool: List[str], k: int = 12) -> List[Tuple[str,
    vecs = emb.encode([target] + cand, normalize_embeddings=True)
    t, C = vecs[0], vecs[1:]
    import numpy as np
-    sims = (C @ t)
+    sims = (C @ t)  # cosine لأن المتجهات مُطبّعة
    idx = np.argsort(-sims)[:k]
    return [(cand[i], float(sims[i])) for i in idx]
 
@@ -233,19 +246,19 @@ def get_masker():
        _MLM = False
    return _MLM
 
-def mlm_distractors(sentence_with_blank: str, correct: str, k: int =
+def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 18) -> List[str]:
    masker = get_masker()
    if not masker:
        return []
    masked = sentence_with_blank.replace("_____", masker.tokenizer.mask_token)
    try:
-        outs = masker(masked, top_k=max(
+        outs = masker(masked, top_k=max(25, k+7))
        cands = []
        for o in outs:
            tok = o["token_str"].strip()
            if tok and tok != correct and len(tok) >= 2 and not re2.match(r"^[\p{P}\p{S}\d_]+$", tok):
                cands.append(tok)
-        seen = set()
+        uniq, seen = [], set()
        for w in cands:
            if w not in seen:
                uniq.append(w); seen.add(w)
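
For illustration only: the mlm_distractors change above leans on the standard transformers fill-mask pipeline (the masker returned by get_masker). A minimal stand-alone sketch of that call shape; the model name here is just an example Arabic MLM, not necessarily the one the app loads.

# Toy sketch of the fill-mask call used for distractor mining (assumed example model).
from transformers import pipeline

masker = pipeline("fill-mask", model="CAMeL-Lab/bert-base-arabic-camelbert-mix")  # example model id
masked = "تقوم " + masker.tokenizer.mask_token + " بتنظيم أنشطة الخلية"
for out in masker(masked, top_k=5):
    # each result carries the filled token and its score
    print(out["token_str"], round(out["score"], 3))
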
@@ -298,21 +311,94 @@ def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
    except Exception:
        return candidates
 
-
+# --------- أدوات BM25 للصعوبة ---------
+def tokenize_ar(s: str) -> List[str]:
+    s = norm_ar(s)
+    toks = re2.findall(r"\p{L}+", s)
+    return [t for t in toks if len(t) >= 2 and t not in AR_STOP]
+
+def bm25_build(sentences: List[str]):
+    if not _HAS_BM25 or not sentences:
+        return None, []
+    corpus_tokens = [tokenize_ar(s) for s in sentences]
+    bm = BM25Okapi(corpus_tokens)
+    return bm, corpus_tokens
+
+def bm25_candidates(correct: str, sentences: List[str], bm, corpus_tokens, top: int = 20) -> List[str]:
+    if not bm: return []
+    q = tokenize_ar(correct)
+    scores = bm.get_scores(q)
+    idxs = sorted(range(len(scores)), key=lambda i: -scores[i])[:min(top, len(scores))]
+    pool = set()
+    for i in idxs:
+        for tok in corpus_tokens[i]:
+            if tok != correct and good_kw(tok):
+                pool.add(tok)
+    return list(pool)
+
+def pick_clean_sentences(sents: List[str], difficulty: str) -> List[str]:
+    out = []
+    for s in sents:
+        if not is_clean_sentence(s):
+            continue
+        L = len(s)
+        if difficulty == "سهل" and not (70 <= L <= 180):
+            continue
+        if difficulty == "متوسط" and not (70 <= L <= 220):
+            continue
+        if difficulty == "صعب" and not (60 <= L <= 240):
+            continue
+        out.append(s)
+    return out
+
+def similarity_caps(difficulty: str):
+    if difficulty == "سهل":
+        return 0.88
+    if difficulty == "صعب":
+        return 0.95
+    return 0.92
+
+# ====== مشتّتات ذكية مع الصعوبة وBM25 ======
+def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
+                      all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
    base = []
-
-
-
+
+    # (أ) جيران دلاليين
+    base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
+
+    # (ب) FILL-MASK
+    for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=18):
+        if w not in base:
+            base.append(w)
+
+    # (ج) BM25 من النص
+    if all_sentences:
+        bm, corp = bm25_build(all_sentences)
+        for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
+            if w not in base:
+                base.append(w)
+
+    # فلترة POS/NER وطول وتطبيع
    clean = []
    for w in base:
        w = w.strip()
-        if not w or w == correct:
-
-        if
-
-        if
+        if not w or w == correct:
+            continue
+        if is_named_entity(w):
+            continue
+        if not pos_compatible(w, correct):
+            continue
+        if not length_close(w, correct):
+            continue
+        if norm_ar(w) == norm_ar(correct):
+            continue
        clean.append(w)
-
+
+    # ترتيب Cross-Encoder (اختياري)
+    clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*3, k)]
+
+    # حذف المشتّت شديد القرب دلالياً
+    cap = similarity_caps(difficulty)
    try:
        emb = get_embedder()
        if emb and clean:
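
For illustration only: a minimal, self-contained sketch of the BM25 candidate-pool idea the hunk above introduces. The toy corpus and whitespace tokenization stand in for the app's sentences and tokenize_ar; it assumes rank_bm25 is installed.

# Toy sketch of the bm25_build / bm25_candidates pattern.
from rank_bm25 import BM25Okapi

sentences = [
    "الخلية هي الوحدة الأساسية في بناء الكائنات الحية",
    "تقوم النواة بتنظيم أنشطة الخلية وتخزين المادة الوراثية",
    "تنتج الميتوكوندريا الطاقة اللازمة لعمل الخلية",
]
corpus_tokens = [s.split() for s in sentences]        # stand-in for tokenize_ar()
bm = BM25Okapi(corpus_tokens)

correct = "النواة"
scores = bm.get_scores(correct.split())               # score each sentence against the answer
best = sorted(range(len(scores)), key=lambda i: -scores[i])[:2]
pool = {tok for i in best for tok in corpus_tokens[i] if tok != correct}
print(pool)  # tokens from the most related sentences become distractor candidates
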
@@ -320,11 +406,12 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
            c, others = vecs[0], vecs[1:]
            import numpy as np
            sims = others @ c
-            filtered = [w for w, s in zip(clean, sims) if s <
+            filtered = [w for w, s in zip(clean, sims) if s < cap]
            if len(filtered) >= k:
                clean = filtered
    except Exception:
        pass
+
    out = clean[:k]
    while len(out) < k:
        extra = [w for w in phrase_pool if w not in out and w != correct and length_close(w, correct)]
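
For illustration only: the cap applied above drops candidates that are nearly synonymous with the correct answer. A small numpy-only sketch of the same cosine-cap filter, with made-up normalized vectors.

# Toy sketch of the similarity-cap filter used in smart_distractors.
import numpy as np

def l2norm(v):
    v = np.asarray(v, dtype=float)
    return v / np.linalg.norm(v)

correct_vec = l2norm([1.0, 0.0, 0.2])
candidates = {
    "almost_synonym": l2norm([0.98, 0.05, 0.2]),
    "related_term":   l2norm([0.40, 0.80, 0.1]),
}

cap = 0.92  # the "متوسط" value from similarity_caps()
kept = [w for w, v in candidates.items() if float(v @ correct_vec) < cap]
print(kept)  # only the sufficiently different candidate survives
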
@@ -334,43 +421,55 @@
        out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
    return out[:k]
 
-# ====== (4-أ) مُولِّد أسئلة "فراغ"
-def make_mcqs(text:str, n:int=6)->List[MCQ]:
-
-
+# ====== (4-أ) مُولِّد أسئلة "فراغ" ======
+def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
+    all_sents = split_sents(text)
+    sents = pick_clean_sentences(all_sents, difficulty)
+    if not sents:
        raise ValueError("النص قصير أو غير صالح.")
-
+
+    keyphrases = yake_keywords(text, k=200)
    keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]
+
+    # ربط العبارة بجملة مناسبة (ظهور وحيد)
    sent_for={}
    for s in sents:
-        if not is_clean_sentence(s): continue
        for kp in keyphrases:
            if kp in sent_for: continue
            hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
            if len(hits) == 1:
                sent_for[kp]=s
-        if len(sent_for)>=n*3:
+        if len(sent_for)>=n*3:
+            break
+
    if not sent_for:
        tokens = [t for t in re2.findall(r"[\p{L}\p{N}_]+", text) if good_kw(t)]
        freq = [w for w,_ in sorted(((t, text.count(t)) for t in tokens), key=lambda x:-x[1])]
-        keyphrases = [w for w in freq if safe_keyword(w)][:
+        keyphrases = [w for w in freq if safe_keyword(w)][:150]
        for s in sents:
-            if not is_clean_sentence(s): continue
            for kp in keyphrases:
                if kp in sent_for: continue
                hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
                if len(hits) == 1: sent_for[kp]=s
-            if len(sent_for)>=n*2:
+            if len(sent_for)>=n*2:
+                break
+
    if not sent_for:
        raise RuntimeError("تعذّر توليد أسئلة من هذا النص.")
+
+    # أولوية للعبارات الأطول (أعلميّة أعلى)
    items=[]; used_sents=set(); used_keys=set()
    for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
        if len(items)>=n: break
        s=sent_for[kp]
        if s in used_sents or kp in used_keys: continue
+
        q=re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
+
        pool = [x for x in keyphrases if x != kp]
-        ch = smart_distractors(kp, pool, s, k=3) + [kp]
+        ch = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
+
+        # تنظيف ومنع تكرار وضمان أربع خيارات
        clean_choices=[]; seen=set()
        for c in ch:
            c = c.strip()
@@ -379,13 +478,15 @@ def make_mcqs(text:str, n:int=6)->List[MCQ]:
        ch = clean_choices[:4]
        while len(ch)<4: ch.append("…")
        random.shuffle(ch); ans=ch.index(kp) if kp in ch else 3
+
        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
        used_sents.add(s); used_keys.add(kp)
-
+
+    if not items:
        raise RuntimeError("تعذّر توليد أسئلة.")
    return items
 
-# ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (
+# ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (mT5) ======
_MT5 = {"tok": None, "model": None, "ok": False}
def get_mt5():
    if _MT5["tok"] is not None or _MT5["model"] is not None or _MT5["ok"]:
@@ -400,13 +501,10 @@ def get_mt5():
    return _MT5["tok"], _MT5["model"], _MT5["ok"]
 
def parse_json_block(s: str) -> Optional[dict]:
-    # حاول التقاط أول كائن JSON صالح
    try:
-        # إن وُجد JSON مباشر
        return json.loads(s)
    except Exception:
        pass
-    # التقط أقواس {} الأولى والأخيرة
    m = re2.search(r"\{.*\}", s, flags=re2.DOTALL)
    if m:
        try:
@@ -416,7 +514,6 @@ def parse_json_block(s: str) -> Optional[dict]:
    return None
 
def comp_prompt(sentence: str) -> str:
-    # تعليمات قصيرة ومحددة مع تنسيق JSON
    return (
        "أنت منشئ أسئلة متعددة الخيارات باللغة العربية.\n"
        "من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.\n"
@@ -457,15 +554,18 @@ def gen_one_comp_q(sentence: str, tok, model, max_new_tokens=128) -> Optional[MC
    except Exception:
        return None
 
-def make_comp_mcqs(text: str, n: int = 6) -> List[MCQ]:
+def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
    tok, model, ok = get_mt5()
    if not ok:
-
-
-
+        return make_mcqs(text, n, difficulty=difficulty)
+
+    sents_all = split_sents(text)
+    sents = pick_clean_sentences(sents_all, difficulty)
    if not sents:
-        return make_mcqs(text, n)
+        return make_mcqs(text, n, difficulty=difficulty)
+
    random.shuffle(sents)
+
    items: List[MCQ] = []
    tried = 0
    for s in sents:
@@ -473,23 +573,24 @@ def make_comp_mcqs(text: str, n: int = 6) -> List[MCQ]:
        mcq = gen_one_comp_q(s, tok, model)
        tried += 1
        if mcq:
-
-
-
-
-
+            q = re2.sub(r"\s+", " ", mcq.question).strip()
+            if not (12 <= len(q) <= 220):
+                continue
+            choices = [re2.sub(r"\s+", " ", c).strip() for c in mcq.choices]
+            seen=set(); clean=[]
+            for c in choices:
+                if c and c not in seen:
+                    seen.add(c); clean.append(c)
+            clean = (clean + ["…","…","…","…"])[:4]
+            ai = mcq.answer_index if isinstance(mcq.answer_index,int) and 0<=mcq.answer_index<4 else 0
+
+            items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=clean, answer_index=ai))
+        if tried >= n * 7:
            break
+
    if not items:
-
-
-    # توحيد البنية (A..D) بنفس الشكل
-    normed=[]
-    for it in items[:n]:
-        # القص إلى 4 خيارات وتأمين الفهارس
-        ch = (it.choices + ["…","…","…","…"])[:4]
-        ai = it.answer_index if 0 <= it.answer_index < 4 else 0
-        normed.append(MCQ(id=it.id, question=it.question, choices=ch, answer_index=ai))
-    return normed
+        return make_mcqs(text, n, difficulty=difficulty)
+    return items[:n]
 
# ------------------ تحويل إلى سجلات العرض ------------------
def clean_option_text(t: str) -> str:
@@ -548,30 +649,25 @@ def render_quiz_html(records: List[dict]) -> str:
    return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
 
# ------------------ توليد الامتحان وتبديل الصفحات ------------------
-def build_quiz(text_area, file_path, n, model_id, zoom, mode):
+def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
    text_area = (text_area or "").strip()
    if not text_area and not file_path:
        return "", gr.update(visible=True), gr.update(visible=False), "🛈 الصق نصًا أو ارفع ملفًا أولًا."
-    if text_area
-        raw = text_area
-    else:
-        raw, _ = file_to_text(file_path, model_id=model_id, zoom=float(zoom))
+    raw = text_area if text_area else file_to_text(file_path, model_id=model_id, zoom=float(zoom))[0]
    cleaned = postprocess(raw)
 
-    # اختيار الطور
    try:
        if mode == "فهم مباشر":
-            items = make_comp_mcqs(cleaned, n=int(n))
+            items = make_comp_mcqs(cleaned, n=int(n), difficulty=difficulty)
        else:
-            items = make_mcqs(cleaned, n=int(n))
-    except Exception
-
-        items = make_mcqs(cleaned, n=int(n))
+            items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
+    except Exception:
+        items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
 
-    recs
+    recs = to_records(items)
    return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), ""
 
-# ------------------ CSS
+# ------------------ CSS ------------------
CSS = """
:root{
  --bg:#0e0e11; --panel:#15161a; --card:#1a1b20; --muted:#a7b0be;
@@ -618,10 +714,10 @@ textarea{min-height:120px}
}
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
.q-note{color:#ffd1d6}
-.q-note.warn{color:#
+.q-note.warn{color:#ffd1d6}
"""
 
-# ------------------ JS: ربط Submit بعد الرندر (
+# ------------------ JS: ربط Submit بعد الرندر (مع إبراز الصح لأسئلة الفراغ) ------------------
ATTACH_LISTENERS_JS = """
() => {
  if (window.__q_submit_bound_multi2) { return 'already'; }
@@ -676,7 +772,7 @@
}
"""
 
-# ------------------ واجهة Gradio
+# ------------------ واجهة Gradio ------------------
with gr.Blocks(title="Question Generator", css=CSS) as demo:
    gr.Markdown("<h2 class='top'>Question Generator</h2>")
 
@@ -688,8 +784,10 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
        file_comp = gr.File(label="أو ارفع ملف (PDF / TXT)", file_count="single",
                            file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
        num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
-
+
+        # خيارات إضافية بدون تغيير البنية العامة
        mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
+        difficulty_radio = gr.Radio(choices=DIFFICULTY_MODES, value="متوسط", label="درجة الصعوبة")
 
        with gr.Accordion("خيارات PDF المصوّر (اختياري)", open=False):
            trocr_model = gr.Dropdown(
@@ -702,6 +800,7 @@
                value=DEFAULT_TROCR_MODEL, label="نموذج TrOCR"
            )
            trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="Zoom OCR")
+
        btn_build = gr.Button("generate quistion", elem_classes=["button-primary"])
        warn = gr.Markdown("", elem_classes=["small"])
 
@@ -714,7 +813,7 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
    # بناء الامتحان + تبديل الصفحات + ربط الـJS
    btn_build.click(
        build_quiz,
-        inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio],
+        inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio, difficulty_radio],
        outputs=[quiz_html, page1, page2, warn]
    ).then(
        None, inputs=None, outputs=[js_wired], js=ATTACH_LISTENERS_JS
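
For illustration only: the last hunk simply passes the new difficulty Radio to the click callback like any other input component. A stripped-down Gradio sketch of that wiring; component and function names here are illustrative, not the app's.

# Minimal Gradio sketch: a Radio's selected value arrives as a plain string argument.
import gradio as gr

def build(text, difficulty):
    return f"difficulty={difficulty}, chars={len(text or '')}"

with gr.Blocks() as demo:
    txt = gr.Textbox(label="النص")
    diff = gr.Radio(choices=["سهل", "متوسط", "صعب"], value="متوسط", label="درجة الصعوبة")
    btn = gr.Button("توليد")
    out = gr.Markdown()
    btn.click(build, inputs=[txt, diff], outputs=[out])

# demo.launch()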