Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
# صفحتان ثابتتان + Submit لكل سؤال يعمل فعليًا + منع تغيّر أبعاد صفحة الإدخال
|
|
|
|
| 3 |
|
| 4 |
import os, json, uuid, random, unicodedata
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from pathlib import Path
|
| 7 |
-
from typing import List, Tuple
|
| 8 |
|
| 9 |
from PIL import Image
|
| 10 |
from pypdf import PdfReader
|
|
@@ -18,6 +19,7 @@ random.seed(42)
|
|
| 18 |
DEFAULT_NUM_QUESTIONS = 6
|
| 19 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
| 20 |
DEFAULT_TROCR_ZOOM = 2.6
|
|
|
|
| 21 |
|
| 22 |
# ------------------ OCR (تحميل كسول) ------------------
|
| 23 |
_OCR = {}
|
|
@@ -105,7 +107,7 @@ def postprocess(raw:str)->str:
|
|
| 105 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 106 |
return norm_ar(t)
|
| 107 |
|
| 108 |
-
# ------------------
|
| 109 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 110 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
| 111 |
|
|
@@ -132,9 +134,9 @@ def yake_keywords(t: str, k: int = 160) -> List[str]:
|
|
| 132 |
pairs = []
|
| 133 |
for w, _ in pairs:
|
| 134 |
w = re2.sub(r"\s+", " ", w.strip())
|
| 135 |
-
if not w or w in seen:
|
| 136 |
continue
|
| 137 |
-
if re2.match(r"^[\p{P}\p{S}\d_]+$", w):
|
| 138 |
continue
|
| 139 |
if 2 <= len(w) <= 40:
|
| 140 |
phrases.append(w)
|
|
@@ -144,10 +146,9 @@ def yake_keywords(t: str, k: int = 160) -> List[str]:
|
|
| 144 |
def good_kw(kw:str)->bool:
|
| 145 |
return kw and len(kw)>=2 and kw not in AR_STOP and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
|
| 146 |
|
| 147 |
-
# ====== تحسينات
|
| 148 |
_HAS_CAMEL = False
|
| 149 |
try:
|
| 150 |
-
from camel_tools.tokenizers.word import simple_word_tokenize
|
| 151 |
from camel_tools.morphology.analyzer import Analyzer
|
| 152 |
from camel_tools.ner import NERecognizer
|
| 153 |
_HAS_CAMEL = True
|
|
@@ -168,9 +169,8 @@ def ar_pos(word: str) -> str:
|
|
| 168 |
try:
|
| 169 |
ana = _AN.analyze(word)
|
| 170 |
if not ana: return "X"
|
| 171 |
-
pos_candidates = [a.get('pos','X') for a in ana]
|
| 172 |
-
# خذ الأكثر تكرارًا
|
| 173 |
from collections import Counter
|
|
|
|
| 174 |
return Counter(pos_candidates).most_common(1)[0][0] if pos_candidates else "X"
|
| 175 |
except Exception:
|
| 176 |
return "X"
|
|
@@ -205,7 +205,7 @@ def get_embedder():
|
|
| 205 |
from sentence_transformers import SentenceTransformer
|
| 206 |
_EMB = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
| 207 |
except Exception:
|
| 208 |
-
_EMB = False
|
| 209 |
return _EMB
|
| 210 |
|
| 211 |
def nearest_terms(target: str, pool: List[str], k: int = 12) -> List[Tuple[str, float]]:
|
|
@@ -218,7 +218,7 @@ def nearest_terms(target: str, pool: List[str], k: int = 12) -> List[Tuple[str,
|
|
| 218 |
vecs = emb.encode([target] + cand, normalize_embeddings=True)
|
| 219 |
t, C = vecs[0], vecs[1:]
|
| 220 |
import numpy as np
|
| 221 |
-
sims = (C @ t)
|
| 222 |
idx = np.argsort(-sims)[:k]
|
| 223 |
return [(cand[i], float(sims[i])) for i in idx]
|
| 224 |
|
|
@@ -254,7 +254,6 @@ def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 8) -> List[
|
|
| 254 |
return []
|
| 255 |
|
| 256 |
def legacy_distractors(correct:str, pool:List[str], k:int=3)->List[str]:
|
| 257 |
-
# النسخة القديمة كنسخة احتياط
|
| 258 |
L=len(correct.strip()); cand=[]
|
| 259 |
for w in pool:
|
| 260 |
w=w.strip()
|
|
@@ -280,7 +279,7 @@ def get_cross_encoder():
|
|
| 280 |
|
| 281 |
def pos_compatible(a: str, b: str) -> bool:
|
| 282 |
pa, pb = ar_pos(a), ar_pos(b)
|
| 283 |
-
if "X" in (pa, pb):
|
| 284 |
return True
|
| 285 |
return pa == pb
|
| 286 |
|
|
@@ -301,32 +300,19 @@ def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
|
|
| 301 |
|
| 302 |
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3) -> List[str]:
|
| 303 |
base = []
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
# 2) FILL-MASK بديل
|
| 308 |
-
mlm = mlm_distractors(sentence.replace(correct, "_____"), correct, k=15)
|
| 309 |
-
for w in mlm:
|
| 310 |
-
if w not in base:
|
| 311 |
-
base.append(w)
|
| 312 |
-
# 3) فلترة POS/NER وطول وتشابه/تطبيع
|
| 313 |
clean = []
|
| 314 |
for w in base:
|
| 315 |
w = w.strip()
|
| 316 |
-
if not w or w == correct:
|
| 317 |
-
|
| 318 |
-
if
|
| 319 |
-
|
| 320 |
-
if
|
| 321 |
-
continue
|
| 322 |
-
if not length_close(w, correct):
|
| 323 |
-
continue
|
| 324 |
-
if norm_ar(w) == norm_ar(correct):
|
| 325 |
-
continue
|
| 326 |
clean.append(w)
|
| 327 |
-
# 4) ترتيب Cross-Encoder اختياري
|
| 328 |
clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*2, k)]
|
| 329 |
-
# 5) إزالة المتشابه جداً مع الجواب
|
| 330 |
try:
|
| 331 |
emb = get_embedder()
|
| 332 |
if emb and clean:
|
|
@@ -342,94 +328,170 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 342 |
out = clean[:k]
|
| 343 |
while len(out) < k:
|
| 344 |
extra = [w for w in phrase_pool if w not in out and w != correct and length_close(w, correct)]
|
| 345 |
-
if not extra:
|
| 346 |
-
|
| 347 |
-
out.extend(extra[:(k-len(out))])
|
| 348 |
-
break
|
| 349 |
if len(out) < k:
|
| 350 |
out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
|
| 351 |
return out[:k]
|
| 352 |
|
| 353 |
-
# ====== (4) مُولِّد أسئلة
|
| 354 |
def make_mcqs(text:str, n:int=6)->List[MCQ]:
|
| 355 |
sents=split_sents(text)
|
| 356 |
-
if not sents:
|
| 357 |
raise ValueError("النص قصير أو غير صالح.")
|
| 358 |
-
|
| 359 |
-
# عبارات مفتاحية 1–3 كلمات + فلترة أذكى
|
| 360 |
keyphrases = yake_keywords(text, k=160)
|
| 361 |
keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]
|
| 362 |
-
|
| 363 |
-
# ربط العبارة بجملة مناسبة (نظيفة، ظهور وحيد للعبارة)
|
| 364 |
sent_for={}
|
| 365 |
for s in sents:
|
| 366 |
-
if not is_clean_sentence(s):
|
| 367 |
-
continue
|
| 368 |
for kp in keyphrases:
|
| 369 |
-
if kp in sent_for:
|
| 370 |
-
continue
|
| 371 |
hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
|
| 372 |
if len(hits) == 1:
|
| 373 |
sent_for[kp]=s
|
| 374 |
-
if len(sent_for)>=n*3:
|
| 375 |
-
break
|
| 376 |
-
|
| 377 |
if not sent_for:
|
| 378 |
-
# fallback: لو ما لقينا مطابقات جيدة، نرجع للمفردات من النص
|
| 379 |
tokens = [t for t in re2.findall(r"[\p{L}\p{N}_]+", text) if good_kw(t)]
|
| 380 |
freq = [w for w,_ in sorted(((t, text.count(t)) for t in tokens), key=lambda x:-x[1])]
|
| 381 |
keyphrases = [w for w in freq if safe_keyword(w)][:120]
|
| 382 |
for s in sents:
|
| 383 |
-
if not is_clean_sentence(s):
|
| 384 |
-
continue
|
| 385 |
for kp in keyphrases:
|
| 386 |
-
if kp in sent_for:
|
| 387 |
-
continue
|
| 388 |
hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
|
| 389 |
-
if len(hits) == 1:
|
| 390 |
-
|
| 391 |
-
if len(sent_for)>=n*2:
|
| 392 |
-
break
|
| 393 |
-
|
| 394 |
if not sent_for:
|
| 395 |
raise RuntimeError("تعذّر توليد أسئلة من هذا النص.")
|
| 396 |
-
|
| 397 |
-
# نعطي أولوية للعبارات الأطول (أكثر إعلامية)
|
| 398 |
items=[]; used_sents=set(); used_keys=set()
|
| 399 |
for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
|
| 400 |
if len(items)>=n: break
|
| 401 |
s=sent_for[kp]
|
| 402 |
-
if s in used_sents or kp in used_keys:
|
| 403 |
-
continue
|
| 404 |
-
|
| 405 |
-
# ابنِ سؤال الفراغ
|
| 406 |
q=re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
|
| 407 |
-
|
| 408 |
-
# مشتتات أذكى (مع رجوع تلقائي لو النماذج مش متاحة)
|
| 409 |
pool = [x for x in keyphrases if x != kp]
|
| 410 |
ch = smart_distractors(kp, pool, s, k=3) + [kp]
|
| 411 |
-
|
| 412 |
-
clean_choices=[]
|
| 413 |
-
seen=set()
|
| 414 |
for c in ch:
|
| 415 |
c = c.strip()
|
| 416 |
-
if not c: continue
|
| 417 |
-
|
| 418 |
-
seen.add(c)
|
| 419 |
-
clean_choices.append(c)
|
| 420 |
ch = clean_choices[:4]
|
| 421 |
-
|
| 422 |
-
while len(ch)<4:
|
| 423 |
-
ch.append("…")
|
| 424 |
random.shuffle(ch); ans=ch.index(kp) if kp in ch else 3
|
| 425 |
-
|
| 426 |
items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
|
| 427 |
used_sents.add(s); used_keys.add(kp)
|
| 428 |
-
|
| 429 |
-
if not items:
|
| 430 |
raise RuntimeError("تعذّر توليد أسئلة.")
|
| 431 |
return items
|
| 432 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
def clean_option_text(t: str) -> str:
|
| 434 |
t = (t or "").strip()
|
| 435 |
t = re2.sub(AR_DIAC, "", t)
|
|
@@ -454,12 +516,12 @@ def to_records(items:List[MCQ])->List[dict]:
|
|
| 454 |
|
| 455 |
# ------------------ صفحة الأسئلة (HTML فقط) ------------------
|
| 456 |
def render_quiz_html(records: List[dict]) -> str:
|
| 457 |
-
parts
|
| 458 |
for i, rec in enumerate(records, start=1):
|
| 459 |
-
qid
|
| 460 |
qtxt = rec["question"]
|
| 461 |
-
cor
|
| 462 |
-
opts_html
|
| 463 |
for o in rec["options"]:
|
| 464 |
lid, txt = o["id"], o["text"]
|
| 465 |
opts_html.append(f"""
|
|
@@ -485,9 +547,8 @@ def render_quiz_html(records: List[dict]) -> str:
|
|
| 485 |
""")
|
| 486 |
return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
|
| 487 |
|
| 488 |
-
|
| 489 |
# ------------------ توليد الامتحان وتبديل الصفحات ------------------
|
| 490 |
-
def build_quiz(text_area, file_path, n, model_id, zoom):
|
| 491 |
text_area = (text_area or "").strip()
|
| 492 |
if not text_area and not file_path:
|
| 493 |
return "", gr.update(visible=True), gr.update(visible=False), "🛈 الصق نصًا أو ارفع ملفًا أولًا."
|
|
@@ -496,7 +557,17 @@ def build_quiz(text_area, file_path, n, model_id, zoom):
|
|
| 496 |
else:
|
| 497 |
raw, _ = file_to_text(file_path, model_id=model_id, zoom=float(zoom))
|
| 498 |
cleaned = postprocess(raw)
|
| 499 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
recs = to_records(items)
|
| 501 |
return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), ""
|
| 502 |
|
|
@@ -510,7 +581,7 @@ body{direction:rtl; font-family:system-ui,'Cairo','IBM Plex Arabic',sans-serif;
|
|
| 510 |
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
|
| 511 |
h2.top{color:#eaeaf2;margin:6px 0 16px}
|
| 512 |
|
| 513 |
-
/* صفحة الإدخال ثابتة الارتفاع ولا تتغير
|
| 514 |
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
|
| 515 |
box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
|
| 516 |
.small{opacity:.9;color:#d9dee8}
|
|
@@ -547,13 +618,12 @@ textarea{min-height:120px}
|
|
| 547 |
}
|
| 548 |
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
|
| 549 |
.q-note{color:#ffd1d6}
|
| 550 |
-
.q-note.warn{color:#
|
| 551 |
"""
|
| 552 |
|
| 553 |
-
# ------------------ JS: ربط Submit بعد الرندر (مع
|
| 554 |
ATTACH_LISTENERS_JS = """
|
| 555 |
() => {
|
| 556 |
-
// اربط مرة واحدة فقط
|
| 557 |
if (window.__q_submit_bound_multi2) { return 'already'; }
|
| 558 |
window.__q_submit_bound_multi2 = true;
|
| 559 |
|
|
@@ -576,23 +646,20 @@ ATTACH_LISTENERS_JS = """
|
|
| 576 |
|
| 577 |
const chosenLabel = chosen.closest('.opt');
|
| 578 |
|
| 579 |
-
// حالة صحيحة: لوّن أخضر وأقفل السؤال كاملاً + إبراز الكلمة الصحيحة داخل الجملة
|
| 580 |
if (chosen.value === correct) {
|
| 581 |
chosenLabel.classList.add('ok');
|
| 582 |
if (badge){ badge.hidden=false; badge.className='q-badge ok'; badge.textContent='Correct!'; }
|
| 583 |
-
// أقفل هذا السؤال فقط بعد الصح
|
| 584 |
card.querySelectorAll('input[type="radio"]').forEach(i => i.disabled = true);
|
| 585 |
e.target.disabled = true;
|
| 586 |
if (note) note.textContent = '';
|
| 587 |
|
| 588 |
-
// إبراز الجواب الصحيح ضمن الجملة الحالية دون تغيير البنية
|
| 589 |
const qNode = card.querySelector('.q-text');
|
| 590 |
if (qNode){
|
| 591 |
-
const full = qNode.textContent || '';
|
| 592 |
const correctText = [...card.querySelectorAll('.opt')].find(o =>
|
| 593 |
o.querySelector('input').value === correct
|
| 594 |
)?.querySelector('.opt-text')?.textContent || '';
|
| 595 |
-
if (full && correctText){
|
| 596 |
const highlighted = full.replace('_____', `<mark style="background:#2dd4bf22;border:1px solid #2dd4bf55;border-radius:6px;padding:0 4px">${correctText}</mark>`);
|
| 597 |
qNode.innerHTML = highlighted;
|
| 598 |
}
|
|
@@ -600,7 +667,6 @@ ATTACH_LISTENERS_JS = """
|
|
| 600 |
return;
|
| 601 |
}
|
| 602 |
|
| 603 |
-
// حالة خاطئة: لوّن أحمر فقط، ولا تعطل أي شيء — ليقدر يجرّب خيار آخر
|
| 604 |
chosenLabel.classList.add('err');
|
| 605 |
if (badge){ badge.hidden=false; badge.className='q-badge err'; badge.textContent='Incorrect.'; }
|
| 606 |
if (note) note.textContent = '';
|
|
@@ -610,7 +676,7 @@ ATTACH_LISTENERS_JS = """
|
|
| 610 |
}
|
| 611 |
"""
|
| 612 |
|
| 613 |
-
# ------------------ واجهة Gradio (
|
| 614 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 615 |
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 616 |
|
|
@@ -622,6 +688,9 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 622 |
file_comp = gr.File(label="أو ارفع ملف (PDF / TXT)", file_count="single",
|
| 623 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 624 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
|
|
|
|
|
|
|
|
|
| 625 |
with gr.Accordion("خيارات PDF المصوّر (اختياري)", open=False):
|
| 626 |
trocr_model = gr.Dropdown(
|
| 627 |
choices=[
|
|
@@ -645,7 +714,7 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 645 |
# بناء الامتحان + تبديل الصفحات + ربط الـJS
|
| 646 |
btn_build.click(
|
| 647 |
build_quiz,
|
| 648 |
-
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom],
|
| 649 |
outputs=[quiz_html, page1, page2, warn]
|
| 650 |
).then(
|
| 651 |
None, inputs=None, outputs=[js_wired], js=ATTACH_LISTENERS_JS
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
# صفحتان ثابتتان + Submit لكل سؤال يعمل فعليًا + منع تغيّر أبعاد صفحة الإدخال
|
| 3 |
+
# + طور اختياري لأسئلة فهم مباشر باستخدام mT5 (تحميل كسول + fallback)
|
| 4 |
|
| 5 |
import os, json, uuid, random, unicodedata
|
| 6 |
from dataclasses import dataclass
|
| 7 |
from pathlib import Path
|
| 8 |
+
from typing import List, Tuple, Optional
|
| 9 |
|
| 10 |
from PIL import Image
|
| 11 |
from pypdf import PdfReader
|
|
|
|
| 19 |
DEFAULT_NUM_QUESTIONS = 6
|
| 20 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
| 21 |
DEFAULT_TROCR_ZOOM = 2.6
|
| 22 |
+
QUESTION_MODES = ["فراغ", "فهم مباشر"] # جديد
|
| 23 |
|
| 24 |
# ------------------ OCR (تحميل كسول) ------------------
|
| 25 |
_OCR = {}
|
|
|
|
| 107 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 108 |
return norm_ar(t)
|
| 109 |
|
| 110 |
+
# ------------------ بنية السؤال ------------------
|
| 111 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 112 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
| 113 |
|
|
|
|
| 134 |
pairs = []
|
| 135 |
for w, _ in pairs:
|
| 136 |
w = re2.sub(r"\s+", " ", w.strip())
|
| 137 |
+
if not w or w in seen:
|
| 138 |
continue
|
| 139 |
+
if re2.match(r"^[\p{P}\p{S}\d_]+$", w):
|
| 140 |
continue
|
| 141 |
if 2 <= len(w) <= 40:
|
| 142 |
phrases.append(w)
|
|
|
|
| 146 |
def good_kw(kw: str) -> bool:
    """Return True when ``kw`` is usable as a keyword candidate.

    A candidate is rejected when it is empty/None, shorter than two
    characters, listed in ``AR_STOP``, or made up only of punctuation,
    symbols, digits and underscores.

    Always returns a real ``bool``; the previous ``kw and ...`` chain leaked
    ``''`` or ``None`` for falsy input despite the ``-> bool`` annotation.
    """
    if not kw or len(kw) < 2 or kw in AR_STOP:
        return False
    # Reject tokens that contain no letters at all (pure punctuation/symbols/digits).
    return re2.match(r"^[\p{P}\p{S}\d_]+$", kw) is None
|
| 148 |
|
| 149 |
+
# ====== تحسينات الذكاء: POS/NER اختياري مع fallback ======
|
| 150 |
_HAS_CAMEL = False
|
| 151 |
try:
|
|
|
|
| 152 |
from camel_tools.morphology.analyzer import Analyzer
|
| 153 |
from camel_tools.ner import NERecognizer
|
| 154 |
_HAS_CAMEL = True
|
|
|
|
| 169 |
try:
|
| 170 |
ana = _AN.analyze(word)
|
| 171 |
if not ana: return "X"
|
|
|
|
|
|
|
| 172 |
from collections import Counter
|
| 173 |
+
pos_candidates = [a.get('pos','X') for a in ana]
|
| 174 |
return Counter(pos_candidates).most_common(1)[0][0] if pos_candidates else "X"
|
| 175 |
except Exception:
|
| 176 |
return "X"
|
|
|
|
| 205 |
from sentence_transformers import SentenceTransformer
|
| 206 |
_EMB = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
| 207 |
except Exception:
|
| 208 |
+
_EMB = False
|
| 209 |
return _EMB
|
| 210 |
|
| 211 |
def nearest_terms(target: str, pool: List[str], k: int = 12) -> List[Tuple[str, float]]:
|
|
|
|
| 218 |
vecs = emb.encode([target] + cand, normalize_embeddings=True)
|
| 219 |
t, C = vecs[0], vecs[1:]
|
| 220 |
import numpy as np
|
| 221 |
+
sims = (C @ t)
|
| 222 |
idx = np.argsort(-sims)[:k]
|
| 223 |
return [(cand[i], float(sims[i])) for i in idx]
|
| 224 |
|
|
|
|
| 254 |
return []
|
| 255 |
|
| 256 |
def legacy_distractors(correct:str, pool:List[str], k:int=3)->List[str]:
|
|
|
|
| 257 |
L=len(correct.strip()); cand=[]
|
| 258 |
for w in pool:
|
| 259 |
w=w.strip()
|
|
|
|
| 279 |
|
| 280 |
def pos_compatible(a: str, b: str) -> bool:
    """Tell whether two words may be treated as having the same part of speech.

    Both words are tagged with ``ar_pos``. The tag ``"X"`` on either side is
    treated as compatible with anything; otherwise the two tags must be equal.
    """
    tag_a = ar_pos(a)
    tag_b = ar_pos(b)
    return tag_a == "X" or tag_b == "X" or tag_a == tag_b
|
| 285 |
|
|
|
|
| 300 |
|
| 301 |
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3) -> List[str]:
|
| 302 |
base = []
|
| 303 |
+
base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=20)])
|
| 304 |
+
for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=15):
|
| 305 |
+
if w not in base: base.append(w)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
clean = []
|
| 307 |
for w in base:
|
| 308 |
w = w.strip()
|
| 309 |
+
if not w or w == correct: continue
|
| 310 |
+
if is_named_entity(w): continue
|
| 311 |
+
if not pos_compatible(w, correct): continue
|
| 312 |
+
if not length_close(w, correct): continue
|
| 313 |
+
if norm_ar(w) == norm_ar(correct): continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
clean.append(w)
|
|
|
|
| 315 |
clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*2, k)]
|
|
|
|
| 316 |
try:
|
| 317 |
emb = get_embedder()
|
| 318 |
if emb and clean:
|
|
|
|
| 328 |
out = clean[:k]
|
| 329 |
while len(out) < k:
|
| 330 |
extra = [w for w in phrase_pool if w not in out and w != correct and length_close(w, correct)]
|
| 331 |
+
if not extra: break
|
| 332 |
+
out.extend(extra[:(k-len(out))]); break
|
|
|
|
|
|
|
| 333 |
if len(out) < k:
|
| 334 |
out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
|
| 335 |
return out[:k]
|
| 336 |
|
| 337 |
+
# ====== (4-أ) مُولِّد أسئلة "فراغ" (القائم) ======
|
| 338 |
def _link_keyphrases(sents, keyphrases, sent_for, cap):
    """Fill ``sent_for`` with keyphrase -> sentence pairs, in place.

    A keyphrase is linked to the first clean sentence in which it occurs
    exactly once as a whole word (not preceded or followed by a letter).
    Scanning stops after the sentence that brings ``sent_for`` to ``cap``
    entries or more.
    """
    # NOTE(review): the scraped diff lost all indentation; the nesting of the
    # cap check (per sentence, after the keyphrase loop) is reconstructed --
    # confirm against the real app.py.
    for s in sents:
        if not is_clean_sentence(s):
            continue
        for kp in keyphrases:
            if kp in sent_for:
                continue
            hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
            if len(hits) == 1:
                sent_for[kp] = s
        if len(sent_for) >= cap:
            break


def make_mcqs(text: str, n: int = 6) -> List[MCQ]:
    """Build up to ``n`` fill-in-the-blank multiple-choice questions from ``text``.

    Steps:
      1. Split the text into sentences and extract keyphrases.
      2. Link each keyphrase to a clean sentence that contains it exactly once.
      3. If nothing was linked, fall back to the most frequent single tokens.
      4. For the longest keyphrases first, blank the keyphrase out of its
         sentence, add three distractors, pad to four choices and shuffle.

    Raises:
        ValueError: when the text yields no sentences.
        RuntimeError: when no keyphrase can be linked or no question is built.
    """
    from collections import Counter

    sents = split_sents(text)
    if not sents:
        raise ValueError("النص قصير أو غير صالح.")

    keyphrases = yake_keywords(text, k=160)
    keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]

    sent_for = {}
    _link_keyphrases(sents, keyphrases, sent_for, n * 3)

    if not sent_for:
        # Fallback: rank single tokens by frequency. Counter counts whole
        # tokens once each, so the ranking is linear-time and free of the
        # duplicates that the old per-occurrence text.count() produced (those
        # duplicates used up the 120-entry cap and the distractor pool).
        tokens = [t for t in re2.findall(r"[\p{L}\p{N}_]+", text) if good_kw(t)]
        freq = [w for w, _ in Counter(tokens).most_common()]
        keyphrases = [w for w in freq if safe_keyword(w)][:120]
        _link_keyphrases(sents, keyphrases, sent_for, n * 2)

    if not sent_for:
        raise RuntimeError("تعذّر توليد أسئلة من هذا النص.")

    items = []
    used_sents = set()
    used_keys = set()
    # Longer keyphrases first; ties broken alphabetically for a stable order.
    for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
        if len(items) >= n:
            break
        s = sent_for[kp]
        if s in used_sents or kp in used_keys:
            continue

        # Blank out the single whole-word occurrence of the keyphrase.
        q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)

        pool = [x for x in keyphrases if x != kp]
        ch = smart_distractors(kp, pool, s, k=3) + [kp]

        # Strip and de-duplicate while keeping order.
        clean_choices = []
        seen = set()
        for c in ch:
            c = c.strip()
            if not c or c in seen:
                continue
            seen.add(c)
            clean_choices.append(c)

        ch = clean_choices[:4]
        while len(ch) < 4:
            ch.append("…")

        random.shuffle(ch)
        # Choices were stripped above, so look the answer up in stripped form.
        answer = kp.strip()
        ans = ch.index(answer) if answer in ch else 3

        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
        used_sents.add(s)
        used_keys.add(kp)

    if not items:
        raise RuntimeError("تعذّر توليد أسئلة.")
    return items
|
| 387 |
|
| 388 |
+
# ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (توليدي mT5) ======
|
| 389 |
+
# Lazy-load cache for the mT5 tokenizer/model. "tried" records that a load was
# already attempted so a failure is not retried on every call.
_MT5 = {"tok": None, "model": None, "ok": False, "tried": False}


def get_mt5():
    """Return ``(tokenizer, model, ok)`` for ``google/mt5-small``, loading once.

    The first call tries to import ``transformers`` and load the tokenizer and
    the seq2seq model. The outcome, success or failure, is cached in ``_MT5``;
    later calls return the cached triple without touching ``transformers``
    again. This matches ``get_embedder``, which also remembers a failed load.

    Returns:
        ``(tok, model, True)`` on success, ``(None, None, False)`` on failure.
    """
    if _MT5.get("tried") or _MT5["ok"]:
        return _MT5["tok"], _MT5["model"], _MT5["ok"]

    _MT5["tried"] = True
    try:
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tok = AutoTokenizer.from_pretrained("google/mt5-small")
        model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
    except Exception:
        # Best effort: any import/download/load problem disables this mode and
        # lets callers fall back to the fill-in-the-blank generator.
        _MT5["tok"] = None
        _MT5["model"] = None
        _MT5["ok"] = False
    else:
        _MT5["tok"] = tok
        _MT5["model"] = model
        _MT5["ok"] = True
    return _MT5["tok"], _MT5["model"], _MT5["ok"]
|
| 401 |
+
|
| 402 |
+
def parse_json_block(s: str) -> Optional[dict]:
    """Return the first valid JSON object embedded in ``s``, or ``None``.

    Every ``{`` in the text is tried in order as the start of a JSON value;
    the first one that decodes to a ``dict`` wins. Text before or after the
    object (including stray braces) is ignored.

    Unlike a plain ``json.loads`` plus a greedy ``{.*}`` regex, this never
    returns a list/str/number, and it still finds the object when a later
    ``}`` appears in trailing text.

    Args:
        s: Raw model output that may contain a JSON object.

    Returns:
        The decoded ``dict``, or ``None`` when no JSON object is found.
    """
    if not isinstance(s, str):
        return None
    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(s, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            obj = None
        if isinstance(obj, dict):
            return obj
        start = s.find("{", start + 1)
    return None
|
| 417 |
+
|
| 418 |
+
def comp_prompt(sentence: str) -> str:
    """Build the Arabic instruction prompt for one comprehension question.

    The prompt asks for a single multiple-choice question with four choices
    and the index of the correct one, shows the expected JSON shape, and ends
    with the source sentence.
    """
    instructions = [
        "أنت منشئ أسئلة متعددة الخيارات باللغة العربية.",
        "من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.",
        "أعد فقط JSON بهذا الشكل:",
    ]
    json_shape = [
        '{"question": "...",',
        '"choices": ["...","...","...","..."],',
        '"answer_index": 0',
        "}",
    ]
    preamble = "\n".join(instructions + json_shape)
    return preamble + "\n\n" + f"الجملة: {sentence}"
|
| 431 |
+
|
| 432 |
+
def gen_one_comp_q(sentence: str, tok, model, max_new_tokens=128) -> Optional[MCQ]:
    """Generate one comprehension MCQ for ``sentence`` with the given model.

    The prompt from ``comp_prompt`` is tokenized, the model samples one
    continuation, and the decoded text is parsed with ``parse_json_block``.

    Args:
        sentence: Source sentence the question is about.
        tok: Tokenizer returned by ``get_mt5``.
        model: Seq2seq model returned by ``get_mt5``.
        max_new_tokens: Generation length limit passed to ``model.generate``.

    Returns:
        An ``MCQ`` with exactly four choices, or ``None`` when generation
        fails or the output lacks a question or at least four choices.
    """
    try:
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        inp = tok(comp_prompt(sentence), return_tensors="pt").to(device)
        out = model.generate(
            **inp,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.8,
            top_p=0.9,
            num_return_sequences=1,
            eos_token_id=tok.eos_token_id
        )
        text = tok.decode(out[0], skip_special_tokens=True)
    except Exception:
        # Best effort: a missing torch, device error or generation failure
        # just means "no question for this sentence".
        return None

    data = parse_json_block(text)
    if not isinstance(data, dict):
        return None

    def _as_text(value) -> str:
        # JSON null must become "", not the literal string "None".
        return "" if value is None else str(value).strip()

    q = _as_text(data.get("question"))
    choices = data.get("choices", [])
    if not q or not isinstance(choices, list) or len(choices) < 4:
        return None
    choices = [_as_text(c) for c in choices[:4]]

    ai = data.get("answer_index", 0)
    # bool is a subclass of int; treat true/false as an invalid index.
    if isinstance(ai, bool) or not isinstance(ai, int) or not 0 <= ai < 4:
        ai = 0
    return MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ai)
|
| 459 |
+
|
| 460 |
+
def make_comp_mcqs(text: str, n: int = 6) -> List[MCQ]:
    """Build up to ``n`` comprehension MCQs, falling back to ``make_mcqs``.

    The fill-in-the-blank generator ``make_mcqs`` is used instead when the
    mT5 model is unavailable, when the text has no clean sentences, or when
    generation yields nothing. Otherwise clean sentences are shuffled and fed
    to ``gen_one_comp_q`` one by one until ``n`` questions exist or ``n * 6``
    sentences have been tried.
    """
    tok, model, ok = get_mt5()
    if not ok:
        return make_mcqs(text, n)

    candidates = [sent for sent in split_sents(text) if is_clean_sentence(sent)]
    if not candidates:
        return make_mcqs(text, n)
    random.shuffle(candidates)

    def squeeze(value):
        # Collapse whitespace runs to one space and trim both ends.
        return re2.sub(r"\s+", " ", value).strip()

    attempt_cap = n * 6
    generated: List[MCQ] = []
    for attempt, sent in enumerate(candidates, start=1):
        if len(generated) >= n:
            break
        produced = gen_one_comp_q(sent, tok, model)
        if produced:
            produced.question = squeeze(produced.question)
            produced.choices = [squeeze(c) or "…" for c in produced.choices]
            generated.append(produced)
        if attempt >= attempt_cap:
            break

    if not generated:
        return make_mcqs(text, n)

    # Rebuild every item with exactly four choices and an in-range answer index.
    padding = ["…", "…", "…", "…"]
    return [
        MCQ(
            id=item.id,
            question=item.question,
            choices=(item.choices + padding)[:4],
            answer_index=item.answer_index if 0 <= item.answer_index < 4 else 0,
        )
        for item in generated[:n]
    ]
|
| 493 |
+
|
| 494 |
+
# ------------------ تحويل إلى سجلات العرض ------------------
|
| 495 |
def clean_option_text(t: str) -> str:
|
| 496 |
t = (t or "").strip()
|
| 497 |
t = re2.sub(AR_DIAC, "", t)
|
|
|
|
| 516 |
|
| 517 |
# ------------------ صفحة الأسئلة (HTML فقط) ------------------
|
| 518 |
def render_quiz_html(records: List[dict]) -> str:
|
| 519 |
+
parts=[]
|
| 520 |
for i, rec in enumerate(records, start=1):
|
| 521 |
+
qid = rec["id"]
|
| 522 |
qtxt = rec["question"]
|
| 523 |
+
cor = next((o["id"] for o in rec["options"] if o["is_correct"]), "")
|
| 524 |
+
opts_html=[]
|
| 525 |
for o in rec["options"]:
|
| 526 |
lid, txt = o["id"], o["text"]
|
| 527 |
opts_html.append(f"""
|
|
|
|
| 547 |
""")
|
| 548 |
return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
|
| 549 |
|
|
|
|
| 550 |
# ------------------ توليد الامتحان وتبديل الصفحات ------------------
|
| 551 |
+
def build_quiz(text_area, file_path, n, model_id, zoom, mode):
|
| 552 |
text_area = (text_area or "").strip()
|
| 553 |
if not text_area and not file_path:
|
| 554 |
return "", gr.update(visible=True), gr.update(visible=False), "🛈 الصق نصًا أو ارفع ملفًا أولًا."
|
|
|
|
| 557 |
else:
|
| 558 |
raw, _ = file_to_text(file_path, model_id=model_id, zoom=float(zoom))
|
| 559 |
cleaned = postprocess(raw)
|
| 560 |
+
|
| 561 |
+
# اختيار الطور
|
| 562 |
+
try:
|
| 563 |
+
if mode == "فهم مباشر":
|
| 564 |
+
items = make_comp_mcqs(cleaned, n=int(n))
|
| 565 |
+
else:
|
| 566 |
+
items = make_mcqs(cleaned, n=int(n))
|
| 567 |
+
except Exception as e:
|
| 568 |
+
# fallback النهائي
|
| 569 |
+
items = make_mcqs(cleaned, n=int(n))
|
| 570 |
+
|
| 571 |
recs = to_records(items)
|
| 572 |
return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), ""
|
| 573 |
|
|
|
|
| 581 |
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
|
| 582 |
h2.top{color:#eaeaf2;margin:6px 0 16px}
|
| 583 |
|
| 584 |
+
/* صفحة الإدخال ثابتة الارتفاع ولا تتغير أبعاده */
|
| 585 |
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
|
| 586 |
box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
|
| 587 |
.small{opacity:.9;color:#d9dee8}
|
|
|
|
| 618 |
}
|
| 619 |
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
|
| 620 |
.q-note{color:#ffd1d6}
|
| 621 |
+
.q-note.warn{color:#ffd1d6}
|
| 622 |
"""
|
| 623 |
|
| 624 |
+
# ------------------ JS: ربط Submit بعد الرندر (كما هو مع إبراز الصح) ------------------
|
| 625 |
ATTACH_LISTENERS_JS = """
|
| 626 |
() => {
|
|
|
|
| 627 |
if (window.__q_submit_bound_multi2) { return 'already'; }
|
| 628 |
window.__q_submit_bound_multi2 = true;
|
| 629 |
|
|
|
|
| 646 |
|
| 647 |
const chosenLabel = chosen.closest('.opt');
|
| 648 |
|
|
|
|
| 649 |
if (chosen.value === correct) {
|
| 650 |
chosenLabel.classList.add('ok');
|
| 651 |
if (badge){ badge.hidden=false; badge.className='q-badge ok'; badge.textContent='Correct!'; }
|
|
|
|
| 652 |
card.querySelectorAll('input[type="radio"]').forEach(i => i.disabled = true);
|
| 653 |
e.target.disabled = true;
|
| 654 |
if (note) note.textContent = '';
|
| 655 |
|
|
|
|
| 656 |
const qNode = card.querySelector('.q-text');
|
| 657 |
if (qNode){
|
| 658 |
+
const full = qNode.textContent || qNode.innerText || '';
|
| 659 |
const correctText = [...card.querySelectorAll('.opt')].find(o =>
|
| 660 |
o.querySelector('input').value === correct
|
| 661 |
)?.querySelector('.opt-text')?.textContent || '';
|
| 662 |
+
if (full && correctText && full.includes('_____')){
|
| 663 |
const highlighted = full.replace('_____', `<mark style="background:#2dd4bf22;border:1px solid #2dd4bf55;border-radius:6px;padding:0 4px">${correctText}</mark>`);
|
| 664 |
qNode.innerHTML = highlighted;
|
| 665 |
}
|
|
|
|
| 667 |
return;
|
| 668 |
}
|
| 669 |
|
|
|
|
| 670 |
chosenLabel.classList.add('err');
|
| 671 |
if (badge){ badge.hidden=false; badge.className='q-badge err'; badge.textContent='Incorrect.'; }
|
| 672 |
if (note) note.textContent = '';
|
|
|
|
| 676 |
}
|
| 677 |
"""
|
| 678 |
|
| 679 |
+
# ------------------ واجهة Gradio (نفس الصفحتين + اختيار نوع السؤال) ------------------
|
| 680 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 681 |
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 682 |
|
|
|
|
| 688 |
file_comp = gr.File(label="أو ارفع ملف (PDF / TXT)", file_count="single",
|
| 689 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 690 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 691 |
+
# جديد: اختيار نوع السؤال دون تغيير بنية الصفحة
|
| 692 |
+
mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
|
| 693 |
+
|
| 694 |
with gr.Accordion("خيارات PDF المصوّر (اختياري)", open=False):
|
| 695 |
trocr_model = gr.Dropdown(
|
| 696 |
choices=[
|
|
|
|
| 714 |
# بناء الامتحان + تبديل الصفحات + ربط الـJS
|
| 715 |
btn_build.click(
|
| 716 |
build_quiz,
|
| 717 |
+
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio],
|
| 718 |
outputs=[quiz_html, page1, page2, warn]
|
| 719 |
).then(
|
| 720 |
None, inputs=None, outputs=[js_wired], js=ATTACH_LISTENERS_JS
|