Leen172 committed
Commit 4988947 · verified · 1 Parent(s): d3a2eea

Update app.py

Files changed (1): app.py (+178 −79)
app.py CHANGED
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# صفحتان ثابتتان + Submit لكل سؤال يعمل فعليًا + منع تغيّر أبعاد صفحة الإدخال
# + طور اختياري لأسئلة فهم مباشر باستخدام mT5 (تحميل كسول + fallback)
+ # + تحكم بدرجة الصعوبة + مشتّتات أقوى (BM25) + فلاتر POS/NER وCross-Encoder اختياريين

import os, json, uuid, random, unicodedata
from dataclasses import dataclass
@@ -19,17 +20,29 @@ random.seed(42)
DEFAULT_NUM_QUESTIONS = 6
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
DEFAULT_TROCR_ZOOM = 2.6
- QUESTION_MODES = ["فراغ", "فهم مباشر"] # جديد
+ QUESTION_MODES = ["فراغ", "فهم مباشر"]
+ DIFFICULTY_MODES = ["سهل", "متوسط", "صعب"]
+
+ # NEW: BM25 (اختياري)
+ try:
+     from rank_bm25 import BM25Okapi
+     _HAS_BM25 = True
+ except Exception:
+     _HAS_BM25 = False

# ------------------ OCR (تحميل كسول) ------------------
_OCR = {}
def get_ocr(model_id: str):
-     from transformers import pipeline
-     import torch
-     dev = 0 if torch.cuda.is_available() else -1
-     if model_id not in _OCR:
-         _OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
-     return _OCR[model_id]
+     try:
+         from transformers import pipeline
+         import torch
+         dev = 0 if torch.cuda.is_available() else -1
+         if model_id not in _OCR:
+             _OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
+         return _OCR[model_id]
+     except Exception:
+         # ارجعي دالة وهمية تعيد نصًا فارغًا بدل التعطّل
+         return lambda im: [{"generated_text": ""}]

# ------------------ PDF/TXT → نص ------------------
def extract_text_with_pypdf(path: str) -> str:
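(Aside, not part of the commit: a minimal usage sketch, assuming the fallback above is kept. Both the real transformers image-to-text pipeline and the dummy lambda return a list of dicts carrying "generated_text", so callers stay uniform; page_image is a hypothetical PIL image of one PDF page.)

ocr = get_ocr(DEFAULT_TROCR_MODEL)               # real pipeline, or the no-op fallback on failure
result = ocr(page_image)                         # e.g. [{"generated_text": "..."}]
page_text = result[0].get("generated_text", "")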
@@ -123,7 +136,7 @@ def split_sents(t:str)->List[str]:
    return [x for x in s if len(x)>=25]

# ====== (1) عبارات مفتاحية أذكى: n=3 ثم 2 ثم 1، مع فلترة ======
- def yake_keywords(t: str, k: int = 160) -> List[str]:
+ def yake_keywords(t: str, k: int = 200) -> List[str]:
    phrases = []
    seen = set()
    for n in [3, 2, 1]:
@@ -208,7 +221,7 @@ def get_embedder():
        _EMB = False
    return _EMB

- def nearest_terms(target: str, pool: List[str], k: int = 12) -> List[Tuple[str, float]]:
+ def nearest_terms(target: str, pool: List[str], k: int = 24) -> List[Tuple[str, float]]:
    emb = get_embedder()
    if not emb:
        return []
@@ -218,7 +231,7 @@ def nearest_terms(target: str, pool: List[str], k: int = 12) -> List[Tuple[str,
    vecs = emb.encode([target] + cand, normalize_embeddings=True)
    t, C = vecs[0], vecs[1:]
    import numpy as np
-     sims = (C @ t)
+     sims = (C @ t) # cosine لأن المتجهات مُطبّعة
    idx = np.argsort(-sims)[:k]
    return [(cand[i], float(sims[i])) for i in idx]
@@ -233,19 +246,19 @@ def get_masker():
        _MLM = False
    return _MLM

- def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 8) -> List[str]:
+ def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 18) -> List[str]:
    masker = get_masker()
    if not masker:
        return []
    masked = sentence_with_blank.replace("_____", masker.tokenizer.mask_token)
    try:
-         outs = masker(masked, top_k=max(15, k+5))
+         outs = masker(masked, top_k=max(25, k+7))
        cands = []
        for o in outs:
            tok = o["token_str"].strip()
            if tok and tok != correct and len(tok) >= 2 and not re2.match(r"^[\p{P}\p{S}\d_]+$", tok):
                cands.append(tok)
-         seen = set(); uniq = []
+         uniq, seen = [], set()
        for w in cands:
            if w not in seen:
                uniq.append(w); seen.add(w)
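(Aside, not from the commit: the inline comment added to nearest_terms a little above, on sims = (C @ t), holds because encode(..., normalize_embeddings=True) L2-normalizes the vectors, so a plain dot product already equals cosine similarity. A quick check with made-up vectors:)

import numpy as np
a, b = np.array([3.0, 4.0]), np.array([1.0, 0.0])
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)   # same normalization the embedder applies
print(float(a @ b))                                   # 0.6 == cos(a, b)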
@@ -298,21 +311,94 @@ def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
    except Exception:
        return candidates

- def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3) -> List[str]:
+ # --------- أدوات BM25 للصعوبة ---------
+ def tokenize_ar(s: str) -> List[str]:
+     s = norm_ar(s)
+     toks = re2.findall(r"\p{L}+", s)
+     return [t for t in toks if len(t) >= 2 and t not in AR_STOP]
+
+ def bm25_build(sentences: List[str]):
+     if not _HAS_BM25 or not sentences:
+         return None, []
+     corpus_tokens = [tokenize_ar(s) for s in sentences]
+     bm = BM25Okapi(corpus_tokens)
+     return bm, corpus_tokens
+
+ def bm25_candidates(correct: str, sentences: List[str], bm, corpus_tokens, top: int = 20) -> List[str]:
+     if not bm: return []
+     q = tokenize_ar(correct)
+     scores = bm.get_scores(q)
+     idxs = sorted(range(len(scores)), key=lambda i: -scores[i])[:min(top, len(scores))]
+     pool = set()
+     for i in idxs:
+         for tok in corpus_tokens[i]:
+             if tok != correct and good_kw(tok):
+                 pool.add(tok)
+     return list(pool)
+
+ def pick_clean_sentences(sents: List[str], difficulty: str) -> List[str]:
+     out = []
+     for s in sents:
+         if not is_clean_sentence(s):
+             continue
+         L = len(s)
+         if difficulty == "سهل" and not (70 <= L <= 180):
+             continue
+         if difficulty == "متوسط" and not (70 <= L <= 220):
+             continue
+         if difficulty == "صعب" and not (60 <= L <= 240):
+             continue
+         out.append(s)
+     return out
+
+ def similarity_caps(difficulty: str):
+     if difficulty == "سهل":
+         return 0.88
+     if difficulty == "صعب":
+         return 0.95
+     return 0.92
+
+ # ====== مشتّتات ذكية مع الصعوبة وBM25 ======
+ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
+                       all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
    base = []
-     base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=20)])
-     for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=15):
-         if w not in base: base.append(w)
+
+     # (أ) جيران دلاليين
+     base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
+
+     # (ب) FILL-MASK
+     for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=18):
+         if w not in base:
+             base.append(w)
+
+     # (ج) BM25 من النص
+     if all_sentences:
+         bm, corp = bm25_build(all_sentences)
+         for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
+             if w not in base:
+                 base.append(w)
+
+     # فلترة POS/NER وطول وتطبيع
    clean = []
    for w in base:
        w = w.strip()
-         if not w or w == correct: continue
-         if is_named_entity(w): continue
-         if not pos_compatible(w, correct): continue
-         if not length_close(w, correct): continue
-         if norm_ar(w) == norm_ar(correct): continue
+         if not w or w == correct:
+             continue
+         if is_named_entity(w):
+             continue
+         if not pos_compatible(w, correct):
+             continue
+         if not length_close(w, correct):
+             continue
+         if norm_ar(w) == norm_ar(correct):
+             continue
        clean.append(w)
-     clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*2, k)]
+
+     # ترتيب Cross-Encoder (اختياري)
+     clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*3, k)]
+
+     # حذف المشتّت شديد القرب دلالياً
+     cap = similarity_caps(difficulty)
    try:
        emb = get_embedder()
        if emb and clean:
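(Orientation only, not part of the diff: the BM25 helpers introduced above are meant to be chained as below. The sample sentences and target word are invented, and the pool stays empty unless rank_bm25 is installed, i.e. _HAS_BM25 is True.)

sentences = ["الخلية هي الوحدة الأساسية لبناء الكائن الحي.", "تتحكم النواة في أنشطة الخلية وانقسامها."]
bm, corpus_tokens = bm25_build(sentences)                        # (None, []) when rank_bm25 is missing
pool = bm25_candidates("النواة", sentences, bm, corpus_tokens, top=10)
# pool: tokens drawn from the highest-scoring sentences; smart_distractors then filters them by POS/NER/length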
@@ -320,11 +406,12 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
            c, others = vecs[0], vecs[1:]
            import numpy as np
            sims = others @ c
-             filtered = [w for w, s in zip(clean, sims) if s < 0.92]
+             filtered = [w for w, s in zip(clean, sims) if s < cap]
            if len(filtered) >= k:
                clean = filtered
    except Exception:
        pass
+
    out = clean[:k]
    while len(out) < k:
        extra = [w for w in phrase_pool if w not in out and w != correct and length_close(w, correct)]
@@ -334,43 +421,55 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
        out.extend(legacy_distractors(correct, phrase_pool, k=k-len(out)))
    return out[:k]

- # ====== (4-أ) مُولِّد أسئلة "فراغ" (القائم) ======
- def make_mcqs(text:str, n:int=6)->List[MCQ]:
-     sents=split_sents(text)
-     if not sents:
+ # ====== (4-أ) مُولِّد أسئلة "فراغ" ======
+ def make_mcqs(text:str, n:int=6, difficulty: str = "متوسط")->List[MCQ]:
+     all_sents = split_sents(text)
+     sents = pick_clean_sentences(all_sents, difficulty)
+     if not sents:
        raise ValueError("النص قصير أو غير صالح.")
-     keyphrases = yake_keywords(text, k=160)
+
+     keyphrases = yake_keywords(text, k=200)
    keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]
+
+     # ربط العبارة بجملة مناسبة (ظهور وحيد)
    sent_for={}
    for s in sents:
-         if not is_clean_sentence(s): continue
        for kp in keyphrases:
            if kp in sent_for: continue
            hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
            if len(hits) == 1:
                sent_for[kp]=s
-         if len(sent_for)>=n*3: break
+         if len(sent_for)>=n*3:
+             break
+
    if not sent_for:
        tokens = [t for t in re2.findall(r"[\p{L}\p{N}_]+", text) if good_kw(t)]
        freq = [w for w,_ in sorted(((t, text.count(t)) for t in tokens), key=lambda x:-x[1])]
-         keyphrases = [w for w in freq if safe_keyword(w)][:120]
+         keyphrases = [w for w in freq if safe_keyword(w)][:150]
        for s in sents:
-             if not is_clean_sentence(s): continue
            for kp in keyphrases:
                if kp in sent_for: continue
                hits = re2.findall(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s)
                if len(hits) == 1: sent_for[kp]=s
-             if len(sent_for)>=n*2: break
+             if len(sent_for)>=n*2:
+                 break
+
    if not sent_for:
        raise RuntimeError("تعذّر توليد أسئلة من هذا النص.")
+
+     # أولوية للعبارات الأطول (أعلميّة أعلى)
    items=[]; used_sents=set(); used_keys=set()
    for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
        if len(items)>=n: break
        s=sent_for[kp]
        if s in used_sents or kp in used_keys: continue
+
        q=re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
+
        pool = [x for x in keyphrases if x != kp]
-         ch = smart_distractors(kp, pool, s, k=3) + [kp]
+         ch = smart_distractors(kp, pool, s, k=3, all_sentences=all_sents, difficulty=difficulty) + [kp]
+
+         # تنظيف ومنع تكرار وضمان أربع خيارات
        clean_choices=[]; seen=set()
        for c in ch:
            c = c.strip()
@@ -379,13 +478,15 @@ def make_mcqs(text:str, n:int=6)->List[MCQ]:
        ch = clean_choices[:4]
        while len(ch)<4: ch.append("…")
        random.shuffle(ch); ans=ch.index(kp) if kp in ch else 3
+
        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
        used_sents.add(s); used_keys.add(kp)
-     if not items:
+
+     if not items:
        raise RuntimeError("تعذّر توليد أسئلة.")
    return items

- # ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (توليدي mT5) ======
+ # ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (mT5) ======
_MT5 = {"tok": None, "model": None, "ok": False}
def get_mt5():
    if _MT5["tok"] is not None or _MT5["model"] is not None or _MT5["ok"]:
@@ -400,13 +501,10 @@ def get_mt5():
    return _MT5["tok"], _MT5["model"], _MT5["ok"]

def parse_json_block(s: str) -> Optional[dict]:
-     # حاول التقاط أول كائن JSON صالح
    try:
-         # إن وُجد JSON مباشر
        return json.loads(s)
    except Exception:
        pass
-     # التقط أقواس {} الأولى والأخيرة
    m = re2.search(r"\{.*\}", s, flags=re2.DOTALL)
    if m:
        try:
@@ -416,7 +514,6 @@ def parse_json_block(s: str) -> Optional[dict]:
    return None

def comp_prompt(sentence: str) -> str:
-     # تعليمات قصيرة ومحددة مع تنسيق JSON
    return (
        "أنت منشئ أسئلة متعددة الخيارات باللغة العربية.\n"
        "من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.\n"
@@ -457,15 +554,18 @@ def gen_one_comp_q(sentence: str, tok, model, max_new_tokens=128) -> Optional[MC
    except Exception:
        return None

- def make_comp_mcqs(text: str, n: int = 6) -> List[MCQ]:
+ def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
    tok, model, ok = get_mt5()
    if not ok:
-         # لو ما توفر mT5 نرجع للفراغ
-         return make_mcqs(text, n)
-     sents = [s for s in split_sents(text) if is_clean_sentence(s)]
+         return make_mcqs(text, n, difficulty=difficulty)
+
+     sents_all = split_sents(text)
+     sents = pick_clean_sentences(sents_all, difficulty)
    if not sents:
-         return make_mcqs(text, n)
+         return make_mcqs(text, n, difficulty=difficulty)
+
    random.shuffle(sents)
+
    items: List[MCQ] = []
    tried = 0
    for s in sents:
@@ -473,23 +573,24 @@ def make_comp_mcqs(text: str, n: int = 6) -> List[MCQ]:
        mcq = gen_one_comp_q(s, tok, model)
        tried += 1
        if mcq:
-             # تنظيف بسيط للخيار والنص
-             mcq.question = re2.sub(r"\s+", " ", mcq.question).strip()
-             mcq.choices = [re2.sub(r"\s+", " ", c).strip() or "…" for c in mcq.choices]
-             items.append(mcq)
-         if tried >= n * 6: # سقف محاولات معقول
+             q = re2.sub(r"\s+", " ", mcq.question).strip()
+             if not (12 <= len(q) <= 220):
+                 continue
+             choices = [re2.sub(r"\s+", " ", c).strip() for c in mcq.choices]
+             seen=set(); clean=[]
+             for c in choices:
+                 if c and c not in seen:
+                     seen.add(c); clean.append(c)
+             clean = (clean + ["…","…","…","…"])[:4]
+             ai = mcq.answer_index if isinstance(mcq.answer_index,int) and 0<=mcq.answer_index<4 else 0
+
+             items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=clean, answer_index=ai))
+         if tried >= n * 7:
            break
+
    if not items:
-         # fallback احتياطي
-         return make_mcqs(text, n)
-     # توحيد البنية (A..D) بنفس الشكل
-     normed=[]
-     for it in items[:n]:
-         # القص إلى 4 خيارات وتأمين الفهارس
-         ch = (it.choices + ["…","…","…","…"])[:4]
-         ai = it.answer_index if 0 <= it.answer_index < 4 else 0
-         normed.append(MCQ(id=it.id, question=it.question, choices=ch, answer_index=ai))
-     return normed
+         return make_mcqs(text, n, difficulty=difficulty)
+     return items[:n]

# ------------------ تحويل إلى سجلات العرض ------------------
def clean_option_text(t: str) -> str:
@@ -548,30 +649,25 @@ def render_quiz_html(records: List[dict]) -> str:
    return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""

# ------------------ توليد الامتحان وتبديل الصفحات ------------------
- def build_quiz(text_area, file_path, n, model_id, zoom, mode):
+ def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
    text_area = (text_area or "").strip()
    if not text_area and not file_path:
        return "", gr.update(visible=True), gr.update(visible=False), "🛈 الصق نصًا أو ارفع ملفًا أولًا."
-     if text_area:
-         raw = text_area
-     else:
-         raw, _ = file_to_text(file_path, model_id=model_id, zoom=float(zoom))
+     raw = text_area if text_area else file_to_text(file_path, model_id=model_id, zoom=float(zoom))[0]
    cleaned = postprocess(raw)

-     # اختيار الطور
    try:
        if mode == "فهم مباشر":
-             items = make_comp_mcqs(cleaned, n=int(n))
+             items = make_comp_mcqs(cleaned, n=int(n), difficulty=difficulty)
        else:
-             items = make_mcqs(cleaned, n=int(n))
-     except Exception as e:
-         # fallback النهائي
-         items = make_mcqs(cleaned, n=int(n))
+             items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
+     except Exception:
+         items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)

-     recs = to_records(items)
+     recs = to_records(items)
    return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), ""

- # ------------------ CSS (كما هو) ------------------
+ # ------------------ CSS ------------------
CSS = """
:root{
  --bg:#0e0e11; --panel:#15161a; --card:#1a1b20; --muted:#a7b0be;
@@ -618,10 +714,10 @@ textarea{min-height:120px}
}
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
.q-note{color:#ffd1d6}
- .q-note.warn{color:#ffd1د6}
+ .q-note.warn{color:#ffd1d6}
"""

- # ------------------ JS: ربط Submit بعد الرندر (كما هو مع إبراز الصح) ------------------
+ # ------------------ JS: ربط Submit بعد الرندر (مع إبراز الصح لأسئلة الفراغ) ------------------
ATTACH_LISTENERS_JS = """
() => {
  if (window.__q_submit_bound_multi2) { return 'already'; }
@@ -676,7 +772,7 @@
}
"""

- # ------------------ واجهة Gradio (نفس الصفحتين + اختيار نوع السؤال) ------------------
+ # ------------------ واجهة Gradio ------------------
with gr.Blocks(title="Question Generator", css=CSS) as demo:
    gr.Markdown("<h2 class='top'>Question Generator</h2>")

@@ -688,8 +784,10 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
        file_comp = gr.File(label="أو ارفع ملف (PDF / TXT)", file_count="single",
                            file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
        num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
-         # جديد: اختيار نوع السؤال دون تغيير بنية الصفحة
+
+         # خيارات إضافية بدون تغيير البنية العامة
        mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
+         difficulty_radio = gr.Radio(choices=DIFFICULTY_MODES, value="متوسط", label="درجة الصعوبة")

        with gr.Accordion("خيارات PDF المصوّر (اختياري)", open=False):
            trocr_model = gr.Dropdown(
702
  value=DEFAULT_TROCR_MODEL, label="نموذج TrOCR"
703
  )
704
  trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="Zoom OCR")
 
705
  btn_build = gr.Button("generate quistion", elem_classes=["button-primary"])
706
  warn = gr.Markdown("", elem_classes=["small"])
707
 
@@ -714,7 +813,7 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
    # بناء الامتحان + تبديل الصفحات + ربط الـJS
    btn_build.click(
        build_quiz,
-         inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio],
+         inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio, difficulty_radio],
        outputs=[quiz_html, page1, page2, warn]
    ).then(
        None, inputs=None, outputs=[js_wired], js=ATTACH_LISTENERS_JS
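(Usage note, illustrative only: after this commit both generators and build_quiz accept the difficulty setting, so a caller could write, for example:)

items_hard = make_mcqs(cleaned, n=6, difficulty="صعب")          # blank-style questions, hard
items_easy = make_comp_mcqs(cleaned, n=6, difficulty="سهل")     # mT5 comprehension questions, easy

where cleaned is the output of postprocess() on the extracted text.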
 