Leen172 committed on
Commit
a5f2a6f
·
verified ·
1 Parent(s): 254fb45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -129
app.py CHANGED
@@ -27,7 +27,7 @@ from tqdm import tqdm
27
  random.seed(42)
28
  DEFAULT_LANG = "ar"
29
  DEFAULT_NUM_QUESTIONS = 8
30
- DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed" # أسرع من large
31
  DEFAULT_TROCR_ZOOM = 2.8
32
 
33
  # كاش بسيط للـ OCR pipeline (تحميل كسول)
@@ -42,7 +42,7 @@ def _get_ocr_pipeline(model_id: str):
42
  return _OCR_PIPE[model_id]
43
 
44
  # =========================
45
- # 2) استخراج النص من PDF
46
  # =========================
47
  def extract_text_with_pypdf(pdf_path: str) -> str:
48
  reader = PdfReader(pdf_path)
@@ -88,7 +88,8 @@ def is_extraction_good(text: str, min_chars: int = 250, min_alpha_ratio: float =
88
  return ratio >= min_alpha_ratio
89
 
90
  def save_text(text: str, out_path: str) -> None:
91
- os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
 
92
  with open(out_path, "w", encoding="utf-8") as f:
93
  f.write(text)
94
 
@@ -103,7 +104,6 @@ def pdf_to_txt(pdf_path: str, out_txt_path: str = None,
103
  method = "embedded (pypdf)"
104
  else:
105
  if not ocr_model:
106
- # وضع تجريبي بلا OCR
107
  final_text = embedded_text
108
  method = "embedded (pypdf: weak)"
109
  else:
@@ -125,12 +125,9 @@ def strip_page_headers(text: str) -> str:
125
  lines = text.splitlines()
126
  out = []
127
  for ln in lines:
128
- if re2.match(r"^\s*--- \[Page \d+\] ---\s*$", ln):
129
- continue
130
- if re2.match(r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$", ln):
131
- continue
132
- if re2.match(r"^\s*[-–—_*]{3,}\s*$", ln):
133
- continue
134
  out.append(ln)
135
  return "\n".join(out)
136
 
@@ -143,8 +140,8 @@ def normalize_arabic(text: str) -> str:
143
  text = re2.sub(r"[يى]", "ي", text)
144
  text = re2.sub(r"\s+", " ", text)
145
  # إزالة التكرار الزائد للحروف (مثل جذرياا -> جذريا)
146
- text = re2.sub(r'(\p{L})\1{2,}', r'\1', text) # أكثر من مرتين
147
- text = re2.sub(r'(\p{L})\1', r'\1', text) # التكرار المتبقي
148
  return text.strip()
149
 
150
  def arabic_ocr_fixes(text: str) -> str:
@@ -193,14 +190,10 @@ def top_keywords_yake(text: str, max_k: int = 120, lan: str = 'ar') -> List[str]
193
  seen, out = set(), []
194
  for k in candidates:
195
  kk = k.strip()
196
- if not kk or kk in seen:
197
- continue
198
- if lan == "ar" and kk in AR_STOP:
199
- continue
200
- if len(kk) < 3:
201
- continue
202
- if re2.match(r"^[\p{P}\p{S}]+$", kk):
203
- continue
204
  seen.add(kk)
205
  out.append(kk)
206
  return out
@@ -231,26 +224,18 @@ def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
231
  target_len = len(correct.strip())
232
  cand = []
233
  for w in pool:
234
- if not w:
235
- continue
236
  w2 = w.strip()
237
- if w2 == correct.strip():
238
- continue
239
- if len(w2) < 3 or w2 in AR_STOP:
240
- continue
241
- if re2.match(r"^[\p{P}\p{S}\d_]+$", w2):
242
- continue
243
- # تقارب طولي
244
  if abs(len(w2) - target_len) <= 3:
245
  cand.append(w2)
246
-
247
  random.shuffle(cand)
248
  out = []
249
  for w in cand:
250
  out.append(w)
251
- if len(out) == k:
252
- break
253
-
254
  fillers = ["—", "— —", "—-"]
255
  while len(out) < k:
256
  out.append(random.choice(fillers))
@@ -260,7 +245,6 @@ def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
260
  sentences = split_sentences(text)
261
  if not sentences:
262
  raise ValueError("النص قصير جدًا أو غير صالح لتوليد أسئلة.")
263
-
264
  keywords = top_keywords_yake(text, max_k=160, lan=lang)
265
  if not keywords:
266
  toks = re2.findall(r"[\p{L}\p{N}_]+", text)
@@ -269,27 +253,20 @@ def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
269
  for t in toks:
270
  freq[t] = freq.get(t, 0) + 1
271
  keywords = [w for w, c in sorted(freq.items(), key=lambda x: -x[1])][:80]
272
-
273
  sent_for_kw = {}
274
  for s in sentences:
275
  for kw in keywords:
276
- if not _is_good_kw(kw):
277
- continue
278
  if re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for_kw:
279
  sent_for_kw[kw] = s
280
-
281
  items: List[MCQ] = []
282
  used_sents = set()
283
  pool_iter = [kw for kw in keywords if kw in sent_for_kw]
284
-
285
  for kw in pool_iter:
286
- if len(items) >= n:
287
- break
288
- if not _is_good_kw(kw):
289
- continue
290
  s = sent_for_kw[kw]
291
- if s in used_sents:
292
- continue
293
  blanked = re2.sub(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", "_____", s, count=1)
294
  correct = kw
295
  distractors = build_distractors(correct, [x for x in keywords if x != kw], k=3)
@@ -297,15 +274,8 @@ def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
297
  random.shuffle(choices)
298
  ans_idx = choices.index(correct)
299
  exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
300
- items.append(MCQ(
301
- id=str(uuid.uuid4())[:8],
302
- question=blanked,
303
- choices=choices,
304
- answer_index=ans_idx,
305
- explanation=exp
306
- ))
307
  used_sents.add(s)
308
-
309
  if not items:
310
  raise RuntimeError("تعذر توليد أسئلة من النص. جرّب نصاً أطول أو مختلفاً.")
311
  return items
@@ -317,67 +287,196 @@ AR_PUNCT = "،؛؟"
317
  EN_PUNCT = ",;?"
318
 
319
  def normalize_punct(s: str) -> str:
320
- if not s:
321
- return ""
322
  s = s.replace(",", "،").replace(";", "؛").replace("?", "؟")
323
  return s.strip().strip(AR_PUNCT + EN_PUNCT).strip()
324
 
325
  def is_bad_choice(txt: str) -> bool:
326
- if not txt:
327
- return True
328
  txt = txt.strip()
329
  BAD_NOISE = {"وهنا","اليه","الي","ليبق","لان","لانها","لانّه","ذلك","هذا","هذه"}
330
- if txt in BAD_NOISE:
331
- return True
332
- if len(txt) > 18 and " " not in txt:
333
- return True
334
- if len(txt) < 2:
335
- return True
336
- if txt in AR_STOP:
337
- return True
338
- if re2.match(r"^[\p{P}\p{S}]+$", txt):
339
- return True
340
  return False
341
 
342
  def build_json_records(items: List[MCQ], lang: str, source_pdf: str, method: str, num_questions: int):
343
  json_data = []
344
  letters = ["A", "B", "C", "D"]
345
  for it in items:
346
- opts = []
347
- seen = set()
348
  for idx, lbl in enumerate(letters):
349
  raw = it.choices[idx] if idx < len(it.choices) else ""
350
  txt = normalize_punct(raw)
351
- if is_bad_choice(txt):
352
- txt = ""
353
- if txt in seen:
354
- txt += " "
355
  seen.add(txt)
356
- opts.append({
357
- "id": lbl,
358
- "text": txt,
359
- "is_correct": (it.answer_index == idx)
360
- })
361
  q_clean = normalize_punct(it.question)
362
  exp_clean = normalize_punct(it.explanation)
363
  record = {
364
- "id": it.id,
365
- "question": q_clean,
366
- "options": opts,
367
- "explanation": exp_clean,
368
- "meta": {
369
- "lang": lang,
370
- "normalized": True,
371
- "source_pdf": source_pdf,
372
- "extraction_method": method,
373
- "num_questions": int(num_questions),
374
- }
375
  }
376
  json_data.append(record)
377
  return json_data
378
 
379
  # =========================
380
- # 7) الدالة الرئيسية (دعم PDF و TXT)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  # =========================
382
  def process_pdf(pdf_file_path,
383
  num_questions=DEFAULT_NUM_QUESTIONS,
@@ -389,16 +488,13 @@ def process_pdf(pdf_file_path,
389
  if not pdf_file_path:
390
  return {}, None, "يرجى رفع ملف PDF/TXT أولاً."
391
 
392
- # pdf_file_path قد يكون str أو NamedString -> خذه كمسار
393
  src_path = str(pdf_file_path)
394
  name_guess = getattr(pdf_file_path, "name", "") if hasattr(pdf_file_path, "name") else ""
395
  filename = Path(name_guess).name or Path(src_path).name or "input"
396
  workdir = tempfile.mkdtemp(prefix="mcq_")
397
 
398
- # تأكد من الامتداد
399
  ext = Path(filename).suffix.lower()
400
  if ext not in [".pdf", ".txt"]:
401
- # حاول تخمين نوعه، افتراض PDF
402
  ext = ".pdf"
403
  if not Path(filename).suffix:
404
  filename += ext
@@ -407,7 +503,7 @@ def process_pdf(pdf_file_path,
407
  shutil.copy(src_path, local_path)
408
  logs.append(f"تم نسخ الملف إلى: {local_path}")
409
 
410
- # 1) استخراج النص بحسب النوع
411
  if ext == ".txt":
412
  with open(local_path, "r", encoding="utf-8", errors="ignore") as f:
413
  raw_text = f.read()
@@ -448,45 +544,102 @@ def process_pdf(pdf_file_path,
448
  return {}, None, "\n".join(logs)
449
 
450
  # =========================
451
- # 8) واجهة Gradio (v5)
452
  # =========================
453
  import gradio as gr
454
 
455
- with gr.Blocks(title="PDF/TXT → MCQ JSON (Arabic YAKE / TrOCR)") as demo:
456
- gr.Markdown("## تحويل PDF/TXT إلى أسئلة اختيار من متعدد وإرجاع JSON جاهز للواجهة")
457
-
458
- with gr.Row():
459
- inp_pdf = gr.File(
460
- label="ارفع PDF أو TXT",
461
- file_count="single",
462
- file_types=[".pdf", ".txt"],
463
- type="filepath", # يُعيد مسار الملف
464
- )
465
- with gr.Column():
466
- num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
467
- trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="دقة تحويل PDF لصور (Zoom)")
468
- trocr_model = gr.Dropdown(
469
- choices=[
470
- "microsoft/trocr-base-printed",
471
- "microsoft/trocr-large-printed",
472
- "microsoft/trocr-base-handwritten",
473
- "microsoft/trocr-large-handwritten",
474
- ],
475
- value=DEFAULT_TROCR_MODEL,
476
- label="موديل TrOCR (للـ PDF المصوّر)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  )
478
 
479
- btn = gr.Button("تشغيل المعالجة", variant="primary")
480
- out_json = gr.JSON(label="النتيجة (JSON)")
481
- out_file = gr.File(label="تحميل ملف JSON")
482
- out_log = gr.Textbox(label="Logs", lines=10)
483
-
484
- btn.click(
485
- fn=process_pdf,
486
- inputs=[inp_pdf, num_q, gr.State(DEFAULT_LANG), trocr_model, trocr_zoom],
487
- outputs=[out_json, out_file, out_log]
488
- )
489
-
490
- # ملاحظة: Spaces تتعرف تلقائياً على المتغير "demo".
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
  if __name__ == "__main__":
492
  demo.queue().launch()
 
27
  random.seed(42)
28
  DEFAULT_LANG = "ar"
29
  DEFAULT_NUM_QUESTIONS = 8
30
+ DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
31
  DEFAULT_TROCR_ZOOM = 2.8
32
 
33
  # كاش بسيط للـ OCR pipeline (تحميل كسول)
 
42
  return _OCR_PIPE[model_id]
43
 
44
  # =========================
45
+ # 2) استخراج النص من PDF/TXT
46
  # =========================
47
  def extract_text_with_pypdf(pdf_path: str) -> str:
48
  reader = PdfReader(pdf_path)
 
88
  return ratio >= min_alpha_ratio
89
 
90
  def save_text(text: str, out_path: str) -> None:
91
+ os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True
92
+ )
93
  with open(out_path, "w", encoding="utf-8") as f:
94
  f.write(text)
95
 
 
104
  method = "embedded (pypdf)"
105
  else:
106
  if not ocr_model:
 
107
  final_text = embedded_text
108
  method = "embedded (pypdf: weak)"
109
  else:
 
125
  lines = text.splitlines()
126
  out = []
127
  for ln in lines:
128
+ if re2.match(r"^\s*--- \[Page \d+\] ---\s*$", ln): continue
129
+ if re2.match(r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$", ln): continue
130
+ if re2.match(r"^\s*[-–—_*]{3,}\s*$", ln): continue
 
 
 
131
  out.append(ln)
132
  return "\n".join(out)
133
 
 
140
  text = re2.sub(r"[يى]", "ي", text)
141
  text = re2.sub(r"\s+", " ", text)
142
  # إزالة التكرار الزائد للحروف (مثل جذرياا -> جذريا)
143
+ text = re2.sub(r'(\p{L})\1{2,}', r'\1', text)
144
+ text = re2.sub(r'(\p{L})\1', r'\1', text)
145
  return text.strip()
146
 
147
  def arabic_ocr_fixes(text: str) -> str:
 
190
  seen, out = set(), []
191
  for k in candidates:
192
  kk = k.strip()
193
+ if not kk or kk in seen: continue
194
+ if lan == "ar" and kk in AR_STOP: continue
195
+ if len(kk) < 3: continue
196
+ if re2.match(r"^[\p{P}\p{S}]+$", kk): continue
 
 
 
 
197
  seen.add(kk)
198
  out.append(kk)
199
  return out
 
224
  target_len = len(correct.strip())
225
  cand = []
226
  for w in pool:
227
+ if not w: continue
 
228
  w2 = w.strip()
229
+ if w2 == correct.strip(): continue
230
+ if len(w2) < 3 or w2 in AR_STOP: continue
231
+ if re2.match(r"^[\p{P}\p{S}\d_]+$", w2): continue
 
 
 
 
232
  if abs(len(w2) - target_len) <= 3:
233
  cand.append(w2)
 
234
  random.shuffle(cand)
235
  out = []
236
  for w in cand:
237
  out.append(w)
238
+ if len(out) == k: break
 
 
239
  fillers = ["—", "— —", "—-"]
240
  while len(out) < k:
241
  out.append(random.choice(fillers))
 
245
  sentences = split_sentences(text)
246
  if not sentences:
247
  raise ValueError("النص قصير جدًا أو غير صالح لتوليد أسئلة.")
 
248
  keywords = top_keywords_yake(text, max_k=160, lan=lang)
249
  if not keywords:
250
  toks = re2.findall(r"[\p{L}\p{N}_]+", text)
 
253
  for t in toks:
254
  freq[t] = freq.get(t, 0) + 1
255
  keywords = [w for w, c in sorted(freq.items(), key=lambda x: -x[1])][:80]
 
256
  sent_for_kw = {}
257
  for s in sentences:
258
  for kw in keywords:
259
+ if not _is_good_kw(kw): continue
 
260
  if re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for_kw:
261
  sent_for_kw[kw] = s
 
262
  items: List[MCQ] = []
263
  used_sents = set()
264
  pool_iter = [kw for kw in keywords if kw in sent_for_kw]
 
265
  for kw in pool_iter:
266
+ if len(items) >= n: break
267
+ if not _is_good_kw(kw): continue
 
 
268
  s = sent_for_kw[kw]
269
+ if s in used_sents: continue
 
270
  blanked = re2.sub(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", "_____", s, count=1)
271
  correct = kw
272
  distractors = build_distractors(correct, [x for x in keywords if x != kw], k=3)
 
274
  random.shuffle(choices)
275
  ans_idx = choices.index(correct)
276
  exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
277
+ items.append(MCQ(id=str(uuid.uuid4())[:8], question=blanked, choices=choices, answer_index=ans_idx, explanation=exp))
 
 
 
 
 
 
278
  used_sents.add(s)
 
279
  if not items:
280
  raise RuntimeError("تعذر توليد أسئلة من النص. جرّب نصاً أطول أو مختلفاً.")
281
  return items
 
287
  EN_PUNCT = ",;?"
288
 
289
  def normalize_punct(s: str) -> str:
290
+ if not s: return ""
 
291
  s = s.replace(",", "،").replace(";", "؛").replace("?", "؟")
292
  return s.strip().strip(AR_PUNCT + EN_PUNCT).strip()
293
 
294
  def is_bad_choice(txt: str) -> bool:
295
+ if not txt: return True
 
296
  txt = txt.strip()
297
  BAD_NOISE = {"وهنا","اليه","الي","ليبق","لان","لانها","لانّه","ذلك","هذا","هذه"}
298
+ if txt in BAD_NOISE: return True
299
+ if len(txt) > 18 and " " not in txt: return True
300
+ if len(txt) < 2: return True
301
+ if txt in AR_STOP: return True
302
+ if re2.match(r"^[\p{P}\p{S}]+$", txt): return True
 
 
 
 
 
303
  return False
304
 
305
  def build_json_records(items: List[MCQ], lang: str, source_pdf: str, method: str, num_questions: int):
306
  json_data = []
307
  letters = ["A", "B", "C", "D"]
308
  for it in items:
309
+ opts, seen = [], set()
 
310
  for idx, lbl in enumerate(letters):
311
  raw = it.choices[idx] if idx < len(it.choices) else ""
312
  txt = normalize_punct(raw)
313
+ if is_bad_choice(txt): txt = "—"
314
+ if txt in seen: txt += " "
 
 
315
  seen.add(txt)
316
+ opts.append({"id": lbl, "text": txt, "is_correct": (it.answer_index == idx)})
 
 
 
 
317
  q_clean = normalize_punct(it.question)
318
  exp_clean = normalize_punct(it.explanation)
319
  record = {
320
+ "id": it.id, "question": q_clean, "options": opts, "explanation": exp_clean,
321
+ "meta": {"lang": lang, "normalized": True, "source_pdf": source_pdf, "extraction_method": method, "num_questions": int(num_questions)}
 
 
 
 
 
 
 
 
 
322
  }
323
  json_data.append(record)
324
  return json_data
325
 
326
  # =========================
327
+ # 7) دوال تبويب "حلّ الاختبار"
328
+ # =========================
329
+ def _format_question(rec):
330
+ q = rec.get("question","").strip()
331
+ return f"### السؤال:\n{q}"
332
+
333
+ def _radio_choices(rec):
334
+ # يعيد قائمة نصوص مثل "A) ...", "B) ..."
335
+ letters = ["A","B","C","D"]
336
+ out = []
337
+ for opt in rec.get("options", []):
338
+ lid, text = opt.get("id",""), opt.get("text","")
339
+ out.append(f"{lid}) {text}")
340
+ # إذا ناقص خيارات، كمّل لمواءمة المكوّن
341
+ while len(out) < 4:
342
+ out.append(f"{letters[len(out)]}) —")
343
+ return out
344
+
345
+ def _correct_letter(rec):
346
+ for opt in rec.get("options", []):
347
+ if opt.get("is_correct"):
348
+ return opt.get("id","")
349
+ return ""
350
+
351
+ def _explanation(rec):
352
+ return rec.get("explanation","")
353
+
354
def init_quiz_state(records):
    """Build a fresh quiz-state dict around *records*.

    Records are kept in the order given (shuffle them before calling if a
    random order is wanted).
    """
    state = {}
    state["records"] = records
    state["idx"] = 0            # position of the question currently shown
    state["answers"] = {}       # question id -> "A"/"B"/"C"/"D"
    state["revealed"] = set()   # ids whose solution has been shown
    state["finished"] = False
    state["csv_path"] = None
    return state
365
+
366
def render_current(rec, user_choice=None, revealed=False):
    """Compute the display pieces for one question.

    Args:
        rec: question record dict.
        user_choice: letter the user picked for this question, if any.
        revealed: whether the solution for this question has been shown.

    Returns:
        ``(question_markdown, choice_labels, explanation, feedback)``. The
        explanation is empty until the question is revealed. Feedback is
        empty without a user choice, a plain "selected" note before the
        reveal, and a right/wrong verdict after it.
    """
    q_md = _format_question(rec)
    choices = _radio_choices(rec)
    exp = _explanation(rec) if revealed else ""
    feedback = ""
    if user_choice:
        if revealed:
            # The correct letter is only needed to grade a revealed answer.
            correct = _correct_letter(rec)
            feedback = "✅ إجابة صحيحة" if user_choice == correct else f"❌ إجابة خاطئة — الصحيح: {correct}"
        else:
            feedback = f"تم اختيار: {user_choice}"
    return q_md, choices, exp, feedback
379
+
380
def on_start_quiz(json_records):
    """Start (or restart) the quiz and return ``(state, message)``.

    Accepts either a raw list of question records or an already-built quiz
    state dict. The Start button passes the shared quiz state, which is a
    dict; rejecting everything that is not a list would discard the quiz
    that was just loaded.

    Returns ``(None, error_message)`` when no usable questions are found.
    """
    if isinstance(json_records, dict):
        records = json_records.get("records")
        if not records or not isinstance(records, list):
            return None, "لم يتم العثور على أسئلة صالحة."
        # Keep the existing state (answers included) and rewind to question 1.
        json_records["idx"] = 0
        return json_records, "تم بدء الاختبار. بالتوفيق!"
    if not json_records or not isinstance(json_records, list):
        return None, "لم يتم العثور على أسئلة صالحة."
    return init_quiz_state(json_records), "تم بدء الاختبار. بالتوفيق!"
384
+
385
def on_load_json_file(file_path):
    """Load a questions JSON file and return ``(state, message)``.

    The file must contain a non-empty JSON list. On any failure the state is
    ``None`` and the message describes the problem; nothing is raised, since
    this runs as a UI callback.
    """
    if not file_path:
        return None, "لم يتم اختيار ملف."
    try:
        with open(str(file_path), "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError("صيغة JSON غير صحيحة (يجب أن تكون قائمة).")
        if not data:
            # An empty quiz would make the question handlers index into [].
            raise ValueError("ملف JSON لا يحتوي على أي أسئلة.")
        return init_quiz_state(data), "تم تحميل ملف JSON بنجاح. اضغط بدء الاختبار."
    except Exception as e:  # UI boundary: report instead of crashing the app
        return None, f"خطأ في قراءة JSON: {e}"
394
+
395
def on_show_question(state):
    """Return the display tuple for the question at ``state["idx"]``.

    Returns:
        ``(question_markdown, choice_labels, explanation, feedback, position)``
        where position reads like ``"3 / 8"``. When there is no state or no
        records, every element is empty.
    """
    empty = ("", [], "", "", "")
    if not state:
        return empty
    recs = state["records"]
    if not recs:
        return empty
    # Clamp so a stale or out-of-range index can never raise IndexError.
    idx = min(max(state["idx"], 0), len(recs) - 1)
    rec = recs[idx]
    q_md, choices, exp, feedback = render_current(
        rec,
        user_choice=state["answers"].get(rec["id"]),
        revealed=(rec["id"] in state["revealed"]),
    )
    pos = f"{idx+1} / {len(recs)}"
    return q_md, choices, exp, feedback, pos
406
+
407
def on_select_choice(state, choice_label):
    """Record the user's pick for the current question; return ``(state, feedback)``.

    *choice_label* looks like ``"A) text"``; the part before the first ``)``
    is stored as the chosen letter. If the question was already revealed the
    feedback is a right/wrong verdict, otherwise a plain "selected" note.
    """
    if not (state and choice_label):
        return state, ""
    rec = state["records"][state["idx"]]
    qid = rec["id"]
    letter, _, _ = choice_label.partition(")")
    letter = letter.strip()
    state["answers"][qid] = letter
    if qid not in state["revealed"]:
        return state, f"تم اختيار: {letter}"
    # Already revealed: regenerate the verdict for the new pick.
    correct = _correct_letter(rec)
    if letter == correct:
        return state, "✅ إجابة صحيحة"
    return state, f"❌ إجابة خاطئة — الصحيح: {correct}"
420
+
421
def on_prev(state):
    """Move to the previous question (never below index 0) and return the state."""
    if state:
        previous = state["idx"] - 1
        state["idx"] = previous if previous > 0 else 0
    return state
425
+
426
def on_next(state):
    """Move to the next question (never past the last one) and return the state.

    With an empty record list the index stays at 0; it no longer becomes -1.
    """
    if not state:
        return state
    last = max(0, len(state["records"]) - 1)
    state["idx"] = min(last, state["idx"] + 1)
    return state
430
+
431
def on_reveal(state):
    """Mark the current question as revealed; return ``(state, feedback)``.

    Feedback is a right/wrong verdict when the user has answered, otherwise
    just the correct letter.
    """
    if not state:
        return state, ""
    rec = state["records"][state["idx"]]
    qid = rec["id"]
    state["revealed"].add(qid)
    picked = state["answers"].get(qid)
    correct = _correct_letter(rec)
    if picked == correct:
        fb = "✅ إجابة صحيحة"
    elif picked:
        fb = f"❌ إجابة خاطئة — الصحيح: {correct}"
    else:
        fb = f"الصحيح: {correct}"
    return state, fb
439
+
440
def on_finish(state):
    """Grade the quiz, write a results CSV, and return ``(state, score, csv_path)``.

    Each record is counted as correct, wrong, or skipped (no stored answer).
    One CSV row per question is written to ``results.csv`` in a new temporary
    directory; the state is flagged finished and remembers the CSV path.
    Without a state, returns ``(state, "", None)``.
    """
    if not state:
        return state, "", None
    recs = state["records"]
    n_ok = n_bad = n_skip = 0
    rows = []
    for rec in recs:
        user = state["answers"].get(rec["id"])
        correct = _correct_letter(rec)
        is_correct = bool(user) and user == correct
        if user is None:
            n_skip += 1
        elif is_correct:
            n_ok += 1
        else:
            n_bad += 1
        # One CSV row: the question, the four option texts, then the grading.
        opts = {opt["id"]: opt["text"] for opt in rec.get("options", [])}
        row = {"question": rec.get("question","")}
        row.update((letter, opts.get(letter,"")) for letter in ("A", "B", "C", "D"))
        row["user_choice"] = user or ""
        row["correct"] = correct
        row["is_correct"] = bool(is_correct)
        rows.append(row)
    total = len(recs)
    score = f"النتيجة: {n_ok}/{total} (صحيح: {n_ok}، خطأ: {n_bad}، متروك: {n_skip})"
    out_dir = tempfile.mkdtemp(prefix="quiz_")
    csv_path = os.path.join(out_dir, "results.csv")
    pd.DataFrame(rows).to_csv(csv_path, index=False, encoding="utf-8-sig")
    state["finished"] = True
    state["csv_path"] = csv_path
    return state, score, csv_path
474
+
475
def on_reset():
    """Return the eight cleared output values used by the reset button.

    Order: a ``None`` state, five empty strings, ``None`` for the results
    file, and the confirmation message.
    """
    blanks = ("",) * 5
    return (None, *blanks, None, "تمت إعادة الضبط.")
477
+
478
+ # =========================
479
+ # 8) التبويب الأول: توليد الأسئلة (PDF/TXT → JSON)
480
  # =========================
481
  def process_pdf(pdf_file_path,
482
  num_questions=DEFAULT_NUM_QUESTIONS,
 
488
  if not pdf_file_path:
489
  return {}, None, "يرجى رفع ملف PDF/TXT أولاً."
490
 
 
491
  src_path = str(pdf_file_path)
492
  name_guess = getattr(pdf_file_path, "name", "") if hasattr(pdf_file_path, "name") else ""
493
  filename = Path(name_guess).name or Path(src_path).name or "input"
494
  workdir = tempfile.mkdtemp(prefix="mcq_")
495
 
 
496
  ext = Path(filename).suffix.lower()
497
  if ext not in [".pdf", ".txt"]:
 
498
  ext = ".pdf"
499
  if not Path(filename).suffix:
500
  filename += ext
 
503
  shutil.copy(src_path, local_path)
504
  logs.append(f"تم نسخ الملف إلى: {local_path}")
505
 
506
+ # 1) استخراج النص
507
  if ext == ".txt":
508
  with open(local_path, "r", encoding="utf-8", errors="ignore") as f:
509
  raw_text = f.read()
 
544
  return {}, None, "\n".join(logs)
545
 
546
  # =========================
547
+ # 9) واجهة Gradio (تبويبان)
548
  # =========================
549
  import gradio as gr
550
 
551
with gr.Blocks(title="PDF/TXT → MCQ + Quiz", css="""
body { direction: rtl; font-family: system-ui, 'Cairo', 'IBM Plex Arabic', sans-serif; }
label, .gr-markdown { text-align: right; }
""") as demo:
    gr.Markdown("## مولّد أسئلة + واجهة اختبار تفاعلي")

    # State shared by both tabs; holds the dict built by init_quiz_state(...).
    quiz_state = gr.State(value=None)
    toast = gr.Markdown("")

    with gr.Tabs():
        # --- Tab 1: generate questions ---
        with gr.TabItem("توليد الأسئلة (PDF/TXT JSON)"):
            with gr.Row():
                inp_pdf = gr.File(label="ارفع PDF أو TXT", file_count="single", file_types=[".pdf",".txt"], type="filepath")
                with gr.Column():
                    num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
                    trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="دقة تحويل PDF لصور (Zoom)")
                    trocr_model = gr.Dropdown(
                        choices=[
                            "microsoft/trocr-base-printed",
                            "microsoft/trocr-large-printed",
                            "microsoft/trocr-base-handwritten",
                            "microsoft/trocr-large-handwritten",
                        ],
                        value=DEFAULT_TROCR_MODEL, label="موديل TrOCR (للـ PDF المصوّر)"
                    )
            btn_gen = gr.Button("تشغيل المعالجة", variant="primary")
            out_json = gr.JSON(label="النتيجة (JSON)")
            out_file = gr.File(label="تحميل mcqs.json")
            out_log = gr.Textbox(label="Logs", lines=10)
            btn_send_to_quiz = gr.Button("إرسال الأسئلة إلى تبويب الاختبار")

            btn_gen.click(
                fn=process_pdf,
                inputs=[inp_pdf, num_q, gr.State(DEFAULT_LANG), trocr_model, trocr_zoom],
                outputs=[out_json, out_file, out_log]
            )

            # Hand the generated records straight to the quiz tab.
            def _send_to_quiz(records):
                if not records:
                    return None, "لا يوجد أسئلة لإرسالها."
                return init_quiz_state(records), "تم إرسال الأسئلة إلى تبويب الاختبار. افتحه واضغط 'إظهار السؤال'."
            btn_send_to_quiz.click(_send_to_quiz, inputs=[out_json], outputs=[quiz_state, toast])

        # --- Tab 2: take the quiz ---
        with gr.TabItem("حلّ الاختبار (Quiz)"):
            gr.Markdown("### 1) حمّل JSON للأسئلة أو استخدم زر الإرسال من التبويب الأول")
            json_file = gr.File(label="أو ارفع ملف JSON", file_types=[".json"], type="filepath")
            btn_load_json = gr.Button("تحميل ملف JSON")
            btn_start = gr.Button("بدء الاختبار", variant="primary")

            gr.Markdown("### 2) حل السؤال الحالي")
            q_md = gr.Markdown("")
            choices = gr.Radio(choices=[], label="اختر الإجابة")
            exp_md = gr.Markdown("")
            feedback = gr.Markdown("")
            progress = gr.Label("")

            with gr.Row():
                btn_prev = gr.Button("السابق")
                btn_next = gr.Button("التالي")
                btn_reveal = gr.Button("إظهار الإجابة")
            with gr.Row():
                btn_finish = gr.Button("إنهاء الاختبار", variant="stop")
                btn_reset = gr.Button("إعادة ضبط")

            score_md = gr.Markdown("")
            results_csv = gr.File(label="تحميل نتائج CSV")

            def _show_and_render(state):
                """Render the current question into the quiz widgets."""
                if not state or not state.get("records"):
                    return "", gr.update(choices=[], value=None), "", "", ""
                q_text, labels, exp_text, fb_text, pos = on_show_question(state)
                # Re-select the label matching the stored answer, if any.
                rec = state["records"][state["idx"]]
                picked = state["answers"].get(rec["id"])
                selected = None
                if picked:
                    selected = next(
                        (lbl for lbl in labels if lbl.split(")")[0].strip() == picked),
                        None,
                    )
                # A bare list returned into a Radio output would be treated as
                # its *value*; an update object is needed to replace choices.
                return q_text, gr.update(choices=labels, value=selected), exp_text, fb_text, pos

            render_outputs = [q_md, choices, exp_md, feedback, progress]

            btn_load_json.click(on_load_json_file, inputs=[json_file], outputs=[quiz_state, toast])
            # Chain the render after the state update; two independent click
            # handlers on the same button could read the state before it is written.
            btn_start.click(on_start_quiz, inputs=[quiz_state], outputs=[quiz_state, toast]).then(
                _show_and_render, inputs=[quiz_state], outputs=render_outputs
            )
            btn_prev.click(on_prev, inputs=[quiz_state], outputs=[quiz_state]).then(
                _show_and_render, inputs=[quiz_state], outputs=render_outputs
            )
            btn_next.click(on_next, inputs=[quiz_state], outputs=[quiz_state]).then(
                _show_and_render, inputs=[quiz_state], outputs=render_outputs
            )
            btn_reveal.click(on_reveal, inputs=[quiz_state], outputs=[quiz_state, feedback]).then(
                _show_and_render, inputs=[quiz_state], outputs=render_outputs
            )

            # Answer selection: `input` fires only on user interaction, so the
            # programmatic value updates done while rendering do not re-trigger it.
            def _on_choice(state, choice):
                return on_select_choice(state, choice)
            choices.input(_on_choice, inputs=[quiz_state, choices], outputs=[quiz_state, feedback])

            # Finish
            btn_finish.click(on_finish, inputs=[quiz_state], outputs=[quiz_state, score_md, results_csv])
            # Reset: on_reset clears the eight outputs below; the radio is
            # cleared in a follow-up step because it is not among them.
            btn_reset.click(
                on_reset,
                outputs=[quiz_state, q_md, exp_md, feedback, progress, score_md, results_csv, toast],
            ).then(lambda: gr.update(choices=[], value=None), outputs=[choices])
642
+
643
+ # Spaces تتعرف على demo تلقائيًا
644
  if __name__ == "__main__":
645
  demo.queue().launch()