Leen172 committed on
Commit
ef6a315
·
verified ·
1 Parent(s): a12b206

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +281 -442
app.py CHANGED
@@ -1,14 +1,7 @@
1
- # app.py
2
  # -*- coding: utf-8 -*-
 
3
 
4
- import os
5
- import io
6
- import json
7
- import uuid
8
- import random
9
- import tempfile
10
- import shutil
11
- import unicodedata
12
  from dataclasses import dataclass
13
  from pathlib import Path
14
  from typing import List, Tuple
@@ -18,169 +11,118 @@ from pypdf import PdfReader
18
  import fitz # PyMuPDF
19
  import regex as re2
20
  import yake
 
21
 
22
- # =========================
23
- # إعدادات عامة
24
- # =========================
25
  random.seed(42)
26
  DEFAULT_LANG = "ar"
27
- DEFAULT_NUM_QUESTIONS = 8
28
  DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
29
- DEFAULT_TROCR_ZOOM = 2.8
30
-
31
- # كاش بسيط للـ OCR pipeline (تحميل كسول)
32
- _OCR_PIPE = {}
33
- def _get_ocr_pipeline(model_id: str):
34
- """تحميل كسول + كاش لنموذج TrOCR."""
35
- from transformers import pipeline # استيراد متأخر
36
- import torch # استيراد متأخر
37
- device = 0 if torch.cuda.is_available() else -1
38
- if model_id not in _OCR_PIPE:
39
- _OCR_PIPE[model_id] = pipeline("image-to-text", model=model_id, device=device)
40
- return _OCR_PIPE[model_id]
41
-
42
- # =========================
43
- # 2) استخراج النص من PDF/TXT
44
- # =========================
45
- def extract_text_with_pypdf(pdf_path: str) -> str:
46
- reader = PdfReader(pdf_path)
47
- texts = []
48
- for page in reader.pages:
49
- try:
50
- t = page.extract_text() or ""
51
- except Exception:
52
- t = ""
53
- texts.append(t)
54
- return "\n".join(texts).strip()
55
-
56
- def pdf_pages_to_images(pdf_path: str, zoom: float = 2.5) -> List[Image.Image]:
57
- doc = fitz.open(pdf_path)
58
  imgs = []
59
- mat = fitz.Matrix(zoom, zoom)
60
- for page in doc:
61
- pix = page.get_pixmap(matrix=mat, alpha=False)
62
- img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
63
- imgs.append(img)
64
  doc.close()
65
  return imgs
66
 
67
- def extract_text_with_ocr(pdf_path: str, model_id: str, zoom: float = 2.5) -> str:
68
- ocr = _get_ocr_pipeline(model_id)
69
- images = pdf_pages_to_images(pdf_path, zoom=zoom)
70
- page_texts = []
71
- for idx, img in enumerate(images):
72
  try:
73
  out = ocr(img)
74
- txt = out[0]["generated_text"].strip() if out and "generated_text" in out[0] else ""
75
  except Exception:
76
  txt = ""
77
- page_texts.append(f"--- [Page {idx+1}] ---\n{txt}")
78
- return "\n\n".join(page_texts).strip()
79
-
80
- def is_extraction_good(text: str, min_chars: int = 250, min_alpha_ratio: float = 0.15) -> bool:
81
- if len(text) < min_chars:
82
- return False
83
- alnum = sum(ch.isalnum() for ch in text)
84
- ratio = alnum / max(1, len(text))
85
- return ratio >= min_alpha_ratio
86
-
87
- def pdf_to_text(pdf_path: str,
88
- ocr_model: str = DEFAULT_TROCR_MODEL,
89
- ocr_zoom: float = DEFAULT_TROCR_ZOOM) -> Tuple[str, str]:
90
- """
91
- يرجع (النص النهائي، طريقة الاستخراج) بدون أي حفظ ملفات.
92
- """
93
- assert os.path.isfile(pdf_path), f"File not found: {pdf_path}"
94
- embedded_text = extract_text_with_pypdf(pdf_path)
95
- if is_extraction_good(embedded_text):
96
- return embedded_text, "embedded (pypdf)"
97
- if not ocr_model:
98
- return embedded_text, "embedded (pypdf: weak)"
99
- return extract_text_with_ocr(pdf_path, model_id=ocr_model, zoom=ocr_zoom), "OCR (Hugging Face TrOCR)"
100
-
101
- # =========================
102
- # 3) تطبيع/تصحيح عربي
103
- # =========================
104
- def strip_page_headers(text: str) -> str:
105
- lines = text.splitlines()
106
- out = []
107
- for ln in lines:
108
  if re2.match(r"^\s*--- \[Page \d+\] ---\s*$", ln): continue
109
  if re2.match(r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$", ln): continue
110
  if re2.match(r"^\s*[-–—_*]{3,}\s*$", ln): continue
111
  out.append(ln)
112
  return "\n".join(out)
113
 
114
- AR_DIAC = r"[ًٌٍَُِّْ]"
115
- def normalize_arabic(text: str) -> str:
116
- text = unicodedata.normalize("NFKC", text)
117
- text = re2.sub(r"[ـ]", "", text)
118
- text = re2.sub(AR_DIAC, "", text)
119
- text = re2.sub(r"[إأآا]", "ا", text)
120
- text = re2.sub(r"[يى]", "ي", text)
121
- text = re2.sub(r"\s+", " ", text)
122
- # إزالة تكرار الحروف
123
- text = re2.sub(r'(\p{L})\1{2,}', r'\1', text)
124
- text = re2.sub(r'(\p{L})\1', r'\1', text)
125
- return text.strip()
126
-
127
- def arabic_ocr_fixes(text: str) -> str:
128
- fixes = {
129
- " الصطناعي": " الاصطناعي",
130
- "صطناعي": "اصطناعي",
131
- "الذكاء الاصطناعيي": "الذكاء الاصطناعي",
132
- "ذكاء صطناعي": "ذكاء اصطناعي",
133
- "الذكاء الاصطناعي.": "الذكاء الاصطناعي.",
134
- "التعليم ": "التعليم ",
135
- " مع غني": " غني",
136
- "مع غني ": " غني ",
137
- " غير المشبعة": " غيرُ المشبعة",
138
- "الااصطناعي": "الاصطناعي",
139
- "وشخصياا": "وشخصياً",
140
- }
141
- for wrong, right in fixes.items():
142
- text = text.replace(wrong, right)
143
- return text
144
-
145
- def postprocess_text(raw_text: str, lang: str = "ar") -> str:
146
- t = strip_page_headers(raw_text)
147
- t = t.replace("\r", "\n")
148
  t = re2.sub(r"\n{3,}", "\n\n", t)
149
  t = re2.sub(r"\d+\s*[\[\(][^\]\)]*[\]\)]", " ", t)
150
  t = re2.sub(r"\[\d+\]", " ", t)
151
- if lang == "ar":
152
- t = normalize_arabic(t)
153
- t = arabic_ocr_fixes(t)
154
- return t
155
-
156
- # =========================
157
- # 4) YAKE + تقسيم الجمل
158
- # =========================
159
  SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
160
- AR_STOP = set("""
161
- في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي اللواتي اللواتيا أو أم إن أن كان تكون كانوا كانت كنت كنا كانا كانتِ ثم قد لقد ربما بل لكن لكنَّ إلا سوى حتى حيث كما لما لماّ لماَّ لماً ما ماذا لماذا متى أين كيف أي أيّ أيُّ هناك هنا هناكَ تلك ذلكم ذلكن أولئك هؤلاء هما هن هم أنتِ أنتَ أنتما أنتن أنتم أنا نحن هي هو هنَّ همَّ
162
- و أو كما بين بسبب بدون خلال عبر لدى لدىً حتى حيث ضمن عبره عليها عليه عليهم علي على إلي إليك إليه إليها لديك لديكِ لديه لديها لكم لكنكما لكنكن ولكن
163
- هذا هذه ذلك تلك هؤلاء أولئك كل بعض أي أيّ أيًا أحد شيء شيئًا أشياء
164
- "وهنا","اليه","الي","له","لها","لدي","لديه","لديها","لنا","عنده","عندها","مع","عبر","ضمن","حسب","حيث","كما","قد","بل","لكن","إذ","اذ","اذا","إن","أن","أيضا","فإن","فانه","فإنه","انه","إنه","مثلا","مثلاً","مثلاَ"
165
- """.split())
166
-
167
- def top_keywords_yake(text: str, max_k: int = 120, lan: str = 'ar') -> List[str]:
168
- kw_extractor = yake.KeywordExtractor(lan=lan, n=1, top=max_k)
169
- candidates = [kw for kw, _ in kw_extractor.extract_keywords(text)]
170
- seen, out = set(), []
171
- for k in candidates:
172
- kk = k.strip()
173
- if not kk or kk in seen: continue
174
- if lan == "ar" and kk in AR_STOP: continue
175
- if len(kk) < 3: continue
176
- if re2.match(r"^[\p{P}\p{S}]+$", kk): continue
177
- seen.add(kk)
178
- out.append(kk)
179
  return out
180
 
181
- # =========================
182
- # 5) مُولِّد MCQ
183
- # =========================
184
  @dataclass
185
  class MCQ:
186
  id: str
@@ -189,326 +131,223 @@ class MCQ:
189
  answer_index: int
190
  explanation: str
191
 
192
- def split_sentences(text: str) -> List[str]:
193
- sents = [s.strip() for s in SENT_SPLIT.split(text) if s.strip()]
194
- return [s for s in sents if len(s) >= 25]
195
 
196
- def _is_good_kw(kw: str) -> bool:
197
- if not kw or len(kw) < 3: return False
198
- if kw in AR_STOP: return False
199
- if re2.match(r"^[\p{P}\p{S}\d_]+$", kw): return False
200
- return True
201
-
202
- def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
203
- """ملهيات أقرب طولياً للسياق."""
204
- target_len = len(correct.strip())
205
- cand = []
206
  for w in pool:
207
- if not w: continue
208
- w2 = w.strip()
209
- if w2 == correct.strip(): continue
210
- if len(w2) < 3 or w2 in AR_STOP: continue
211
- if re2.match(r"^[\p{P}\p{S}\d_]+$", w2): continue
212
- if abs(len(w2) - target_len) <= 3:
213
- cand.append(w2)
214
  random.shuffle(cand)
215
- out = []
216
- for w in cand:
217
- out.append(w)
218
- if len(out) == k: break
219
- fillers = ["—", "— —", "—-"]
220
- while len(out) < k:
221
- out.append(random.choice(fillers))
222
  return out
223
 
224
- def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
225
- sentences = split_sentences(text)
226
- if not sentences:
227
- raise ValueError("النص قصير جدًا أو غير صالح لتوليد أسئلة.")
228
- keywords = top_keywords_yake(text, max_k=160, lan=lang)
229
- if not keywords:
230
- toks = re2.findall(r"[\p{L}\p{N}_]+", text)
231
- toks = [t for t in toks if not (lang == "ar" and t in AR_STOP)]
232
- freq = {}
233
- for t in toks:
234
- freq[t] = freq.get(t, 0) + 1
235
- keywords = [w for w, c in sorted(freq.items(), key=lambda x: -x[1])][:80]
236
- sent_for_kw = {}
237
- for s in sentences:
238
- for kw in keywords:
239
- if not _is_good_kw(kw): continue
240
- if re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for_kw:
241
- sent_for_kw[kw] = s
242
- items: List[MCQ] = []
243
- used_sents = set()
244
- pool_iter = [kw for kw in keywords if kw in sent_for_kw]
245
- for kw in pool_iter:
246
- if len(items) >= n: break
247
- if not _is_good_kw(kw): continue
248
- s = sent_for_kw[kw]
249
- if s in used_sents: continue
250
- blanked = re2.sub(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", "_____", s, count=1)
251
- correct = kw
252
- distractors = build_distractors(correct, [x for x in keywords if x != kw], k=3)
253
- choices = distractors + [correct]
254
- random.shuffle(choices)
255
- ans_idx = choices.index(correct)
256
- exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
257
- items.append(MCQ(id=str(uuid.uuid4())[:8], question=blanked, choices=choices, answer_index=ans_idx, explanation=exp))
258
- used_sents.add(s)
259
- if not items:
260
- raise RuntimeError("تعذر توليد أسئلة من النص. جرّب نصاً أطول أو مختلفاً.")
261
  return items
262
 
263
- # =========================
264
- # 6) تحويل عناصر الأسئلة إلى سجلات لواجهة الحلّ
265
- # =========================
266
- AR_PUNCT = "،؛؟"
267
- EN_PUNCT = ",;?"
268
-
269
- def normalize_punct(s: str) -> str:
270
- if not s: return ""
271
- s = s.replace(",", "،").replace(";", "؛").replace("?", "؟")
272
- return s.strip().strip(AR_PUNCT + EN_PUNCT).strip()
273
-
274
- def build_quiz_records(items: List[MCQ], lang: str, source_name: str, method: str, num_questions: int):
275
- json_data = []
276
- letters = ["A", "B", "C", "D"]
277
  for it in items:
278
- opts = []
279
- for idx, lbl in enumerate(letters):
280
- raw = it.choices[idx] if idx < len(it.choices) else ""
281
- txt = normalize_punct(raw) or ""
282
- opts.append({"id": lbl, "text": txt, "is_correct": (it.answer_index == idx)})
283
- q_clean = normalize_punct(it.question)
284
- exp_clean = normalize_punct(it.explanation)
285
- record = {
286
  "id": it.id,
287
- "question": q_clean,
288
  "options": opts,
289
- "explanation": exp_clean,
290
- "meta": {"lang": lang, "source": source_name, "extraction_method": method, "num_questions": int(num_questions)}
291
- }
292
- json_data.append(record)
293
- return json_data
294
-
295
- # =========================
296
- # 7) منطق الاختبار (State + Handlers)
297
- # =========================
298
- def _format_question(rec):
299
- q = rec.get("question","").strip()
300
- return f"### السؤال:\n{q}"
301
-
302
- def _radio_choices(rec):
303
- out = []
304
- for opt in rec.get("options", []):
305
- lid, text = opt.get("id",""), opt.get("text","")
306
- out.append(f"{lid}) {text}")
307
- while len(out) < 4:
308
- letters = ["A","B","C","D"]
309
- out.append(f"{letters[len(out)]}) —")
310
- return out
311
-
312
- def _correct_letter(rec):
313
- for opt in rec.get("options", []):
314
- if opt.get("is_correct"): return opt.get("id","")
315
  return ""
316
 
317
- def _explanation(rec): return rec.get("explanation","")
318
-
319
- def init_quiz_state(records):
320
- return {"records": records, "idx": 0, "answers": {}, "revealed": set(), "finished": False}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
- def render_current(rec, user_choice=None, revealed=False):
323
- q_md = _format_question(rec)
324
- choices = _radio_choices(rec)
325
- exp = _explanation(rec) if revealed else ""
326
- correct = _correct_letter(rec)
327
- if user_choice and revealed:
328
- feedback = "✅ إجابة صحيحة" if user_choice == correct else f"❌ إجابة خاطئة — الصحيح: {correct}"
329
- elif user_choice:
330
- feedback = f"تم اختيار: {user_choice}"
331
- else:
332
- feedback = ""
333
- return q_md, choices, exp, feedback
334
-
335
- def on_show_question(state):
336
- if not state: return "", [], "", "",""
337
- recs, idx = state["records"], state["idx"]
338
- rec = recs[idx]
339
- q_md, choices, exp, feedback = render_current(rec, user_choice=state["answers"].get(rec["id"]),
340
- revealed=(rec["id"] in state["revealed"]))
341
- pos = f"{idx+1} / {len(recs)}"
342
- return q_md, choices, exp, feedback, pos
343
-
344
- def on_select_choice(state, choice_label):
345
- if not state or not choice_label: return state, ""
346
  rec = state["records"][state["idx"]]
347
- chosen_letter = choice_label.split(")")[0].strip()
348
- state["answers"][rec["id"]] = chosen_letter
349
  if rec["id"] in state["revealed"]:
350
- correct = _correct_letter(rec)
351
- fb = "✅ إجابة صحيحة" if chosen_letter == correct else f"❌ إجابة خاطئة — الصحيح: {correct}"
352
  else:
353
- fb = f"تم اختيار: {chosen_letter}"
354
- return state, fb
355
-
356
- def on_prev(state):
357
- if not state: return state
358
- state["idx"] = max(0, state["idx"]-1)
359
- return state
360
-
361
- def on_next(state):
362
- if not state: return state
363
- state["idx"] = min(len(state["records"])-1, state["idx"]+1)
364
- return state
365
-
366
- def on_reveal(state):
367
- if not state: return state, ""
368
- rec = state["records"][state["idx"]]
369
- state["revealed"].add(rec["id"])
370
- user = state["answers"].get(rec["id"])
371
- correct = _correct_letter(rec)
372
- fb = "✅ إجابة صحيحة" if user == correct else (f"❌ إجابة خاطئة — الصحيح: {correct}" if user else f"الصحيح: {correct}")
373
  return state, fb
374
 
375
- def on_finish(state):
376
- if not state: return state, ""
377
- recs = state["records"]
378
- correct_count, wrong_count, skipped = 0,0,0
379
- for rec in recs:
380
- qid = rec["id"]
381
- user = state["answers"].get(qid)
382
- correct = _correct_letter(rec)
383
- if user is None: skipped += 1
384
- elif user == correct: correct_count += 1
385
- else: wrong_count += 1
386
- total = len(recs)
387
- score = f"النتيجة: {correct_count}/{total} (صحيح: {correct_count}، خطأ: {wrong_count}، متروك: {skipped})"
388
- state["finished"] = True
389
- return state, score
390
-
391
- def on_reset():
392
- return None, "", "", "", "", "", "تمت إعادة الضبط."
393
-
394
- # =========================
395
- # 8) معالجة الملف وبناء الأسئلة (بدون أي ملفات ناتجة)
396
- # =========================
397
- def process_input_file(uploaded_path,
398
- num_questions=DEFAULT_NUM_QUESTIONS,
399
- lang=DEFAULT_LANG,
400
- trocr_model=DEFAULT_TROCR_MODEL,
401
- trocr_zoom=DEFAULT_TROCR_ZOOM):
402
- if not uploaded_path:
403
- return None, "يرجى رفع ملف PDF/TXT أولاً."
404
- src_path = str(uploaded_path)
405
- filename = Path(src_path).name or "input"
406
- ext = Path(filename).suffix.lower()
407
- if ext not in [".pdf", ".txt"]:
408
- return None, "الرجاء رفع PDF أو TXT فقط."
409
-
410
- # قراءة النص
411
- if ext == ".txt":
412
- with open(src_path, "r", encoding="utf-8", errors="ignore") as f:
413
- raw_text = f.read()
414
- method = "plain text (no PDF)"
415
  else:
416
- raw_text, method = pdf_to_text(src_path, ocr_model=trocr_model, ocr_zoom=float(trocr_zoom))
417
-
418
- cleaned_text = postprocess_text(raw_text, lang=lang)
419
- items = make_mcqs_from_text(cleaned_text, n=int(num_questions), lang=lang)
420
- records = build_quiz_records(items, lang=lang, source_name=filename, method=method, num_questions=num_questions)
421
- return init_quiz_state(records), f"تم توليد {len(records)} سؤالاً. بالتوفيق!"
422
-
423
- # =========================
424
- # 9) واجهة Gradio (تبويب واحد)
425
- # =========================
426
- import gradio as gr
427
-
428
- THEME_CSS = """
429
- body { direction: rtl; font-family: system-ui, 'Cairo', 'IBM Plex Arabic', sans-serif; }
430
- label, .gr-markdown, .gr-button { text-align: right; }
431
- .gradio-container { max-width: 880px; margin: auto; }
432
- .card { background: #fff; border-radius: 1rem; padding: 1rem 1.2rem; box-shadow: 0 10px 25px rgba(0,0,0,0.06); }
433
- .small { opacity: .85; font-size: .9rem; }
434
- .progress { text-align: left; opacity:.75 }
 
 
435
  """
436
 
437
- with gr.Blocks(title="اختبار من ملف (PDF/TXT)", css=THEME_CSS) as demo:
438
- gr.Markdown("## صانع اختبار من ملف PDF/TXT — واجهة واحدة بسيطة")
439
- gr.Markdown("ارفع ملفك، حدّد عدد الأسئلة، واضغط **ابدأ**. ثمّ أجب وتحقق من الإجابة.")
440
-
441
- quiz_state = gr.State(value=None)
442
- toast = gr.Markdown("")
443
-
444
- with gr.Row():
445
- inp_file = gr.File(label="ارفع ملف PDF أو TXT", file_count="single", file_types=[".pdf",".txt"], type="filepath")
446
- num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
447
- with gr.Accordion("خيارات متقدمة (للـ PDF المصوّر)", open=False):
448
- trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="Zoom لتحويل الصفحات لصورة (OCR)")
449
- trocr_model = gr.Dropdown(
450
- choices=[
451
- "microsoft/trocr-base-printed",
452
- "microsoft/trocr-large-printed",
453
- "microsoft/trocr-base-handwritten",
454
- "microsoft/trocr-large-handwritten",
455
- ],
456
- value=DEFAULT_TROCR_MODEL, label="نموذج TrOCR"
457
- )
458
-
459
- btn_start = gr.Button("ابدأ توليد الاختبار", variant="primary")
460
-
461
- with gr.Group():
 
 
 
 
 
 
 
 
462
  with gr.Row():
463
  progress = gr.Label("", elem_classes=["progress"])
464
- q_md = gr.Markdown("", elem_classes=["card"])
465
- choices = gr.Radio(choices=[], label="اختر الإجابة", interactive=True)
466
- feedback = gr.Markdown("")
467
- exp_md = gr.Markdown("")
 
 
468
  with gr.Row():
469
  btn_prev = gr.Button("السابق")
470
  btn_next = gr.Button("التالي")
471
  btn_reveal = gr.Button("إظهار الإجابة")
472
- btn_finish = gr.Button("إنهاء الاختبار", variant="stop")
473
- btn_reset = gr.Button("إعادة ضبط")
474
-
475
- score_md = gr.Markdown("")
476
-
477
- # بدء المعالجة وبناء الأسئلة
478
- btn_start.click(
479
- process_input_file,
480
- inputs=[inp_file, num_q, gr.State(DEFAULT_LANG), trocr_model, trocr_zoom],
481
- outputs=[quiz_state, toast]
482
- ).then(
483
- on_show_question, inputs=[quiz_state],
484
- outputs=[q_md, choices, exp_md, feedback, progress]
485
- )
486
-
487
- # التنقل
488
- btn_prev.click(on_prev, inputs=[quiz_state], outputs=[quiz_state]).then(
489
- on_show_question, inputs=[quiz_state],
490
- outputs=[q_md, choices, exp_md, feedback, progress]
491
- )
492
- btn_next.click(on_next, inputs=[quiz_state], outputs=[quiz_state]).then(
493
- on_show_question, inputs=[quiz_state],
494
- outputs=[q_md, choices, exp_md, feedback, progress]
495
- )
496
- btn_reveal.click(on_reveal, inputs=[quiz_state], outputs=[quiz_state, feedback]).then(
497
- on_show_question, inputs=[quiz_state],
498
- outputs=[q_md, choices, exp_md, feedback, progress]
499
- )
500
-
501
- # اختيار الإجابة
502
- def _on_choice(state, choice):
503
- return on_select_choice(state, choice)
504
- choices.change(_on_choice, inputs=[quiz_state, choices], outputs=[quiz_state, feedback])
505
-
506
- # إنهاء وإظهار نتيجة
507
- btn_finish.click(on_finish, inputs=[quiz_state], outputs=[quiz_state, score_md])
508
-
509
- # إعادة ضبط
510
- btn_reset.click(lambda: on_reset(), outputs=[quiz_state, q_md, choices, exp_md, feedback, score_md, toast])
511
-
512
- # Spaces تتعرف على demo تلقائيًا
513
  if __name__ == "__main__":
514
  demo.queue().launch()
 
 
1
  # -*- coding: utf-8 -*-
2
+ # app.py — واجهة واحدة: توليد أسئلة ➜ اختبار تفاعلي بنفس الثيم
3
 
4
+ import os, json, uuid, random, unicodedata
 
 
 
 
 
 
 
5
  from dataclasses import dataclass
6
  from pathlib import Path
7
  from typing import List, Tuple
 
11
  import fitz # PyMuPDF
12
  import regex as re2
13
  import yake
14
+ import gradio as gr
15
 
16
+ # ---------- إعدادات عامة ----------
 
 
17
  random.seed(42)
18
  DEFAULT_LANG = "ar"
19
+ DEFAULT_NUM_QUESTIONS = 6
20
  DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
21
+ DEFAULT_TROCR_ZOOM = 2.6
22
+
23
+ # ---------- OCR (تحميل كسول) ----------
24
_OCR = {}


def get_ocr(model_id: str):
    """Return the ``image-to-text`` pipeline for ``model_id``, loading it once.

    Pipelines are cached in the module-level ``_OCR`` dict keyed by model id.
    On a cache hit the function returns immediately, without importing
    ``transformers``/``torch`` or probing CUDA again.

    Args:
        model_id: Model identifier passed to ``transformers.pipeline``.

    Returns:
        The cached (or freshly created) pipeline object.
    """
    if model_id in _OCR:
        return _OCR[model_id]

    # Deferred imports: only paid for the first time a model is requested.
    from transformers import pipeline
    import torch

    # Device index 0 when CUDA is available, otherwise -1.
    dev = 0 if torch.cuda.is_available() else -1
    _OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
    return _OCR[model_id]
32
+
33
+ # ---------- PDF/TXT → نص ----------
34
def extract_text_with_pypdf(path: str) -> str:
    """Return the embedded text of every page of a PDF, joined by newlines.

    A page whose extraction raises contributes an empty string, so one bad
    page does not abort the whole document (best-effort by design).
    """

    def _page_text(page) -> str:
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    pages = PdfReader(path).pages
    return "\n".join(_page_text(pg) for pg in pages).strip()
42
+
43
def pdf_to_images(path: str, zoom: float = 2.5) -> List[Image.Image]:
    """Render each page of a PDF to an RGB PIL image.

    Args:
        path: Path of the PDF file to open with PyMuPDF.
        zoom: Scale factor applied on both axes when rasterising a page.

    Returns:
        One ``Image.Image`` per page, in page order.
    """
    doc = fitz.open(path)
    try:
        matrix = fitz.Matrix(zoom, zoom)
        imgs = []
        for pg in doc:
            pix = pg.get_pixmap(matrix=matrix, alpha=False)
            imgs.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return imgs
    finally:
        # Close the document even when rendering a page raises.
        doc.close()
51
 
52
def extract_text_with_ocr(path: str, model_id: str, zoom: float) -> str:
    """OCR every page of a PDF and return the text with per-page headers.

    Each page is emitted as ``--- [Page N] ---`` followed by its recognised
    text; a page whose recognition raises yields an empty text section.
    """
    recognizer = get_ocr(model_id)
    sections = []
    page_no = 0
    for image in pdf_to_images(path, zoom=zoom):
        page_no += 1
        try:
            result = recognizer(image)
            text = result[0].get("generated_text", "").strip() if result else ""
        except Exception:
            text = ""
        sections.append(f"--- [Page {page_no}] ---\n{text}")
    return "\n\n".join(sections).strip()
63
+
64
def is_good(t: str, min_chars=250, min_alpha=0.15) -> bool:
    """Tell whether extracted text looks usable.

    The text must have at least ``min_chars`` characters and the share of
    alphanumeric characters must be at least ``min_alpha``.
    """
    size = len(t)
    if size < min_chars:
        return False
    alnum_count = len([ch for ch in t if ch.isalnum()])
    return alnum_count / max(1, size) >= min_alpha
68
+
69
def file_to_text(path: str, model_id=DEFAULT_TROCR_MODEL, zoom=DEFAULT_TROCR_ZOOM) -> Tuple[str, str]:
    """Read a ``.txt`` or PDF file and return ``(text, extraction_method)``.

    ``.txt`` files are read as UTF-8 (undecodable bytes are ignored). Any
    other file is treated as a PDF: the embedded text is used when
    ``is_good`` accepts it, otherwise the pages are OCR'd.

    Args:
        path: Path of the input file.
        model_id: OCR model id; when falsy, OCR is skipped and the (weak)
            embedded text is returned instead of failing in model loading.
        zoom: Rasterisation zoom forwarded to the OCR step.

    Returns:
        The extracted text and a short label describing how it was obtained.
    """
    if Path(path).suffix.lower() == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read(), "plain text"

    raw = extract_text_with_pypdf(path)
    if is_good(raw):
        return raw, "embedded (pypdf)"
    if not model_id:
        # No OCR model configured: fall back to whatever pypdf produced.
        return raw, "embedded (pypdf: weak)"
    return extract_text_with_ocr(path, model_id, zoom), "OCR (TrOCR)"
76
+
77
+ # ---------- تنظيف عربي ----------
78
+ AR_DIAC = r"[ًٌٍَُِّْ]"
79
def strip_headers(t: str) -> str:
    """Drop page-marker and separator lines, keeping every other line.

    Removed lines are: the ``--- [Page N] ---`` markers, bare
    ``Page N`` / ``صفحة N`` lines, and lines made only of three or more
    dash/underscore/asterisk characters.
    """
    noise_patterns = (
        r"^\s*--- \[Page \d+\] ---\s*$",
        r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$",
        r"^\s*[-–—_*]{3,}\s*$",
    )
    kept = [
        ln
        for ln in t.splitlines()
        if not any(re2.match(pattern, ln) for pattern in noise_patterns)
    ]
    return "\n".join(kept)
87
 
88
def norm_ar(t: str) -> str:
    """Normalise Arabic text for keyword matching.

    Steps, in order: Unicode NFKC normalisation, removal of the tatweel
    character and of the marks in ``AR_DIAC``, folding of the hamza/madda
    alef forms to bare alef and of alef maqsura to yeh, collapsing of all
    whitespace runs to one space, and collapsing of every run of a repeated
    letter to a single letter.

    Returns:
        The normalised text with leading/trailing whitespace removed.
    """
    t = unicodedata.normalize("NFKC", t)
    t = re2.sub(r"[ـ]", "", t)
    t = re2.sub(AR_DIAC, "", t)
    t = re2.sub(r"[إأآ]", "ا", t)
    t = re2.sub(r"[يى]", "ي", t)
    t = re2.sub(r"\s+", " ", t)
    # One pass replaces the former "3+ repeats" and "exactly 2 repeats" passes.
    # NOTE(review): this also collapses legitimately doubled letters
    # (e.g. "book" -> "bok"); confirm that is acceptable for non-OCR input.
    t = re2.sub(r"(\p{L})\1+", r"\1", t)
    return t.strip()
98
+
99
def postprocess(raw: str) -> str:
    """Clean raw extracted text and return its normalised form.

    Page headers are stripped, carriage returns become newlines, runs of
    three or more newlines shrink to two, numeric reference markers are
    blanked out, and the result goes through ``norm_ar``.
    """
    body = strip_headers(raw)
    body = body.replace("\r", "\n")
    substitutions = (
        (r"\n{3,}", "\n\n"),
        (r"\d+\s*[\[\(][^\]\)]*[\]\)]", " "),
        (r"\[\d+\]", " "),
    )
    for pattern, replacement in substitutions:
        body = re2.sub(pattern, replacement, body)
    return norm_ar(body)
105
+
106
+ # ---------- YAKE + تقسيم ----------
 
 
 
 
 
107
  SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
108
# Arabic stop words. Keywords are extracted from text that norm_ar() has
# already folded (hamza/madda alef -> bare alef, alef maqsura -> yeh), so the
# set holds both the original spellings and their folded forms; otherwise
# words such as "إلى" or "على" would never match the normalised text.
_AR_STOP_RAW = """في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split()
_AR_FOLD = str.maketrans({"إ": "ا", "أ": "ا", "آ": "ا", "ى": "ي"})
AR_STOP = set(_AR_STOP_RAW) | {w.translate(_AR_FOLD) for w in _AR_STOP_RAW}
109
+
110
def split_sents(t: str) -> List[str]:
    """Split text with ``SENT_SPLIT`` and keep sentences of 25+ characters.

    Fragments are stripped first; empty fragments are discarded.
    """
    sentences = []
    for fragment in SENT_SPLIT.split(t):
        fragment = fragment.strip()
        if fragment and len(fragment) >= 25:
            sentences.append(fragment)
    return sentences
113
+
114
def yake_keywords(t: str, k: int = 160) -> List[str]:
    """Extract up to ``k`` single-word keyword candidates with YAKE.

    Candidates are stripped and de-duplicated in extractor order; stop words
    (``AR_STOP``), entries shorter than three characters, and entries made
    only of punctuation/symbols are dropped.

    Args:
        t: Text to analyse.
        k: Maximum number of candidates requested from the extractor.

    Returns:
        The filtered keywords, in the order the extractor produced them.
    """
    extractor = yake.KeywordExtractor(lan='ar', n=1, top=k)
    out = []
    seen = set()
    # The loop variable is deliberately not named ``k`` (it used to shadow
    # the parameter).
    for cand, _score in extractor.extract_keywords(t):
        word = cand.strip()
        if not word or word in seen or word in AR_STOP:
            continue
        if len(word) < 3 or re2.match(r"^[\p{P}\p{S}]+$", word):
            continue
        seen.add(word)
        out.append(word)
    return out
124
 
125
+ # ---------- مولّد MCQ ----------
 
 
126
  @dataclass
127
  class MCQ:
128
  id: str
 
131
  answer_index: int
132
  explanation: str
133
 
134
def good_kw(kw: str) -> bool:
    """Return True when ``kw`` is usable as a quiz answer.

    A keyword qualifies when it is non-empty, at least three characters
    long, not in ``AR_STOP``, and not made solely of punctuation, symbols,
    digits or underscores. The result is always a real ``bool`` (the bare
    ``and`` chain used to leak ``""``/``None`` for falsy input).
    """
    if not kw or len(kw) < 3 or kw in AR_STOP:
        return False
    return re2.match(r"^[\p{P}\p{S}\d_]+$", kw) is None
 
136
 
137
def distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
    """Pick ``k`` wrong options whose length is close to the correct answer.

    Candidates come from ``pool``: stripped, de-duplicated, different from
    the (stripped) correct answer, not in ``AR_STOP``, not purely
    punctuation/symbols/digits, and within three characters of the correct
    answer's length. They are shuffled with the module-level ``random``
    state; missing slots are padded with ``"—"``.

    Args:
        correct: The right answer.
        pool: Candidate words; ``None``/empty entries are ignored.
        k: Number of distractors to return.

    Returns:
        A list of exactly ``k`` strings (for ``k >= 0``).
    """
    target = correct.strip()
    cand = []
    seen = set()
    for w in pool:
        w = (w or "").strip()
        # Compare against the stripped answer so " word " never slips in.
        if not w or w == target or w in seen or w in AR_STOP:
            continue
        if re2.match(r"^[\p{P}\p{S}\d_]+$", w):
            continue
        if abs(len(w) - len(target)) <= 3:
            seen.add(w)
            cand.append(w)
    random.shuffle(cand)
    out = cand[:k]
    out.extend(["—"] * (k - len(out)))
    return out
148
 
149
def make_mcqs(text: str, n: int = 6) -> "List[MCQ]":
    """Build up to ``n`` fill-in-the-blank questions from ``text``.

    Keywords come from ``yake_keywords``; when that yields nothing, the 80
    most frequent word tokens are used instead. Each usable keyword is paired
    with the first sentence containing it as a whole word, that occurrence is
    blanked with ``_____``, and three distractors are added. A sentence is
    used for at most one question.

    Args:
        text: Cleaned source text.
        n: Maximum number of questions.

    Returns:
        The generated ``MCQ`` items (at least one).

    Raises:
        ValueError: No sentence of usable length was found.
        RuntimeError: No question could be generated.
    """
    from collections import Counter

    sents = split_sents(text)
    if not sents:
        raise ValueError("النص قصير أو غير صالح.")

    kws = yake_keywords(text)
    if not kws:
        # Frequency fallback: count whole tokens once (not substrings) so the
        # top-80 list holds 80 distinct words.
        tokens = re2.findall(r"[\p{L}\p{N}_]+", text)
        kws = [w for w, _ in Counter(tokens).most_common(80)]

    # keyword -> (first sentence containing it, compiled whole-word pattern)
    sent_for = {}
    for kw in kws:
        if kw in sent_for or not good_kw(kw):
            continue
        pattern = re2.compile(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})")
        for s in sents:
            if pattern.search(s):
                sent_for[kw] = (s, pattern)
                break

    items = []
    used = set()
    for kw in kws:
        if len(items) >= n:
            break
        if kw not in sent_for:
            continue
        s, pattern = sent_for[kw]
        if s in used:
            continue
        q = pattern.sub("_____", s, count=1)
        ch = distractors(kw, [x for x in kws if x != kw], 3) + [kw]
        random.shuffle(ch)
        ans = ch.index(kw)
        exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans, explanation=exp))
        used.add(s)

    if not items:
        raise RuntimeError("تعذّر توليد أسئلة. جرّب نصاً أطول.")
    return items
171
 
172
+ # ---------- تحويل للسجلات ----------
173
def to_records(items: "List[MCQ]", source: str, method: str, n: int) -> List[dict]:
    """Convert MCQ items into plain dict records for the quiz UI.

    Every record has exactly four options labelled A-D. Option text is
    stripped, its Latin ``, ? ;`` are swapped for the Arabic marks, and an
    empty option is shown as ``"—"``. Question and explanation are stripped.
    """
    arabic_marks = str.maketrans({",": "،", "?": "؟", ";": "؛"})
    labels = ("A", "B", "C", "D")

    def _option(item, position, label):
        raw = item.choices[position] if position < len(item.choices) else ""
        shown = raw.strip().translate(arabic_marks)
        return {
            "id": label,
            "text": shown or "—",
            "is_correct": (position == item.answer_index),
        }

    return [
        {
            "id": item.id,
            "question": item.question.strip(),
            "options": [_option(item, pos, lbl) for pos, lbl in enumerate(labels)],
            "explanation": item.explanation.strip(),
            "meta": {"source": source, "extraction_method": method, "num_questions": int(n)},
        }
        for item in items
    ]
189
+
190
+ # ---------- منطق الاختبار ----------
191
def correct_letter(rec):
    """Return the id of the first option flagged ``is_correct``, or ``""``."""
    return next((opt["id"] for opt in rec["options"] if opt["is_correct"]), "")
195
 
196
def init_state(records):
    """Create a fresh quiz state positioned on the first record."""
    state = dict(records=records, idx=0)
    state["answers"] = {}      # record id -> chosen letter
    state["revealed"] = set()  # ids of records whose answer was shown
    state["finished"] = False
    return state
198
+
199
def render(rec, user=None, revealed=False):
    """Build the display pieces for one record.

    Returns ``(question_markdown, option_labels, explanation, feedback)``.
    The explanation is only included once the answer is revealed; feedback
    reports correctness after reveal, or just echoes the selection before.
    """
    question_md = f"### السؤال\n{rec['question']}"
    labels = []
    for opt in rec["options"]:
        labels.append(f"{opt['id']}) {opt['text']}")
    explanation = rec["explanation"] if revealed else ""

    if not user:
        feedback = ""
    elif not revealed:
        feedback = f"تم اختيار: {user}"
    elif user == correct_letter(rec):
        feedback = "✅ إجابة صحيحة"
    else:
        feedback = f"❌ إجابة خاطئة — الصحيح: {correct_letter(rec)}"
    return question_md, labels, explanation, feedback
209
+
210
def show(state):
    """Produce the UI values for the question the state currently points at.

    Returns ``(question_markdown, radio_update, explanation, feedback,
    position)``. The radio entry is a ``gr.update`` carrying the option
    labels as *choices* and the previously selected label (if any) as
    *value*; a bare list would be taken as the radio's value and leave its
    choices unchanged.
    """
    if not state:
        return "", gr.update(choices=[], value=None), "", "", ""
    rec = state["records"][state["idx"]]
    picked = state["answers"].get(rec["id"])
    q, ch, exp, fb = render(rec, picked, rec["id"] in state["revealed"])
    # Re-select the stored answer: labels look like "A) text".
    selected = next((lbl for lbl in ch if lbl.split(")")[0].strip() == picked), None)
    pos = f"{state['idx']+1} / {len(state['records'])}"
    return q, gr.update(choices=ch, value=selected), exp, fb, pos
216
 
217
def choose(state, label):
    """Record the selected option for the current question.

    ``label`` is a radio label of the form ``"A) text"``; the part before
    the first ``)`` is stored as the answer. Returns ``(state, feedback)``:
    correctness feedback when the answer was already revealed, otherwise an
    echo of the chosen letter.
    """
    if not (state and label):
        return state, ""
    rec = state["records"][state["idx"]]
    qid = rec["id"]
    letter = label.split(")")[0].strip()
    state["answers"][qid] = letter
    if qid not in state["revealed"]:
        return state, f"تم اختيار: {letter}"
    if letter == correct_letter(rec):
        return state, "✅ إجابة صحيحة"
    return state, f"❌ إجابة خاطئة — الصحيح: {correct_letter(rec)}"
227
 
228
def prev_(s):
    """Move the state one question back, stopping at the first one."""
    if not s:
        return s
    previous = s["idx"] - 1
    s["idx"] = previous if previous > 0 else 0
    return s
231
def next_(s):
    """Move the state one question forward, stopping at the last one."""
    if not s:
        return s
    last = len(s["records"]) - 1
    following = s["idx"] + 1
    s["idx"] = following if following < last else last
    return s
234
def reveal(s):
    """Mark the current question as revealed and return ``(state, feedback)``.

    Feedback says whether the stored answer is right or wrong, or only names
    the correct letter when nothing was selected.
    """
    if not s:
        return s, ""
    rec = s["records"][s["idx"]]
    s["revealed"].add(rec["id"])
    given = s["answers"].get(rec["id"])
    if given == correct_letter(rec):
        message = "✅ إجابة صحيحة"
    elif given:
        message = f"❌ إجابة خاطئة — الصحيح: {correct_letter(rec)}"
    else:
        message = f"الصحيح: {correct_letter(rec)}"
    return s, message
241
+
242
def finish(s):
    """Score the quiz, flag the state as finished, return ``(state, summary)``.

    Each record counts as correct, wrong, or skipped (no stored answer).
    """
    if not s:
        return s, ""
    right = wrong = skipped = 0
    for rec in s["records"]:
        given = s["answers"].get(rec["id"])
        expected = correct_letter(rec)
        if given is None:
            skipped += 1
            continue
        if given == expected:
            right += 1
        else:
            wrong += 1
    s["finished"] = True
    summary = f"النتيجة: {right}/{len(s['records'])} (صحيح: {right}، خطأ: {wrong}، متروك: {skipped})"
    return s, summary
253
+
254
+ # ---------- معالجة الإدخال (نص أو ملف) ----------
255
def build_quiz(text_area, file_path, n, model_id, zoom):
    """Turn pasted text or an uploaded file into a quiz state.

    Pasted text takes precedence over the file. Returns a 4-tuple
    ``(state, input_group_update, quiz_group_update, toast_message)``: on
    success the input section is hidden and the quiz section shown; when
    there is no input, or no question can be generated, the state is
    ``None``, the input section stays visible and the toast carries the
    reason.

    Args:
        text_area: Text pasted by the user (may be ``None``/blank).
        file_path: Path of the uploaded PDF/TXT file (may be ``None``).
        n: Requested number of questions.
        model_id: OCR model id forwarded to ``file_to_text``.
        zoom: OCR rasterisation zoom forwarded to ``file_to_text``.
    """
    text_area = (text_area or "").strip()
    if not text_area and not file_path:
        return None, gr.update(visible=True), gr.update(visible=False), "🛈 أدخل نصًا أو ارفع ملفًا أولًا."
    if text_area:
        src_name = "pasted_text.txt"
        raw, method = text_area, "user text"
    else:
        raw, method = file_to_text(file_path, model_id=model_id, zoom=float(zoom))
        src_name = Path(file_path).name
    cleaned = postprocess(raw)
    try:
        items = make_mcqs(cleaned, n=int(n))
    except (ValueError, RuntimeError) as err:
        # make_mcqs signals "text too short / nothing generated" this way;
        # keep the user on the input screen and surface its message.
        return None, gr.update(visible=True), gr.update(visible=False), f"⚠️ {err}"
    records = to_records(items, source=src_name, method=method, n=n)
    state = init_state(records)
    # Show the quiz section and hide the input section.
    return state, gr.update(visible=False), gr.update(visible=True), f"تم توليد {len(records)} سؤالًا."
271
+
272
+ # ---------- الثيم (CSS مطابق للصورة تقريبًا) ----------
273
+ CSS = """
274
+ body {direction:rtl; font-family: system-ui,'Cairo','IBM Plex Arabic',sans-serif; background: radial-gradient(1200px 500px at 50% -100px,#fff7ef,#e9d8c9);}
275
+ .gradio-container {max-width: 980px; margin: 0 auto;}
276
+ .card {background:#fff; border-radius:20px; padding:22px; box-shadow:0 25px 45px rgba(0,0,0,.07);}
277
+ h1,h2,h3,.gr-markdown h1,.gr-markdown h2,.gr-markdown h3 {color:#6c4b34;}
278
+ .button-primary > button {background: linear-gradient(180deg,#d9a978,#c98f65); border:none; color:#22150d;}
279
+ .button-primary > button:hover {filter:brightness(0.95);}
280
+ .soft {opacity:.8;}
281
+ .upload-like {border:2px dashed #d9a97855; background:#fffaf3; border-radius:16px; padding:14px;}
282
+ .progress {text-align:left; opacity:.75}
283
+ .radio .wrap.svelte-1ipelgc label{border-radius:12px}
284
  """
285
 
286
+ # ---------- واجهة Gradio ----------
287
+ with gr.Blocks(title="Question Generator", css=CSS) as demo:
288
+ gr.Markdown("<h2 style='text-align:center;margin-top:8px;'>Question Generator</h2>", elem_classes=["soft"])
289
+
290
+ # القسم A: الإدخال (نص/ملف)
291
+ input_group = gr.Group(visible=True)
292
+ with input_group:
293
+ with gr.Row():
294
+ with gr.Column(scale=2):
295
+ gr.Markdown("<h3>أدخل نصًا أو ارفع ملفًا</h3>")
296
+ text_area = gr.Textbox(lines=10, placeholder="ألصق هنا مقطع نصي...", label=None)
297
+ num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
298
+ with gr.Column(scale=1):
299
+ file_comp = gr.File(label="اختر ملفًا", file_count="single",
300
+ file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
301
+ with gr.Accordion("خيارات متقدمة (لـ PDF المصوّر)", open=False):
302
+ trocr_model = gr.Dropdown(
303
+ choices=[
304
+ "microsoft/trocr-base-printed",
305
+ "microsoft/trocr-large-printed",
306
+ "microsoft/trocr-base-handwritten",
307
+ "microsoft/trocr-large-handwritten",
308
+ ],
309
+ value=DEFAULT_TROCR_MODEL, label="نموذج TrOCR"
310
+ )
311
+ trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="Zoom OCR")
312
+ btn_build = gr.Button("توليد الأسئلة", elem_classes=["button-primary"])
313
+ toast = gr.Markdown("", elem_classes=["soft"])
314
+ input_card = gr.Markdown("", visible=False) # placeholder
315
+
316
+ # القسم B: الاختبار
317
+ quiz_group = gr.Group(visible=False)
318
+ with quiz_group:
319
  with gr.Row():
320
  progress = gr.Label("", elem_classes=["progress"])
321
+ with gr.Row():
322
+ with gr.Column():
323
+ q_md = gr.Markdown("", elem_classes=["card"])
324
+ choices = gr.Radio(choices=[], label="اختر الإجابة", interactive=True, elem_classes=["radio"])
325
+ feedback = gr.Markdown("")
326
+ exp_md = gr.Markdown("")
327
  with gr.Row():
328
  btn_prev = gr.Button("السابق")
329
  btn_next = gr.Button("التالي")
330
  btn_reveal = gr.Button("إظهار الإجابة")
331
+ btn_finish = gr.Button("إنهاء الاختبار", elem_classes=["button-primary"])
332
+ btn_reset = gr.Button("العودة للواجهة", variant="secondary")
333
+
334
+ state = gr.State(None)
335
+
336
+ # بناء الاختبار من الإدخال
337
+ btn_build.click(
338
+ build_quiz,
339
+ inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom],
340
+ outputs=[state, input_group, quiz_group, toast]
341
+ ).then(fn=show, inputs=[state], outputs=[q_md, choices, exp_md, feedback, progress])
342
+
343
+ # تفاعلات الاختبار
344
+ choices.change(lambda s,c: choose(s,c), inputs=[state, choices], outputs=[state, feedback])
345
+ btn_prev.click(prev_, inputs=[state], outputs=[state]).then(show, inputs=[state], outputs=[q_md, choices, exp_md, feedback, progress])
346
+ btn_next.click(next_, inputs=[state], outputs=[state]).then(show, inputs=[state], outputs=[q_md, choices, exp_md, feedback, progress])
347
+ btn_reveal.click(reveal, inputs=[state], outputs=[state, feedback]).then(show, inputs=[state], outputs=[q_md, choices, exp_md, feedback, progress])
348
+ btn_finish.click(finish, inputs=[state], outputs=[state, feedback])
349
+ btn_reset.click(lambda: (None, gr.update(visible=True), gr.update(visible=False), "", "", "", "", ""),
350
+ outputs=[state, input_group, quiz_group, feedback, q_md, choices, exp_md, progress])
351
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  if __name__ == "__main__":
353
  demo.queue().launch()