Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,14 +1,7 @@
|
|
| 1 |
-
# app.py
|
| 2 |
# -*- coding: utf-8 -*-
|
|
|
|
| 3 |
|
| 4 |
-
import os
|
| 5 |
-
import io
|
| 6 |
-
import json
|
| 7 |
-
import uuid
|
| 8 |
-
import random
|
| 9 |
-
import tempfile
|
| 10 |
-
import shutil
|
| 11 |
-
import unicodedata
|
| 12 |
from dataclasses import dataclass
|
| 13 |
from pathlib import Path
|
| 14 |
from typing import List, Tuple
|
|
@@ -18,169 +11,118 @@ from pypdf import PdfReader
|
|
| 18 |
import fitz # PyMuPDF
|
| 19 |
import regex as re2
|
| 20 |
import yake
|
|
|
|
| 21 |
|
| 22 |
-
#
|
| 23 |
-
# إعدادات عامة
|
| 24 |
-
# =========================
|
| 25 |
random.seed(42)
|
| 26 |
DEFAULT_LANG = "ar"
|
| 27 |
-
DEFAULT_NUM_QUESTIONS =
|
| 28 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
| 29 |
-
DEFAULT_TROCR_ZOOM = 2.
|
| 30 |
-
|
| 31 |
-
#
|
| 32 |
-
|
| 33 |
-
def
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
texts.append(t)
|
| 54 |
-
return "\n".join(texts).strip()
|
| 55 |
-
|
| 56 |
-
def pdf_pages_to_images(pdf_path: str, zoom: float = 2.5) -> List[Image.Image]:
|
| 57 |
-
doc = fitz.open(pdf_path)
|
| 58 |
imgs = []
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
|
| 63 |
-
imgs.append(img)
|
| 64 |
doc.close()
|
| 65 |
return imgs
|
| 66 |
|
| 67 |
-
def extract_text_with_ocr(
|
| 68 |
-
ocr =
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
for idx, img in enumerate(images):
|
| 72 |
try:
|
| 73 |
out = ocr(img)
|
| 74 |
-
txt = out[0]
|
| 75 |
except Exception:
|
| 76 |
txt = ""
|
| 77 |
-
|
| 78 |
-
return "\n\n".join(
|
| 79 |
-
|
| 80 |
-
def
|
| 81 |
-
if len(
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
""
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
return embedded_text, "embedded (pypdf: weak)"
|
| 99 |
-
return extract_text_with_ocr(pdf_path, model_id=ocr_model, zoom=ocr_zoom), "OCR (Hugging Face TrOCR)"
|
| 100 |
-
|
| 101 |
-
# =========================
|
| 102 |
-
# 3) تطبيع/تصحيح عربي
|
| 103 |
-
# =========================
|
| 104 |
-
def strip_page_headers(text: str) -> str:
    """Remove page banners, bare page numbers and separator rules from *text*.

    A line is dropped when it is an OCR page banner (``--- [Page N] ---``),
    a lone ``Page N`` / ``صفحة N`` marker, or a rule made of three or more
    dash, underscore or asterisk characters. Remaining lines keep their
    order and are re-joined with newlines.
    """
    noise_patterns = (
        r"^\s*--- \[Page \d+\] ---\s*$",
        r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$",
        r"^\s*[-–—_*]{3,}\s*$",
    )
    kept = [
        line
        for line in text.splitlines()
        if not any(re2.match(pattern, line) for pattern in noise_patterns)
    ]
    return "\n".join(kept)
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
def arabic_ocr_fixes(text: str) -> str:
    """Repair a handful of recurring Arabic OCR mistakes in *text*.

    The missing alef of "اصطناعي" is restored only where it is actually
    missing, then a small table of literal substitutions is applied in
    order.

    Args:
        text: Text to correct.

    Returns:
        The corrected text.
    """
    # A blind replace of "صطناعي" also matches inside the correct spelling
    # and doubles its alef; the lookbehind restricts the fix to words where
    # the alef was really dropped.
    text = re2.sub(r"(?<!ا)صطناعي", "اصطناعي", text)
    fixes = {
        "الااصطناعي": "الاصطناعي",
        "الذكاء الاصطناعيي": "الذكاء الاصطناعي",
        " مع غني": " غني",
        "مع غني ": " غني ",
        " غير المشبعة": " غيرُ المشبعة",
        "وشخصياا": "وشخصياً",
    }
    for wrong, right in fixes.items():
        text = text.replace(wrong, right)
    return text
|
| 144 |
-
|
| 145 |
-
def postprocess_text(raw_text: str, lang: str = "ar") -> str:
|
| 146 |
-
t = strip_page_headers(raw_text)
|
| 147 |
-
t = t.replace("\r", "\n")
|
| 148 |
t = re2.sub(r"\n{3,}", "\n\n", t)
|
| 149 |
t = re2.sub(r"\d+\s*[\[\(][^\]\)]*[\]\)]", " ", t)
|
| 150 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
return t
|
| 155 |
-
|
| 156 |
-
# =========================
|
| 157 |
-
# 4) YAKE + تقسيم الجمل
|
| 158 |
-
# =========================
|
| 159 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 160 |
-
AR_STOP = set("""
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
if
|
| 174 |
-
|
| 175 |
-
if len(kk) < 3: continue
|
| 176 |
-
if re2.match(r"^[\p{P}\p{S}]+$", kk): continue
|
| 177 |
-
seen.add(kk)
|
| 178 |
-
out.append(kk)
|
| 179 |
return out
|
| 180 |
|
| 181 |
-
#
|
| 182 |
-
# 5) مُولِّد MCQ
|
| 183 |
-
# =========================
|
| 184 |
@dataclass
|
| 185 |
class MCQ:
|
| 186 |
id: str
|
|
@@ -189,326 +131,223 @@ class MCQ:
|
|
| 189 |
answer_index: int
|
| 190 |
explanation: str
|
| 191 |
|
| 192 |
-
def
|
| 193 |
-
|
| 194 |
-
return [s for s in sents if len(s) >= 25]
|
| 195 |
|
| 196 |
-
def
|
| 197 |
-
|
| 198 |
-
if kw in AR_STOP: return False
|
| 199 |
-
if re2.match(r"^[\p{P}\p{S}\d_]+$", kw): return False
|
| 200 |
-
return True
|
| 201 |
-
|
| 202 |
-
def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
|
| 203 |
-
"""ملهيات أقرب طولياً للسياق."""
|
| 204 |
-
target_len = len(correct.strip())
|
| 205 |
-
cand = []
|
| 206 |
for w in pool:
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
if
|
| 210 |
-
if len(
|
| 211 |
-
if re2.match(r"^[\p{P}\p{S}\d_]+$", w2): continue
|
| 212 |
-
if abs(len(w2) - target_len) <= 3:
|
| 213 |
-
cand.append(w2)
|
| 214 |
random.shuffle(cand)
|
| 215 |
-
out
|
| 216 |
-
|
| 217 |
-
out.append(w)
|
| 218 |
-
if len(out) == k: break
|
| 219 |
-
fillers = ["—", "— —", "—-"]
|
| 220 |
-
while len(out) < k:
|
| 221 |
-
out.append(random.choice(fillers))
|
| 222 |
return out
|
| 223 |
|
| 224 |
-
def
|
| 225 |
-
|
| 226 |
-
if not
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
for kw in pool_iter:
|
| 246 |
-
if len(items) >= n: break
|
| 247 |
-
if not _is_good_kw(kw): continue
|
| 248 |
-
s = sent_for_kw[kw]
|
| 249 |
-
if s in used_sents: continue
|
| 250 |
-
blanked = re2.sub(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", "_____", s, count=1)
|
| 251 |
-
correct = kw
|
| 252 |
-
distractors = build_distractors(correct, [x for x in keywords if x != kw], k=3)
|
| 253 |
-
choices = distractors + [correct]
|
| 254 |
-
random.shuffle(choices)
|
| 255 |
-
ans_idx = choices.index(correct)
|
| 256 |
-
exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
|
| 257 |
-
items.append(MCQ(id=str(uuid.uuid4())[:8], question=blanked, choices=choices, answer_index=ans_idx, explanation=exp))
|
| 258 |
-
used_sents.add(s)
|
| 259 |
-
if not items:
|
| 260 |
-
raise RuntimeError("تعذر توليد أسئلة من النص. جرّب نصاً أطول أو مختلفاً.")
|
| 261 |
return items
|
| 262 |
|
| 263 |
-
#
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
AR_PUNCT = "،؛؟"
EN_PUNCT = ",;?"


def normalize_punct(s: str) -> str:
    """Convert Latin comma/semicolon/question mark to Arabic and trim the ends.

    Empty or ``None`` input yields ``""``. After conversion, surrounding
    whitespace and any leading/trailing punctuation from either set are
    removed.
    """
    if not s:
        return ""
    # EN_PUNCT and AR_PUNCT are positionally aligned: , -> ،  ; -> ؛  ? -> ؟
    arabized = s.translate(str.maketrans(EN_PUNCT, AR_PUNCT))
    trimmed = arabized.strip()
    trimmed = trimmed.strip(AR_PUNCT + EN_PUNCT)
    return trimmed.strip()
|
| 273 |
-
|
| 274 |
-
def build_quiz_records(items: List[MCQ], lang: str, source_name: str, method: str, num_questions: int):
|
| 275 |
-
json_data = []
|
| 276 |
-
letters = ["A", "B", "C", "D"]
|
| 277 |
for it in items:
|
| 278 |
-
opts
|
| 279 |
-
for
|
| 280 |
-
|
| 281 |
-
txt
|
| 282 |
-
opts.append({"id":
|
| 283 |
-
|
| 284 |
-
exp_clean = normalize_punct(it.explanation)
|
| 285 |
-
record = {
|
| 286 |
"id": it.id,
|
| 287 |
-
"question":
|
| 288 |
"options": opts,
|
| 289 |
-
"explanation":
|
| 290 |
-
"meta": {"
|
| 291 |
-
}
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
def _format_question(rec):
    """Return the Markdown question heading followed by the stripped question text."""
    question_text = rec.get("question", "")
    return "### السؤال:\n" + question_text.strip()
|
| 301 |
-
|
| 302 |
-
def _radio_choices(rec):
    """Build the ``"<id>) <text>"`` labels for a record's options.

    When the record has fewer than four options, the list is padded up to
    four entries with ``"<letter>) —"`` placeholders.
    """
    labels = [
        f"{option.get('id', '')}) {option.get('text', '')}"
        for option in rec.get("options", [])
    ]
    placeholder_ids = ("A", "B", "C", "D")
    for slot in range(len(labels), 4):
        labels.append(f"{placeholder_ids[slot]}) —")
    return labels
|
| 311 |
-
|
| 312 |
-
def _correct_letter(rec):
    """Return the id of the first option flagged ``is_correct``, or ``""`` if none."""
    return next(
        (option.get("id", "") for option in rec.get("options", []) if option.get("is_correct")),
        "",
    )
|
| 316 |
|
| 317 |
-
def
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
-
def
|
| 323 |
-
|
| 324 |
-
choices = _radio_choices(rec)
|
| 325 |
-
exp = _explanation(rec) if revealed else ""
|
| 326 |
-
correct = _correct_letter(rec)
|
| 327 |
-
if user_choice and revealed:
|
| 328 |
-
feedback = "✅ إجابة صحيحة" if user_choice == correct else f"❌ إجابة خاطئة — الصحيح: {correct}"
|
| 329 |
-
elif user_choice:
|
| 330 |
-
feedback = f"تم اختيار: {user_choice}"
|
| 331 |
-
else:
|
| 332 |
-
feedback = ""
|
| 333 |
-
return q_md, choices, exp, feedback
|
| 334 |
-
|
| 335 |
-
def on_show_question(state):
    """Render the current question of *state* for the UI.

    Returns a 5-tuple ``(question_md, choices, explanation, feedback,
    position)``; all parts are empty when there is no state.
    """
    if not state:
        return "", [], "", "", ""
    position = state["idx"]
    records = state["records"]
    current = records[position]
    rec_id = current["id"]
    q_md, choices, exp, feedback = render_current(
        current,
        user_choice=state["answers"].get(rec_id),
        revealed=rec_id in state["revealed"],
    )
    return q_md, choices, exp, feedback, f"{position+1} / {len(records)}"
|
| 343 |
-
|
| 344 |
-
def on_select_choice(state, choice_label):
|
| 345 |
-
if not state or not choice_label: return state, ""
|
| 346 |
rec = state["records"][state["idx"]]
|
| 347 |
-
|
| 348 |
-
state["answers"][rec["id"]] =
|
| 349 |
if rec["id"] in state["revealed"]:
|
| 350 |
-
|
| 351 |
-
fb = "✅ إجابة صحيحة" if chosen_letter == correct else f"❌ إجابة خاطئة — الصحيح: {correct}"
|
| 352 |
else:
|
| 353 |
-
fb = f"تم اختيار: {
|
| 354 |
-
return state, fb
|
| 355 |
-
|
| 356 |
-
def on_prev(state):
    """Move the quiz cursor one question back, stopping at the first one."""
    if state:
        state["idx"] = max(state["idx"] - 1, 0)
    return state
|
| 360 |
-
|
| 361 |
-
def on_next(state):
    """Move the quiz cursor one question forward, stopping at the last one."""
    if state:
        last_index = len(state["records"]) - 1
        state["idx"] = min(state["idx"] + 1, last_index)
    return state
|
| 365 |
-
|
| 366 |
-
def on_reveal(state):
    """Mark the current question as revealed and build the feedback line.

    Returns ``(state, feedback)``; feedback says whether the stored answer
    is right or wrong, or just names the correct letter when nothing was
    answered.
    """
    if not state:
        return state, ""
    rec = state["records"][state["idx"]]
    rec_id = rec["id"]
    state["revealed"].add(rec_id)
    picked = state["answers"].get(rec_id)
    correct = _correct_letter(rec)
    if picked == correct:
        fb = "✅ إجابة صحيحة"
    elif picked:
        fb = f"❌ إجابة خاطئة — الصحيح: {correct}"
    else:
        fb = f"الصحيح: {correct}"
    return state, fb
|
| 374 |
|
| 375 |
-
def
|
| 376 |
-
if
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
# قراءة النص
|
| 411 |
-
if ext == ".txt":
|
| 412 |
-
with open(src_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 413 |
-
raw_text = f.read()
|
| 414 |
-
method = "plain text (no PDF)"
|
| 415 |
else:
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
items =
|
| 420 |
-
records =
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
#
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
.
|
| 432 |
-
.
|
| 433 |
-
.
|
| 434 |
-
.
|
|
|
|
|
|
|
| 435 |
"""
|
| 436 |
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
gr.Markdown("
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
"
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
with gr.Row():
|
| 463 |
progress = gr.Label("", elem_classes=["progress"])
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
|
|
|
|
|
|
| 468 |
with gr.Row():
|
| 469 |
btn_prev = gr.Button("السابق")
|
| 470 |
btn_next = gr.Button("التالي")
|
| 471 |
btn_reveal = gr.Button("إظهار الإجابة")
|
| 472 |
-
btn_finish = gr.Button("إنهاء الاختبار",
|
| 473 |
-
btn_reset = gr.Button("
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
#
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
inputs=[
|
| 481 |
-
outputs=[
|
| 482 |
-
).then(
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
)
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
on_show_question, inputs=[quiz_state],
|
| 494 |
-
outputs=[q_md, choices, exp_md, feedback, progress]
|
| 495 |
-
)
|
| 496 |
-
btn_reveal.click(on_reveal, inputs=[quiz_state], outputs=[quiz_state, feedback]).then(
|
| 497 |
-
on_show_question, inputs=[quiz_state],
|
| 498 |
-
outputs=[q_md, choices, exp_md, feedback, progress]
|
| 499 |
-
)
|
| 500 |
-
|
| 501 |
-
# اختيار الإجابة
|
| 502 |
-
def _on_choice(state, choice):
|
| 503 |
-
return on_select_choice(state, choice)
|
| 504 |
-
choices.change(_on_choice, inputs=[quiz_state, choices], outputs=[quiz_state, feedback])
|
| 505 |
-
|
| 506 |
-
# إنهاء وإظهار نتيجة
|
| 507 |
-
btn_finish.click(on_finish, inputs=[quiz_state], outputs=[quiz_state, score_md])
|
| 508 |
-
|
| 509 |
-
# إعادة ضبط
|
| 510 |
-
btn_reset.click(lambda: on_reset(), outputs=[quiz_state, q_md, choices, exp_md, feedback, score_md, toast])
|
| 511 |
-
|
| 512 |
-
# Spaces تتعرف على demo تلقائيًا
|
| 513 |
if __name__ == "__main__":
|
| 514 |
demo.queue().launch()
|
|
|
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
+
# app.py — واجهة واحدة: توليد أسئلة ➜ اختبار تفاعلي بنفس الثيم
|
| 3 |
|
| 4 |
+
import os, json, uuid, random, unicodedata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import List, Tuple
|
|
|
|
| 11 |
import fitz # PyMuPDF
|
| 12 |
import regex as re2
|
| 13 |
import yake
|
| 14 |
+
import gradio as gr
|
| 15 |
|
| 16 |
+
# ---------- إعدادات عامة ----------
|
|
|
|
|
|
|
| 17 |
# Seed the global RNG at import time so the shuffles used when building
# questions give the same order on every run.
random.seed(42)
# Default language code.
DEFAULT_LANG = "ar"
# Initial value of the "number of questions" slider.
DEFAULT_NUM_QUESTIONS = 6
# Hugging Face model id used for OCR when none is chosen.
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
# Default zoom factor applied when rasterising PDF pages for OCR.
DEFAULT_TROCR_ZOOM = 2.6
|
| 22 |
+
|
| 23 |
+
# ---------- OCR (تحميل كسول) ----------
|
| 24 |
+
# Cache of loaded OCR pipelines, keyed by model id.
_OCR = {}


def get_ocr(model_id: str):
    """Return the image-to-text pipeline for *model_id*, loading it once.

    Pipelines are cached in ``_OCR``. On a cache miss the pipeline is
    created on GPU 0 when CUDA is available, otherwise on CPU.

    Args:
        model_id: Hugging Face model identifier.

    Returns:
        The cached ``transformers`` pipeline object.
    """
    cached = _OCR.get(model_id)
    if cached is None:
        # Import the heavy dependencies only on a cache miss, so a cache hit
        # costs neither the imports nor the CUDA probe.
        import torch
        from transformers import pipeline

        device = 0 if torch.cuda.is_available() else -1
        cached = pipeline("image-to-text", model=model_id, device=device)
        _OCR[model_id] = cached
    return cached
|
| 32 |
+
|
| 33 |
+
# ---------- PDF/TXT → نص ----------
|
| 34 |
+
def extract_text_with_pypdf(path: str) -> str:
    """Return the embedded text of the PDF at *path*, one page per line group.

    Pages whose extraction raises, or yields nothing, contribute an empty
    string. The joined result is stripped.
    """

    def _safe_page_text(page) -> str:
        # Best effort: a page that fails to extract must not abort the file.
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    page_texts = [_safe_page_text(page) for page in PdfReader(path).pages]
    return "\n".join(page_texts).strip()
|
| 42 |
+
|
| 43 |
+
def pdf_to_images(path: str, zoom: float=2.5) -> List[Image.Image]:
    """Rasterise every page of the PDF at *path* into an RGB PIL image.

    Args:
        path: Path of the PDF file.
        zoom: Scale factor applied on both axes when rendering.

    Returns:
        One image per page, in page order.
    """
    doc = fitz.open(path)
    try:
        matrix = fitz.Matrix(zoom, zoom)
        imgs = []
        for page in doc:
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            imgs.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return imgs
    finally:
        # Close the document even when rendering a page fails.
        doc.close()
|
| 51 |
|
| 52 |
+
def extract_text_with_ocr(path: str, model_id: str, zoom: float) -> str:
    """OCR each page of the PDF at *path* and return the concatenated text.

    Every page is prefixed with a ``--- [Page N] ---`` banner (1-based).
    A page whose recognition raises or returns nothing contributes an
    empty body.
    """
    recognise = get_ocr(model_id)
    page_blocks = []
    page_no = 0
    for page_image in pdf_to_images(path, zoom=zoom):
        page_no += 1
        try:
            result = recognise(page_image)
            page_text = result[0].get("generated_text", "").strip() if result else ""
        except Exception:
            # Best effort: keep going with the remaining pages.
            page_text = ""
        page_blocks.append(f"--- [Page {page_no}] ---\n{page_text}")
    return "\n\n".join(page_blocks).strip()
|
| 63 |
+
|
| 64 |
+
def is_good(t: str, min_chars=250, min_alpha=0.15) -> bool:
    """Tell whether extracted text *t* looks usable.

    The text must have at least *min_chars* characters and the share of
    alphanumeric characters must be at least *min_alpha*.
    """
    total = len(t)
    if total < min_chars:
        return False
    alnum_count = len([ch for ch in t if ch.isalnum()])
    return alnum_count / max(1, total) >= min_alpha
|
| 68 |
+
|
| 69 |
+
def file_to_text(path: str, model_id=DEFAULT_TROCR_MODEL, zoom=DEFAULT_TROCR_ZOOM) -> Tuple[str,str]:
    """Read *path* and return ``(text, extraction_method)``.

    ``.txt`` files are read as UTF-8 with undecodable bytes ignored. Any
    other file is treated as a PDF: its embedded text is used when
    ``is_good`` accepts it, otherwise the pages are OCR'd.
    """
    if Path(path).suffix.lower() == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            content = fh.read()
        return content, "plain text"
    embedded = extract_text_with_pypdf(path)
    if not is_good(embedded):
        return extract_text_with_ocr(path, model_id, zoom), "OCR (TrOCR)"
    return embedded, "embedded (pypdf)"
|
| 76 |
+
|
| 77 |
+
# ---------- تنظيف عربي ----------
|
| 78 |
+
AR_DIAC = r"[ًٌٍَُِّْ]"
|
| 79 |
+
def strip_headers(t:str)->str:
    """Drop page banners, bare page numbers and separator rules from *t*.

    Removed lines are ``--- [Page N] ---`` banners, lone ``Page N`` /
    ``صفحة N`` markers, and rules of three or more dash, underscore or
    asterisk characters. Other lines are kept in order.
    """
    noise = (
        r"^\s*--- \[Page \d+\] ---\s*$",
        r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$",
        r"^\s*[-–—_*]{3,}\s*$",
    )

    def _is_noise(line: str) -> bool:
        return any(re2.match(pattern, line) for pattern in noise)

    return "\n".join(line for line in t.splitlines() if not _is_noise(line))
|
| 87 |
|
| 88 |
+
def norm_ar(t:str)->str:
    """Normalise Arabic text for keyword extraction and sentence splitting.

    Steps, in order: NFKC normalisation, tatweel removal, removal of the
    diacritics listed in ``AR_DIAC``, folding of alef variants to bare
    alef and of alef maqsura to ya, whitespace collapsing, and collapsing
    of a letter repeated three or more times to a single letter.

    Args:
        t: Raw text.

    Returns:
        The normalised, stripped text.
    """
    t = unicodedata.normalize("NFKC", t)
    t = re2.sub(r"[ـ]", "", t)
    t = re2.sub(AR_DIAC, "", t)
    t = re2.sub(r"[إأآا]", "ا", t)
    t = re2.sub(r"[يى]", "ي", t)
    t = re2.sub(r"\s+", " ", t)
    # Only runs of 3+ identical letters are treated as OCR stutter. Collapsing
    # plain doubles would damage correctly spelled words that contain a
    # doubled letter (e.g. "اللغة").
    t = re2.sub(r"(\p{L})\1{2,}", r"\1", t)
    return t.strip()
|
| 98 |
+
|
| 99 |
+
def postprocess(raw:str)->str:
    """Clean raw extracted text and return its normalised form.

    Page headers are stripped, carriage returns become newlines, runs of
    three or more newlines shrink to two, number-plus-bracket groups and
    ``[N]`` markers are blanked, and the result goes through ``norm_ar``.
    """
    text = strip_headers(raw)
    text = text.replace("\r", "\n")
    substitutions = (
        (r"\n{3,}", "\n\n"),
        (r"\d+\s*[\[\(][^\]\)]*[\]\)]", " "),
        (r"\[\d+\]", " "),
    )
    for pattern, replacement in substitutions:
        text = re2.sub(pattern, replacement, text)
    return norm_ar(text)
|
| 105 |
+
|
| 106 |
+
# ---------- YAKE + تقسيم ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 108 |
+
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
| 109 |
+
|
| 110 |
+
def split_sents(t:str)->List[str]:
    """Split *t* into sentences and keep those of at least 25 characters."""
    sentences = []
    for piece in SENT_SPLIT.split(t):
        piece = piece.strip()
        if len(piece) >= 25:
            sentences.append(piece)
    return sentences
|
| 113 |
+
|
| 114 |
+
def yake_keywords(t:str, k:int=160)->List[str]:
    """Extract up to *k* single-word YAKE keywords from *t*.

    Candidates are stripped and returned in extractor order, skipping
    empty strings, repeats, stop words, words shorter than three
    characters and tokens made only of punctuation or symbols.
    """
    extractor = yake.KeywordExtractor(lan='ar', n=1, top=k)
    candidates = [phrase for phrase, _score in extractor.extract_keywords(t)]
    unique = []
    seen = set()
    for candidate in candidates:
        word = candidate.strip()
        if not word or word in seen or word in AR_STOP:
            continue
        if len(word) < 3 or re2.match(r"^[\p{P}\p{S}]+$", word):
            continue
        seen.add(word)
        unique.append(word)
    return unique
|
| 124 |
|
| 125 |
+
# ---------- مولّد MCQ ----------
|
|
|
|
|
|
|
| 126 |
@dataclass
|
| 127 |
class MCQ:
|
| 128 |
id: str
|
|
|
|
| 131 |
answer_index: int
|
| 132 |
explanation: str
|
| 133 |
|
| 134 |
+
def good_kw(kw:str)->bool:
    """Tell whether *kw* can be used as a blanked-out answer.

    A usable keyword is non-empty, at least three characters long, not a
    stop word, and not made only of punctuation, symbols, digits or
    underscores.

    Returns:
        A real ``bool``, as the annotation promises.
    """
    if not kw or len(kw) < 3 or kw in AR_STOP:
        return False
    return re2.match(r"^[\p{P}\p{S}\d_]+$", kw) is None
|
|
|
|
| 136 |
|
| 137 |
+
def distractors(correct:str, pool:List[str], k:int=3)->List[str]:
    """Pick *k* wrong options for *correct* from *pool*.

    Words within three characters of the answer's length are preferred and
    shuffled. If there are too few, the remaining slots are filled with
    other valid pool words, closest length first. ``"—"`` is used only
    when the pool is exhausted.

    Args:
        correct: The right answer; never returned as a distractor.
        pool: Candidate words.
        k: Number of distractors to return.

    Returns:
        Exactly *k* strings, with no repeated pool word.
    """
    target = correct.strip()
    target_len = len(target)
    # Seeding `seen` with both spellings keeps the answer out even when the
    # caller passes it with surrounding whitespace.
    seen = {correct, target}
    near, far = [], []
    for word in pool:
        word = word.strip()
        if not word or word in seen or word in AR_STOP:
            continue
        if re2.match(r"^[\p{P}\p{S}\d_]+$", word):
            continue
        seen.add(word)
        if abs(len(word) - target_len) <= 3:
            near.append(word)
        else:
            far.append(word)
    random.shuffle(near)
    out = near[:k]
    if len(out) < k:
        # Prefer a real word of a different length over a placeholder.
        far.sort(key=lambda w: abs(len(w) - target_len))
        out.extend(far[: k - len(out)])
    while len(out) < k:
        out.append("—")
    return out
|
| 148 |
|
| 149 |
+
def make_mcqs(text:str, n:int=6)->List[MCQ]:
    """Generate up to *n* fill-in-the-blank questions from *text*.

    Each usable keyword is matched to the first sentence containing it as
    a whole word. That sentence, with the keyword blanked, becomes the
    question; the keyword plus three distractors become the shuffled
    choices. A sentence is used for at most one question.

    Args:
        text: Cleaned source text.
        n: Maximum number of questions.

    Returns:
        The generated questions.

    Raises:
        ValueError: If no usable sentence is found.
        RuntimeError: If no question could be built.
    """
    from collections import Counter

    sents = split_sents(text)
    if not sents:
        raise ValueError("النص قصير أو غير صالح.")
    kws = yake_keywords(text)
    if not kws:
        # Fallback: the 80 most frequent distinct tokens. Counting distinct
        # tokens avoids a list filled with copies of the same word.
        token_counts = Counter(re2.findall(r"[\p{L}\p{N}_]+", text))
        kws = [token for token, _count in token_counts.most_common(80)]
    # Compile each whole-word pattern once instead of once per sentence.
    patterns = {
        kw: re2.compile(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})")
        for kw in kws
        if good_kw(kw)
    }
    sent_for = {}
    for s in sents:
        for kw, pattern in patterns.items():
            if kw not in sent_for and pattern.search(s):
                sent_for[kw] = s
    items = []
    used = set()
    for kw in kws:
        if len(items) >= n:
            break
        s = sent_for.get(kw)
        if s is None or s in used:
            continue
        q = patterns[kw].sub("_____", s, count=1)
        ch = distractors(kw, [x for x in kws if x != kw], 3) + [kw]
        random.shuffle(ch)
        ans = ch.index(kw)
        exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans, explanation=exp))
        used.add(s)
    if not items:
        raise RuntimeError("تعذّر توليد أسئلة. جرّب نصاً أطول.")
    return items
|
| 171 |
|
| 172 |
+
# ---------- تحويل للسجلات ----------
|
| 173 |
+
def to_records(items: "List[MCQ]", source:str, method:str, n:int)->List[dict]:
    """Convert questions into plain dict records for the quiz state.

    Each record carries the question id, stripped question and
    explanation, exactly four lettered options (padded with ``"—"``), and
    a ``meta`` dict with the source name, extraction method and requested
    question count. Latin ``, ? ;`` in option text become Arabic marks.
    """
    punct_map = str.maketrans({",": "،", "?": "؟", ";": "؛"})

    def _option(item, pos, label):
        raw = item.choices[pos] if pos < len(item.choices) else "—"
        text = raw.strip().translate(punct_map)
        return {"id": label, "text": text or "—", "is_correct": pos == item.answer_index}

    return [
        {
            "id": item.id,
            "question": item.question.strip(),
            "options": [_option(item, pos, label) for pos, label in enumerate("ABCD")],
            "explanation": item.explanation.strip(),
            "meta": {"source": source, "extraction_method": method, "num_questions": int(n)},
        }
        for item in items
    ]
|
| 189 |
+
|
| 190 |
+
# ---------- منطق الاختبار ----------
|
| 191 |
+
def correct_letter(rec):
    """Return the id of the first option marked correct in *rec*, or ``""``."""
    return next(
        (option["id"] for option in rec["options"] if option["is_correct"]),
        "",
    )
|
| 195 |
|
| 196 |
+
def init_state(records):
    """Create a fresh quiz state: cursor at 0, nothing answered or revealed."""
    state = dict(
        records=records,
        idx=0,
        answers={},
        revealed=set(),
        finished=False,
    )
    return state
|
| 198 |
+
|
| 199 |
+
def render(rec, user=None, revealed=False):
    """Build the display parts for one question.

    Args:
        rec: Question record with ``question``, ``options`` and
            ``explanation`` keys.
        user: Letter the user picked, if any.
        revealed: Whether the answer has been revealed.

    Returns:
        ``(question_md, choice_labels, explanation, feedback)``. The
        explanation is empty until revealed. Once revealed, feedback always
        names the outcome, including the correct letter when the question
        was left unanswered.
    """
    q_md = f"### السؤال\n{rec['question']}"
    ch = [f"{o['id']}) {o['text']}" for o in rec["options"]]
    if not revealed:
        return q_md, ch, "", (f"تم اختيار: {user}" if user else "")
    correct = correct_letter(rec)
    if not user:
        # Without this branch the "correct answer" line is lost as soon as
        # the question is re-rendered after a reveal.
        fb = f"الصحيح: {correct}"
    elif user == correct:
        fb = "✅ إجابة صحيحة"
    else:
        fb = f"❌ إجابة خاطئة — الصحيح: {correct}"
    return q_md, ch, rec["explanation"], fb
|
| 209 |
+
|
| 210 |
+
def show(state):
    """Produce the UI values for the current question of *state*.

    Returns:
        ``(question_md, radio_update, explanation, feedback, position)``.
        ``radio_update`` is a ``gr.update`` carrying the option labels as
        ``choices`` and the previously selected label as ``value``.
    """
    if not state:
        return "", gr.update(choices=[], value=None), "", "", ""
    rec = state["records"][state["idx"]]
    picked = state["answers"].get(rec["id"])
    q, ch, exp, fb = render(rec, picked, rec["id"] in state["revealed"])
    # A bare list returned to a Radio output is taken as its value, so the
    # options must be sent explicitly as `choices`. Re-select the stored
    # answer so going back shows what was picked.
    selected = next(
        (label for label in ch if label.split(")")[0].strip() == picked),
        None,
    )
    pos = f"{state['idx']+1} / {len(state['records'])}"
    return q, gr.update(choices=ch, value=selected), exp, fb, pos
|
| 216 |
|
| 217 |
+
def choose(state, label):
    """Store the option picked for the current question and build feedback.

    The letter is the part of *label* before the first ``)``. Returns
    ``(state, feedback)``: a right/wrong verdict if the question is already
    revealed, otherwise a "selected" note. Missing state or label leaves
    everything unchanged with empty feedback.
    """
    if not (state and label):
        return state, ""
    rec = state["records"][state["idx"]]
    rec_id = rec["id"]
    letter = label.split(")")[0].strip()
    state["answers"][rec_id] = letter
    if rec_id not in state["revealed"]:
        return state, f"تم اختيار: {letter}"
    if letter == correct_letter(rec):
        return state, "✅ إجابة صحيحة"
    return state, f"❌ إجابة خاطئة — الصحيح: {correct_letter(rec)}"
|
| 227 |
|
| 228 |
+
def prev_(s):
    """Step the cursor of state *s* back by one, never below zero."""
    if not s:
        return s
    s["idx"] = max(s["idx"] - 1, 0)
    return s
|
| 231 |
+
def next_(s):
    """Step the cursor of state *s* forward by one, never past the last record."""
    if not s:
        return s
    last_index = len(s["records"]) - 1
    s["idx"] = min(s["idx"] + 1, last_index)
    return s
|
| 234 |
+
def reveal(s):
    """Mark the current question of *s* as revealed and build its feedback.

    Returns ``(state, feedback)``: a right/wrong verdict when the question
    was answered, otherwise just the correct letter.
    """
    if not s:
        return s, ""
    rec = s["records"][s["idx"]]
    rec_id = rec["id"]
    s["revealed"].add(rec_id)
    picked = s["answers"].get(rec_id)
    if picked == correct_letter(rec):
        fb = "✅ إجابة صحيحة"
    elif picked:
        fb = f"❌ إجابة خاطئة — الصحيح: {correct_letter(rec)}"
    else:
        fb = f"الصحيح: {correct_letter(rec)}"
    return s, fb
|
| 241 |
+
|
| 242 |
+
def finish(s):
    """Score the quiz in state *s* and flag it as finished.

    Returns ``(state, summary)`` where the summary reports the numbers of
    correct, wrong and skipped questions.
    """
    if not s:
        return s, ""
    right = wrong = skipped = 0
    for rec in s["records"]:
        given = s["answers"].get(rec["id"])
        expected = correct_letter(rec)
        if given is None:
            skipped += 1
        elif given == expected:
            right += 1
        else:
            wrong += 1
    s["finished"] = True
    total = len(s["records"])
    return s, f"النتيجة: {right}/{total} (صحيح: {right}، خطأ: {wrong}، متروك: {skipped})"
|
| 253 |
+
|
| 254 |
+
# ---------- معالجة الإدخال (نص أو ملف) ----------
|
| 255 |
+
def build_quiz(text_area, file_path, n, model_id, zoom):
    """Build a quiz from pasted text or an uploaded file.

    Pasted text takes precedence over the file.

    Args:
        text_area: Text typed or pasted by the user (may be empty).
        file_path: Path of the uploaded ``.pdf``/``.txt`` file (may be empty).
        n: Requested number of questions.
        model_id: OCR model id used for PDFs without usable embedded text.
        zoom: Rasterisation zoom used for OCR.

    Returns:
        ``(state, input_group_update, quiz_group_update, message)``. On
        success the input section is hidden and the quiz section shown. On
        missing input or any failure the state is ``None``, the input
        section stays visible, and the message explains why.
    """
    text_area = (text_area or "").strip()
    if not text_area and not file_path:
        return None, gr.update(visible=True), gr.update(visible=False), "🛈 أدخل نصًا أو ارفع ملفًا أولًا."
    n = int(n)
    try:
        if text_area:
            src_name = "pasted_text.txt"
            raw, method = text_area, "user text"
        else:
            raw, method = file_to_text(file_path, model_id=model_id, zoom=float(zoom))
            src_name = Path(file_path).name
        items = make_mcqs(postprocess(raw), n=n)
    except Exception as exc:
        # UI boundary: report the problem (text too short, unreadable file,
        # OCR failure) in the page and keep the input form visible.
        return None, gr.update(visible=True), gr.update(visible=False), f"⚠️ {exc}"
    records = to_records(items, source=src_name, method=method, n=n)
    state = init_state(records)
    # Show the quiz section and hide the input section.
    return state, gr.update(visible=False), gr.update(visible=True), f"تم توليد {len(records)} سؤالًا."
|
| 271 |
+
|
| 272 |
+
# ---------- الثيم (CSS مطابق للصورة تقريبًا) ----------
|
| 273 |
+
CSS = """
|
| 274 |
+
body {direction:rtl; font-family: system-ui,'Cairo','IBM Plex Arabic',sans-serif; background: radial-gradient(1200px 500px at 50% -100px,#fff7ef,#e9d8c9);}
|
| 275 |
+
.gradio-container {max-width: 980px; margin: 0 auto;}
|
| 276 |
+
.card {background:#fff; border-radius:20px; padding:22px; box-shadow:0 25px 45px rgba(0,0,0,.07);}
|
| 277 |
+
h1,h2,h3,.gr-markdown h1,.gr-markdown h2,.gr-markdown h3 {color:#6c4b34;}
|
| 278 |
+
.button-primary > button {background: linear-gradient(180deg,#d9a978,#c98f65); border:none; color:#22150d;}
|
| 279 |
+
.button-primary > button:hover {filter:brightness(0.95);}
|
| 280 |
+
.soft {opacity:.8;}
|
| 281 |
+
.upload-like {border:2px dashed #d9a97855; background:#fffaf3; border-radius:16px; padding:14px;}
|
| 282 |
+
.progress {text-align:left; opacity:.75}
|
| 283 |
+
.radio .wrap.svelte-1ipelgc label{border-radius:12px}
|
| 284 |
"""
|
| 285 |
|
| 286 |
+
# ---------- واجهة Gradio ----------
|
| 287 |
+
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 288 |
+
gr.Markdown("<h2 style='text-align:center;margin-top:8px;'>Question Generator</h2>", elem_classes=["soft"])
|
| 289 |
+
|
| 290 |
+
# القسم A: الإدخال (نص/ملف)
|
| 291 |
+
input_group = gr.Group(visible=True)
|
| 292 |
+
with input_group:
|
| 293 |
+
with gr.Row():
|
| 294 |
+
with gr.Column(scale=2):
|
| 295 |
+
gr.Markdown("<h3>أدخل نصًا أو ارفع ملفًا</h3>")
|
| 296 |
+
text_area = gr.Textbox(lines=10, placeholder="ألصق هنا مقطع نصي...", label=None)
|
| 297 |
+
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 298 |
+
with gr.Column(scale=1):
|
| 299 |
+
file_comp = gr.File(label="اختر ملفًا", file_count="single",
|
| 300 |
+
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 301 |
+
with gr.Accordion("خيارات متقدمة (لـ PDF المصوّر)", open=False):
|
| 302 |
+
trocr_model = gr.Dropdown(
|
| 303 |
+
choices=[
|
| 304 |
+
"microsoft/trocr-base-printed",
|
| 305 |
+
"microsoft/trocr-large-printed",
|
| 306 |
+
"microsoft/trocr-base-handwritten",
|
| 307 |
+
"microsoft/trocr-large-handwritten",
|
| 308 |
+
],
|
| 309 |
+
value=DEFAULT_TROCR_MODEL, label="نموذج TrOCR"
|
| 310 |
+
)
|
| 311 |
+
trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="Zoom OCR")
|
| 312 |
+
btn_build = gr.Button("توليد الأسئلة", elem_classes=["button-primary"])
|
| 313 |
+
toast = gr.Markdown("", elem_classes=["soft"])
|
| 314 |
+
input_card = gr.Markdown("", visible=False) # placeholder
|
| 315 |
+
|
| 316 |
+
# القسم B: الاختبار
|
| 317 |
+
quiz_group = gr.Group(visible=False)
|
| 318 |
+
with quiz_group:
|
| 319 |
with gr.Row():
|
| 320 |
progress = gr.Label("", elem_classes=["progress"])
|
| 321 |
+
with gr.Row():
|
| 322 |
+
with gr.Column():
|
| 323 |
+
q_md = gr.Markdown("", elem_classes=["card"])
|
| 324 |
+
choices = gr.Radio(choices=[], label="اختر الإجابة", interactive=True, elem_classes=["radio"])
|
| 325 |
+
feedback = gr.Markdown("")
|
| 326 |
+
exp_md = gr.Markdown("")
|
| 327 |
with gr.Row():
|
| 328 |
btn_prev = gr.Button("السابق")
|
| 329 |
btn_next = gr.Button("التالي")
|
| 330 |
btn_reveal = gr.Button("إظهار الإجابة")
|
| 331 |
+
btn_finish = gr.Button("إنهاء الاختبار", elem_classes=["button-primary"])
|
| 332 |
+
btn_reset = gr.Button("العودة للواجهة", variant="secondary")
|
| 333 |
+
|
| 334 |
+
state = gr.State(None)
|
| 335 |
+
|
| 336 |
+
# بناء الاختبار من الإدخال
|
| 337 |
+
btn_build.click(
|
| 338 |
+
build_quiz,
|
| 339 |
+
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom],
|
| 340 |
+
outputs=[state, input_group, quiz_group, toast]
|
| 341 |
+
).then(fn=show, inputs=[state], outputs=[q_md, choices, exp_md, feedback, progress])
|
| 342 |
+
|
| 343 |
+
# تفاعلات الاختبار
|
| 344 |
+
choices.change(lambda s,c: choose(s,c), inputs=[state, choices], outputs=[state, feedback])
|
| 345 |
+
btn_prev.click(prev_, inputs=[state], outputs=[state]).then(show, inputs=[state], outputs=[q_md, choices, exp_md, feedback, progress])
|
| 346 |
+
btn_next.click(next_, inputs=[state], outputs=[state]).then(show, inputs=[state], outputs=[q_md, choices, exp_md, feedback, progress])
|
| 347 |
+
btn_reveal.click(reveal, inputs=[state], outputs=[state, feedback]).then(show, inputs=[state], outputs=[q_md, choices, exp_md, feedback, progress])
|
| 348 |
+
btn_finish.click(finish, inputs=[state], outputs=[state, feedback])
|
| 349 |
+
btn_reset.click(lambda: (None, gr.update(visible=True), gr.update(visible=False), "", "", "", "", ""),
|
| 350 |
+
outputs=[state, input_group, quiz_group, feedback, q_md, choices, exp_md, progress])
|
| 351 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
if __name__ == "__main__":
|
| 353 |
demo.queue().launch()
|