Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
-
#
|
| 3 |
-
|
| 4 |
import os, json, uuid, random, unicodedata
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from pathlib import Path
|
|
@@ -13,14 +12,13 @@ import regex as re2
|
|
| 13 |
import yake
|
| 14 |
import gradio as gr
|
| 15 |
|
| 16 |
-
#
|
| 17 |
random.seed(42)
|
| 18 |
-
DEFAULT_LANG = "ar"
|
| 19 |
DEFAULT_NUM_QUESTIONS = 6
|
| 20 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
| 21 |
DEFAULT_TROCR_ZOOM = 2.6
|
| 22 |
|
| 23 |
-
#
|
| 24 |
_OCR = {}
|
| 25 |
def get_ocr(model_id: str):
|
| 26 |
from transformers import pipeline
|
|
@@ -30,22 +28,25 @@ def get_ocr(model_id: str):
|
|
| 30 |
_OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
|
| 31 |
return _OCR[model_id]
|
| 32 |
|
| 33 |
-
#
|
| 34 |
def extract_text_with_pypdf(path: str) -> str:
|
| 35 |
reader = PdfReader(path)
|
| 36 |
-
|
| 37 |
for p in reader.pages:
|
| 38 |
-
try:
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
|
| 43 |
def pdf_to_images(path: str, zoom: float=2.5) -> List[Image.Image]:
|
| 44 |
-
doc = fitz.open(path)
|
|
|
|
| 45 |
imgs = []
|
| 46 |
for pg in doc:
|
| 47 |
pix = pg.get_pixmap(matrix=M, alpha=False)
|
| 48 |
-
imgs.append(Image.frombytes("RGB",(pix.width,pix.height),pix.samples))
|
| 49 |
doc.close()
|
| 50 |
return imgs
|
| 51 |
|
|
@@ -55,7 +56,7 @@ def extract_text_with_ocr(path: str, model_id: str, zoom: float) -> str:
|
|
| 55 |
for i, img in enumerate(pdf_to_images(path, zoom=zoom), start=1):
|
| 56 |
try:
|
| 57 |
out = ocr(img)
|
| 58 |
-
txt = out[0].get("generated_text","").strip() if out else ""
|
| 59 |
except Exception:
|
| 60 |
txt = ""
|
| 61 |
parts.append(f"--- [Page {i}] ---\n{txt}")
|
|
@@ -64,20 +65,21 @@ def extract_text_with_ocr(path: str, model_id: str, zoom: float) -> str:
|
|
| 64 |
def is_good(t: str, min_chars=250, min_alpha=0.15) -> bool:
|
| 65 |
if len(t) < min_chars: return False
|
| 66 |
alnum = sum(ch.isalnum() for ch in t)
|
| 67 |
-
return (alnum/max(1,len(t))) >= min_alpha
|
| 68 |
|
| 69 |
-
def file_to_text(path: str, model_id=DEFAULT_TROCR_MODEL, zoom=DEFAULT_TROCR_ZOOM) -> Tuple[str,str]:
|
| 70 |
ext = Path(path).suffix.lower()
|
| 71 |
if ext == ".txt":
|
| 72 |
-
with open(path,"r",encoding="utf-8",errors="ignore") as f:
|
|
|
|
| 73 |
raw = extract_text_with_pypdf(path)
|
| 74 |
if is_good(raw): return raw, "embedded (pypdf)"
|
| 75 |
return extract_text_with_ocr(path, model_id, zoom), "OCR (TrOCR)"
|
| 76 |
|
| 77 |
-
#
|
| 78 |
AR_DIAC = r"[ًٌٍَُِّْ]"
|
| 79 |
-
def strip_headers(t:str)->str:
|
| 80 |
-
out=[]
|
| 81 |
for ln in t.splitlines():
|
| 82 |
if re2.match(r"^\s*--- \[Page \d+\] ---\s*$", ln): continue
|
| 83 |
if re2.match(r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$", ln): continue
|
|
@@ -85,246 +87,234 @@ def strip_headers(t:str)->str:
|
|
| 85 |
out.append(ln)
|
| 86 |
return "\n".join(out)
|
| 87 |
|
| 88 |
-
def norm_ar(t:str)->str:
|
| 89 |
t = unicodedata.normalize("NFKC", t)
|
| 90 |
t = re2.sub(r"[ـ]", "", t)
|
| 91 |
t = re2.sub(AR_DIAC, "", t)
|
| 92 |
t = re2.sub(r"[إأآا]", "ا", t)
|
| 93 |
t = re2.sub(r"[يى]", "ي", t)
|
| 94 |
-
t = re2.sub(r"
|
| 95 |
t = re2.sub(r'(\p{L})\1{2,}', r'\1', t)
|
| 96 |
t = re2.sub(r'(\p{L})\1', r'\1', t)
|
| 97 |
return t.strip()
|
| 98 |
|
| 99 |
-
def postprocess(raw:str)->str:
|
| 100 |
-
t = strip_headers(raw).replace("\r","\n")
|
| 101 |
t = re2.sub(r"\n{3,}", "\n\n", t)
|
| 102 |
t = re2.sub(r"\d+\s*[\[\(][^\]\)]*[\]\)]", " ", t)
|
| 103 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 104 |
return norm_ar(t)
|
| 105 |
|
| 106 |
-
#
|
| 107 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 108 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
| 109 |
|
| 110 |
-
def split_sents(t:str)->List[str]:
|
| 111 |
-
s=[x.strip() for x in SENT_SPLIT.split(t) if x.strip()]
|
| 112 |
-
return [x for x in s if len(x)>=25]
|
| 113 |
-
|
| 114 |
-
def yake_keywords(t:str, k:int=160)->List[str]:
|
| 115 |
-
ex = yake.KeywordExtractor(lan='ar', n=1, top=k)
|
| 116 |
-
cands = [w for w,_ in ex.extract_keywords(t)]
|
| 117 |
-
out=[]; seen=set()
|
| 118 |
-
for k in cands:
|
| 119 |
-
k=k.strip()
|
| 120 |
-
if not k or k in seen or k in AR_STOP: continue
|
| 121 |
-
if len(k)<3 or re2.match(r"^[\p{P}\p{S}]+$",k): continue
|
| 122 |
-
seen.add(k); out.append(k)
|
| 123 |
-
return out
|
| 124 |
-
|
| 125 |
-
# ---------- مولّد MCQ ----------
|
| 126 |
@dataclass
|
| 127 |
class MCQ:
|
| 128 |
id: str
|
| 129 |
question: str
|
| 130 |
choices: List[str]
|
| 131 |
answer_index: int
|
| 132 |
-
explanation: str
|
| 133 |
|
| 134 |
-
def
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
def distractors(correct:str, pool:List[str], k:int=3)->List[str]:
|
| 138 |
-
L=len(correct.strip()); cand=[]
|
| 139 |
for w in pool:
|
| 140 |
-
w=w.strip()
|
| 141 |
-
if not w or w==correct or w in AR_STOP: continue
|
| 142 |
if re2.match(r"^[\p{P}\p{S}\d_]+$", w): continue
|
| 143 |
-
if abs(len(w)-L)<=3: cand.append(w)
|
| 144 |
random.shuffle(cand)
|
| 145 |
-
out=cand[:k]
|
| 146 |
-
while len(out)<k: out.append("—")
|
| 147 |
return out
|
| 148 |
|
| 149 |
-
def make_mcqs(text:str, n:int=6)->List[MCQ]:
|
| 150 |
-
sents=split_sents(text)
|
| 151 |
if not sents: raise ValueError("النص قصير أو غير صالح.")
|
| 152 |
-
kws=yake_keywords(text) or [w for w,_ in sorted(((t, text.count(t)) for t in re2.findall(r"[\p{L}\p{N}_]+",text)), key=lambda x
|
| 153 |
-
sent_for={}
|
| 154 |
for s in sents:
|
| 155 |
for kw in kws:
|
| 156 |
if good_kw(kw) and re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for:
|
| 157 |
-
sent_for[kw]=s
|
| 158 |
-
items=[]
|
| 159 |
for kw in [k for k in kws if k in sent_for]:
|
| 160 |
-
if len(items)>=n: break
|
| 161 |
-
s=sent_for[kw]
|
| 162 |
if s in used: continue
|
| 163 |
-
q=re2.sub(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", "_____", s, count=1)
|
| 164 |
-
ch=distractors(kw, [x for x in kws if x!=kw], 3)+[kw]
|
| 165 |
-
random.shuffle(ch); ans=ch.index(kw)
|
| 166 |
-
|
| 167 |
-
items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans, explanation=exp))
|
| 168 |
used.add(s)
|
| 169 |
-
if not items: raise RuntimeError("تعذّر توليد أسئلة.
|
| 170 |
return items
|
| 171 |
|
| 172 |
-
#
|
| 173 |
-
def to_records(items:List[MCQ]
|
| 174 |
-
recs=[]
|
| 175 |
for it in items:
|
| 176 |
-
opts=[]
|
| 177 |
-
for i,lbl in enumerate(["A","B","C","D"]):
|
| 178 |
-
txt=(it.choices[i] if i<len(it.choices) else "—").strip()
|
| 179 |
-
txt=txt.replace(",", "،").replace("?", "؟").replace(";", "؛")
|
| 180 |
-
opts.append({"id":lbl,"text":txt or "—","is_correct":(i==it.answer_index)})
|
| 181 |
-
recs.append({
|
| 182 |
-
"id": it.id,
|
| 183 |
-
"question": it.question.strip(),
|
| 184 |
-
"options": opts,
|
| 185 |
-
"explanation": it.explanation.strip(),
|
| 186 |
-
"meta": {"source": source, "extraction_method": method, "num_questions": int(n)}
|
| 187 |
-
})
|
| 188 |
return recs
|
| 189 |
|
| 190 |
-
#
|
| 191 |
-
def correct_letter(rec):
|
| 192 |
-
for o in rec["options"]:
|
| 193 |
-
if o["is_correct"]: return o["id"]
|
| 194 |
-
return ""
|
| 195 |
-
|
| 196 |
-
def init_state(records):
|
| 197 |
-
return {"records": records, "finished": False}
|
| 198 |
-
|
| 199 |
-
# ---------- HTML للواجهة/الاختبار ----------
|
| 200 |
def render_quiz_html(records: List[dict]) -> str:
|
| 201 |
parts = []
|
| 202 |
for i, rec in enumerate(records, start=1):
|
| 203 |
-
qid = rec["id"]
|
| 204 |
-
qtxt = rec["question"]
|
| 205 |
-
opts = rec["options"]
|
| 206 |
opts_html = []
|
| 207 |
for o in opts:
|
| 208 |
-
lid = o["id"]
|
| 209 |
opts_html.append(f"""
|
| 210 |
<label class="opt">
|
| 211 |
-
<input type="radio" name="q_{qid}" value="{lid}"
|
| 212 |
<span class="opt-letter">{lid}</span>
|
| 213 |
<span class="opt-text">{txt}</span>
|
| 214 |
</label>
|
| 215 |
""")
|
| 216 |
parts.append(f"""
|
| 217 |
<div class="q-card" data-qid="{qid}">
|
| 218 |
-
<div class="q-
|
|
|
|
|
|
|
|
|
|
| 219 |
<div class="q-text">{qtxt}</div>
|
| 220 |
<div class="opts">{''.join(opts_html)}</div>
|
| 221 |
</div>
|
| 222 |
""")
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
-
#
|
| 226 |
def build_quiz(text_area, file_path, n, model_id, zoom):
|
| 227 |
-
|
| 228 |
-
if not
|
| 229 |
-
return None, "", "🛈 أدخل نصًا أو ارفع
|
| 230 |
-
if
|
| 231 |
-
|
| 232 |
-
raw, method = text_area, "user text"
|
| 233 |
else:
|
| 234 |
raw, method = file_to_text(file_path, model_id=model_id, zoom=float(zoom))
|
| 235 |
-
src_name = Path(file_path).name
|
| 236 |
cleaned = postprocess(raw)
|
| 237 |
items = make_mcqs(cleaned, n=int(n))
|
| 238 |
-
|
| 239 |
-
state =
|
| 240 |
-
|
| 241 |
-
return state, html, f"تم توليد {len(records)} سؤالًا."
|
| 242 |
|
| 243 |
-
#
|
| 244 |
def grade(state, answers_json):
|
| 245 |
try:
|
| 246 |
user_map = json.loads(answers_json or "{}")
|
| 247 |
except Exception:
|
| 248 |
-
|
| 249 |
recs = state["records"] if state else []
|
| 250 |
total = len(recs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
correct = 0
|
| 252 |
-
wrong_details = []
|
| 253 |
for rec in recs:
|
| 254 |
qid = rec["id"]
|
| 255 |
-
chosen =
|
| 256 |
-
cor =
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
md = ["### الإجابات الخاطئة:"]
|
| 267 |
-
for rec, chosen, cor, note in wrong_details:
|
| 268 |
-
opts = {o["id"]: o["text"] for o in rec["options"]}
|
| 269 |
-
md.append(
|
| 270 |
-
f"- **السؤال:** {rec['question']}\n"
|
| 271 |
-
f" - إجابتك: **{chosen or '—'}** — {opts.get(chosen,'')}\n"
|
| 272 |
-
f" - الصحيحة: **{cor}** — {opts.get(cor,'')}\n"
|
| 273 |
-
f" - الشرح: {rec['explanation']}\n"
|
| 274 |
-
+ (f" - ملاحظة: {note}\n" if note else "")
|
| 275 |
-
)
|
| 276 |
-
mistakes_md = "\n".join(md)
|
| 277 |
-
else:
|
| 278 |
-
mistakes_md = "### ممتاز! جميع الإجابات صحيحة ✅"
|
| 279 |
-
|
| 280 |
-
return score_md, mistakes_md
|
| 281 |
-
|
| 282 |
-
# ---------- الثيم (CSS داكن ثابت) ----------
|
| 283 |
CSS = """
|
| 284 |
:root{
|
| 285 |
-
--bg:#
|
| 286 |
-
--text:#
|
| 287 |
}
|
| 288 |
body{direction:rtl; font-family:system-ui,'Cairo','IBM Plex Arabic',sans-serif; background:var(--bg);}
|
| 289 |
-
.gradio-container{max-width:
|
| 290 |
-
.top
|
| 291 |
-
.panel{background:var(--panel);border:1px solid var(--border);border-radius:
|
| 292 |
-
.small{opacity:.
|
| 293 |
-
|
| 294 |
-
.button-primary
|
| 295 |
-
.button-primary
|
| 296 |
-
.upload-like{border:2px dashed #
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
.q-
|
| 306 |
-
.q-
|
| 307 |
-
.q-
|
|
|
|
|
|
|
| 308 |
.opts{display:flex;flex-direction:column;gap:8px}
|
| 309 |
-
.opt{display:flex;gap:10px;align-items:center;background:#
|
| 310 |
-
.opt input{accent-color:var(--
|
| 311 |
-
.opt-letter{display:inline-flex;width:28px;height:28px;border-radius:8px;background:#
|
| 312 |
-
.opt-text{color:#
|
| 313 |
-
.result-card{background:#121212;border:1px solid #2a2a2a;border-radius:16px;padding:16px;margin-top:18px}
|
| 314 |
"""
|
| 315 |
|
| 316 |
-
# ---------- واجهة Gradio ----------
|
| 317 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 318 |
-
gr.Markdown("<h2 class='top
|
| 319 |
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
text_area = gr.Textbox(lines=6, placeholder="ألصق هنا مقطع نصي...", label="أدخل نصًا
|
| 324 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 325 |
-
file_comp = gr.File(label="اختر ملف PDF
|
| 326 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 327 |
-
with gr.Accordion("خيارات متقدمة (
|
| 328 |
trocr_model = gr.Dropdown(
|
| 329 |
choices=[
|
| 330 |
"microsoft/trocr-base-printed",
|
|
@@ -338,41 +328,78 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 338 |
btn_build = gr.Button("توليد الأسئلة", elem_classes=["button-primary"])
|
| 339 |
toast = gr.Markdown("", elem_classes=["small"])
|
| 340 |
|
| 341 |
-
# حالة عامة + مكان عرض الاختبار + إرساله
|
| 342 |
state = gr.State(None)
|
| 343 |
-
quiz_html = gr.HTML("")
|
| 344 |
btn_submit = gr.Button("إنهاء وإرسال الإجابات", elem_classes=["button-primary"])
|
| 345 |
answers_box = gr.Textbox(visible=False)
|
| 346 |
score_md = gr.Markdown("")
|
| 347 |
-
|
| 348 |
|
| 349 |
-
# توليد
|
| 350 |
btn_build.click(
|
| 351 |
build_quiz,
|
| 352 |
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom],
|
| 353 |
outputs=[state, quiz_html, toast]
|
| 354 |
)
|
| 355 |
|
| 356 |
-
# JS
|
| 357 |
js_collect = """
|
| 358 |
function () {
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
const
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
const
|
| 365 |
-
|
|
|
|
|
|
|
| 366 |
});
|
| 367 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
}
|
| 369 |
"""
|
| 370 |
|
| 371 |
-
# Submit:
|
|
|
|
| 372 |
btn_submit.click(
|
| 373 |
-
None, inputs=None, outputs=[answers_box], js=js_collect
|
|
|
|
|
|
|
|
|
|
| 374 |
).then(
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
)
|
| 377 |
|
| 378 |
if __name__ == "__main__":
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
+
# واجهة حديثة ثابتة: كل الأسئلة دفعة واحدة + منع الإرسال قبل الإجابة على الجميع
|
|
|
|
| 3 |
import os, json, uuid, random, unicodedata
|
| 4 |
from dataclasses import dataclass
|
| 5 |
from pathlib import Path
|
|
|
|
| 12 |
import yake
|
| 13 |
import gradio as gr
|
| 14 |
|
| 15 |
+
# ------------------ إعدادات عامة ------------------
|
| 16 |
random.seed(42)
|
|
|
|
| 17 |
DEFAULT_NUM_QUESTIONS = 6
|
| 18 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
| 19 |
DEFAULT_TROCR_ZOOM = 2.6
|
| 20 |
|
| 21 |
+
# ------------------ OCR (تحميل كسول) ------------------
|
| 22 |
_OCR = {}
|
| 23 |
def get_ocr(model_id: str):
|
| 24 |
from transformers import pipeline
|
|
|
|
| 28 |
_OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
|
| 29 |
return _OCR[model_id]
|
| 30 |
|
| 31 |
+
# ------------------ PDF/TXT → نص ------------------
|
| 32 |
def extract_text_with_pypdf(path: str) -> str:
    """Return the embedded text of the PDF at *path*, one page per line group.

    Pages are read in document order and joined with newlines; the joined
    result is stripped of leading/trailing whitespace. A page whose
    ``extract_text()`` raises, or returns a falsy value, contributes an
    empty string so one unreadable page does not abort the document.
    """
    def _page_text(page) -> str:
        # Deliberate best-effort: any per-page failure degrades to "".
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    return "\n".join(_page_text(page) for page in PdfReader(path).pages).strip()
|
| 42 |
|
| 43 |
def pdf_to_images(path: str, zoom: float = 2.5) -> List[Image.Image]:
    """Render every page of the PDF at *path* to a PIL image.

    Args:
        path: Location of the PDF file to open.
        zoom: Scale factor applied on both axes when rasterising a page.

    Returns:
        One ``Image`` per page, in document order.

    The document handle is closed in a ``finally`` clause so it is released
    even when rendering one of the pages raises.
    """
    doc = fitz.open(path)
    try:
        matrix = fitz.Matrix(zoom, zoom)
        imgs = []
        for pg in doc:
            # alpha=False: no transparency channel is requested.
            # NOTE(review): the "RGB" mode below assumes 3 bytes per pixel in
            # pix.samples — confirm for non-RGB colourspaces.
            pix = pg.get_pixmap(matrix=matrix, alpha=False)
            imgs.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return imgs
    finally:
        doc.close()
|
| 52 |
|
|
|
|
| 56 |
for i, img in enumerate(pdf_to_images(path, zoom=zoom), start=1):
|
| 57 |
try:
|
| 58 |
out = ocr(img)
|
| 59 |
+
txt = out[0].get("generated_text", "").strip() if out else ""
|
| 60 |
except Exception:
|
| 61 |
txt = ""
|
| 62 |
parts.append(f"--- [Page {i}] ---\n{txt}")
|
|
|
|
| 65 |
def is_good(t: str, min_chars=250, min_alpha=0.15) -> bool:
    """Decide whether extracted text *t* looks usable.

    Text is rejected when it is shorter than *min_chars* characters, or when
    the share of alphanumeric characters (``str.isalnum``) in it is below
    *min_alpha*.
    """
    length = len(t)
    if length < min_chars:
        return False
    alnum_count = sum(1 for ch in t if ch.isalnum())
    # max(1, ...) guards the division when min_chars allows an empty string.
    return alnum_count / max(1, length) >= min_alpha
|
| 69 |
|
| 70 |
+
def file_to_text(path: str, model_id=DEFAULT_TROCR_MODEL, zoom=DEFAULT_TROCR_ZOOM) -> Tuple[str, str]:
    """Turn the file at *path* into text and report how it was obtained.

    Returns a ``(text, method)`` pair:

    * ``.txt`` files (case-insensitive suffix) are read as UTF-8 with
      undecodable bytes ignored; method is ``"plain text"``.
    * Anything else is first read via ``extract_text_with_pypdf``; if
      ``is_good`` accepts that text, method is ``"embedded (pypdf)"``.
    * Otherwise the file is passed to ``extract_text_with_ocr`` with
      *model_id* and *zoom*; method is ``"OCR (TrOCR)"``.
    """
    if Path(path).suffix.lower() == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            content = fh.read()
        return content, "plain text"

    embedded = extract_text_with_pypdf(path)
    if not is_good(embedded):
        # Embedded text layer is missing or too sparse: fall back to OCR.
        return extract_text_with_ocr(path, model_id, zoom), "OCR (TrOCR)"
    return embedded, "embedded (pypdf)"
|
| 78 |
|
| 79 |
+
# ------------------ تنظيف عربي مبسّط ------------------
|
| 80 |
AR_DIAC = r"[ًٌٍَُِّْ]"
|
| 81 |
+
def strip_headers(t: str) -> str:
|
| 82 |
+
out = []
|
| 83 |
for ln in t.splitlines():
|
| 84 |
if re2.match(r"^\s*--- \[Page \d+\] ---\s*$", ln): continue
|
| 85 |
if re2.match(r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$", ln): continue
|
|
|
|
| 87 |
out.append(ln)
|
| 88 |
return "\n".join(out)
|
| 89 |
|
| 90 |
+
def norm_ar(t: str) -> str:
    """Normalise Arabic text *t* and return the stripped result.

    After NFKC normalisation the following substitutions run in order:
    remove the tatweel character, remove the diacritics in ``AR_DIAC``,
    unify the alef variants, unify the yeh variants, collapse whitespace
    runs to one space, and collapse any run of a repeated letter to a
    single letter.
    """
    # Order matters: each rule sees the output of the previous one.
    rewrites = (
        (r"[ـ]", ""),
        (AR_DIAC, ""),
        (r"[إأآا]", "ا"),
        (r"[يى]", "ي"),
        (r"\s+", " "),
        # NOTE(review): the next two rules also shorten words that
        # legitimately contain a doubled letter — confirm this is intended.
        (r'(\p{L})\1{2,}', r'\1'),
        (r'(\p{L})\1', r'\1'),
    )
    text = unicodedata.normalize("NFKC", t)
    for pattern, replacement in rewrites:
        text = re2.sub(pattern, replacement, text)
    return text.strip()
|
| 100 |
|
| 101 |
+
def postprocess(raw: str) -> str:
    """Clean raw extracted text and return it normalised by ``norm_ar``.

    Steps, in order: drop header lines via ``strip_headers``, turn carriage
    returns into newlines, squeeze three or more newlines to two, blank out
    a number followed by a bracketed/parenthesised group, blank out
    ``[digits]`` markers, then apply ``norm_ar``.
    """
    text = strip_headers(raw)
    text = text.replace("\r", "\n")
    for pattern, replacement in (
        (r"\n{3,}", "\n\n"),
        (r"\d+\s*[\[\(][^\]\)]*[\]\)]", " "),
        (r"\[\d+\]", " "),
    ):
        text = re2.sub(pattern, replacement, text)
    return norm_ar(text)
|
| 107 |
|
| 108 |
+
# ------------------ توليد أسئلة ------------------
|
| 109 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 110 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
@dataclass
class MCQ:
    """One generated multiple-choice question."""

    # Short identifier assigned by the generator.
    id: str
    # Question text; make_mcqs writes the blanked-out sentence here.
    question: str
    # Candidate answers in display order.
    choices: List[str]
    # Position of the correct answer inside ``choices``.
    answer_index: int
|
|
|
|
| 118 |
|
| 119 |
+
def split_sents(t: str) -> List[str]:
    """Split *t* with ``SENT_SPLIT`` and keep the usable sentences.

    Each piece is stripped; pieces shorter than 25 characters (which
    includes empty ones) are discarded. Order is preserved.
    """
    sentences = []
    for piece in SENT_SPLIT.split(t):
        piece = piece.strip()
        if len(piece) >= 25:
            sentences.append(piece)
    return sentences
|
| 122 |
+
|
| 123 |
+
def yake_keywords(t: str, k: int = 160) -> List[str]:
    """Extract up to *k* single-word keywords from *t* with YAKE.

    Candidates are stripped and kept, in extractor order, unless they are
    empty, already seen, listed in ``AR_STOP``, shorter than three
    characters, or made up only of punctuation/symbol characters.
    """
    extractor = yake.KeywordExtractor(lan='ar', n=1, top=k)
    kept: List[str] = []
    seen = set()
    for candidate, _score in extractor.extract_keywords(t):
        word = candidate.strip()
        if not word or word in seen or word in AR_STOP:
            continue
        if len(word) < 3:
            continue
        if re2.match(r"^[\p{P}\p{S}]+$", word):
            continue
        seen.add(word)
        kept.append(word)
    return kept
|
| 133 |
+
|
| 134 |
+
def good_kw(kw: str) -> bool:
    """Return True when *kw* is acceptable as a question keyword.

    A keyword is rejected when it is empty, shorter than three characters,
    listed in ``AR_STOP``, or consists solely of punctuation, symbols,
    digits and underscores.

    Always returns a real ``bool``; the previous ``and`` chain could hand
    back the falsy operand itself (for example ``""``) despite the
    annotation.
    """
    if not kw or len(kw) < 3 or kw in AR_STOP:
        return False
    return re2.match(r"^[\p{P}\p{S}\d_]+$", kw) is None
|
| 136 |
|
| 137 |
+
def distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
    """Pick *k* wrong answers for *correct* from *pool*.

    Args:
        correct: The right answer; compared after stripping whitespace.
        pool: Candidate words; each is stripped before being considered.
        k: Number of distractors to return.

    Returns:
        A list of exactly *k* strings. Candidates are unique, differ from
        *correct*, are not in ``AR_STOP``, are not purely
        punctuation/symbols/digits/underscores, and are within three
        characters of the length of *correct*. They are shuffled with the
        module-level ``random`` generator. If fewer than *k* qualify, the
        list is padded with "—".
    """
    target = correct.strip()
    target_len = len(target)
    # Seeding with the stripped answer fixes two defects: a whitespace-padded
    # `correct` could reappear as a distractor, and repeated pool entries
    # could produce duplicate choices.
    seen = {target}
    cand = []
    for w in pool:
        w = w.strip()
        if not w or w in seen or w in AR_STOP:
            continue
        if re2.match(r"^[\p{P}\p{S}\d_]+$", w):
            continue
        if abs(len(w) - target_len) <= 3:
            seen.add(w)
            cand.append(w)
    random.shuffle(cand)
    out = cand[:k]
    # Pad so callers always receive k entries.
    out.extend("—" for _ in range(k - len(out)))
    return out
|
| 148 |
|
| 149 |
+
def make_mcqs(text: str, n: int = 6) -> List[MCQ]:
    """Build up to *n* fill-in-the-blank questions from *text*.

    Args:
        text: Cleaned source text.
        n: Maximum number of questions to generate.

    Returns:
        A list of ``MCQ`` items. Each question is a sentence with the first
        whole-word occurrence of its keyword replaced by "_____", offered
        with three distractors plus the keyword in shuffled order.

    Raises:
        ValueError: ``split_sents`` found no usable sentence.
        RuntimeError: no keyword could be matched to a sentence.
    """
    # Function-scope import so this block does not depend on the header.
    from collections import Counter

    sents = split_sents(text)
    if not sents:
        raise ValueError("النص قصير أو غير صالح.")

    kws = yake_keywords(text)
    if not kws:
        # Fallback: the 80 most frequent distinct tokens. The earlier version
        # kept one entry per occurrence and called text.count() for each,
        # which was quadratic and flooded the list with duplicates.
        tokens = re2.findall(r"[\p{L}\p{N}_]+", text)
        kws = [w for w, _ in Counter(tokens).most_common(80)]

    # Map each usable keyword to the first sentence containing it as a whole
    # word (not flanked by letters). One compiled pattern per keyword, reused
    # below for the blanking step.
    patterns = {}
    sent_for = {}
    for kw in kws:
        if kw in sent_for or not good_kw(kw):
            continue
        pat = re2.compile(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})")
        hit = next((s for s in sents if pat.search(s)), None)
        if hit is not None:
            patterns[kw] = pat
            sent_for[kw] = hit

    items, used = [], set()
    for kw in kws:
        if len(items) >= n:
            break
        s = sent_for.get(kw)
        # Each sentence backs at most one question.
        if s is None or s in used:
            continue
        q = patterns[kw].sub("_____", s, count=1)
        ch = distractors(kw, [x for x in kws if x != kw], 3) + [kw]
        random.shuffle(ch)
        ans = ch.index(kw)
        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
        used.add(s)

    if not items:
        raise RuntimeError("تعذّر توليد أسئلة.")
    return items
|
| 170 |
|
| 171 |
+
# ------------------ تحويل إلى سجلات ------------------
|
| 172 |
+
def to_records(items: List[MCQ]) -> List[dict]:
    """Convert ``MCQ`` items into plain dict records.

    Each record has ``id``, ``question`` (stripped) and ``options``. There
    are always four options labelled A-D; a missing or empty choice is shown
    as "—". Latin comma, question mark and semicolon in an option are
    swapped for their Arabic counterparts, and ``is_correct`` marks the
    option whose position equals ``answer_index``.
    """
    labels = ("A", "B", "C", "D")
    to_arabic_punct = str.maketrans({",": "،", "?": "؟", ";": "؛"})

    records = []
    for item in items:
        options = []
        for idx, label in enumerate(labels):
            raw = item.choices[idx] if idx < len(item.choices) else "—"
            shown = raw.strip().translate(to_arabic_punct)
            options.append({
                "id": label,
                "text": shown or "—",
                "is_correct": idx == item.answer_index,
            })
        records.append({
            "id": item.id,
            "question": item.question.strip(),
            "options": options,
        })
    return records
|
| 182 |
|
| 183 |
+
# ------------------ HTML للامتحان (كل الأسئلة دفعة واحدة) ------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
def render_quiz_html(records: List[dict]) -> str:
    """Render all quiz records as one HTML fragment.

    Args:
        records: Dicts with ``id``, ``question`` and ``options``; each
            option is a dict with ``id`` and ``text``.

    Returns:
        HTML holding a progress pill, one ``.q-card`` per record (numbered
        from 1, with a hidden ``b_<id>`` badge and a radio group named
        ``q_<id>``), and an inline script that keeps the answered counter
        up to date.

    Question and option text comes from user documents, so every
    interpolated value is HTML-escaped; previously markup in the source
    text was injected into the page verbatim.
    """
    # Function-scope import so this block does not depend on the header.
    from html import escape

    cards = []
    for i, rec in enumerate(records, start=1):
        qid = escape(str(rec["id"]))
        qtxt = escape(str(rec["question"]))
        opts_html = []
        for o in rec["options"]:
            lid = escape(str(o["id"]))
            txt = escape(str(o["text"]))
            opts_html.append(f"""
        <label class="opt">
          <input type="radio" name="q_{qid}" value="{lid}">
          <span class="opt-letter">{lid}</span>
          <span class="opt-text">{txt}</span>
        </label>
        """)
        cards.append(f"""
    <div class="q-card" data-qid="{qid}">
      <div class="q-header">
        <div class="q-title">السؤال {i}</div>
        <div class="q-badge" id="b_{qid}" hidden></div>
      </div>
      <div class="q-text">{qtxt}</div>
      <div class="opts">{''.join(opts_html)}</div>
    </div>
    """)

    # Simple progress counter shown above the list of questions.
    header = f"""
    <div id="quiz" class="quiz-wrap">
      <div class="progress-pill"><span id="ans_count">0</span>/<span id="total">{len(records)}</span> تمّت الإجابة</div>
      {''.join(cards)}
    </div>
    """
    # Plain (non-f) string so the JavaScript braces need no escaping.
    # NOTE(review): whether this inline script executes depends on how the
    # HTML component injects markup — confirm in the target Gradio version.
    script = """
    <script>
      // تحديث العدّاد كلما تغيرت إجابة
      const updateCounter = () => {
        const cards = document.querySelectorAll('.q-card');
        let filled = 0;
        cards.forEach(c => {
          if (c.querySelector('input[type="radio"]:checked')) filled += 1;
        });
        const el = document.getElementById('ans_count');
        if (el) el.textContent = String(filled);
      };
      document.querySelectorAll('.q-card input[type="radio"]').forEach(i => i.addEventListener('change', updateCounter));
      updateCounter();
    </script>
    """
    return header + script
|
| 230 |
|
| 231 |
+
# ------------------ بناء الامتحان ------------------
|
| 232 |
def build_quiz(text_area, file_path, n, model_id, zoom):
    """Gradio handler: build a quiz from pasted text or an uploaded file.

    Args:
        text_area: Text typed by the user; used in preference to the file
            when it is non-blank.
        file_path: Path of the uploaded file, or a falsy value.
        n: Requested number of questions (coerced with ``int``).
        model_id: OCR model identifier forwarded to ``file_to_text``.
        zoom: Render zoom forwarded to ``file_to_text`` (coerced to float).

    Returns:
        ``(state, quiz_html, message)``. On success ``state`` holds the
        records and the extraction method. When no input is supplied, or
        question generation reports a problem, ``state`` is ``None``, the
        HTML is empty and ``message`` explains why.
    """
    txt = (text_area or "").strip()
    if not txt and not file_path:
        return None, "", "🛈 أدخل نصًا أو ارفع ملفًا."

    if txt:
        raw, method = txt, "user text"
    else:
        raw, method = file_to_text(file_path, model_id=model_id, zoom=float(zoom))

    cleaned = postprocess(raw)
    count = int(n)
    try:
        items = make_mcqs(cleaned, n=count)
    except (ValueError, RuntimeError) as exc:
        # make_mcqs signals "text too short" / "no questions" with these
        # types; show its message in the toast instead of letting the
        # exception escape the handler.
        return None, "", f"⚠️ {exc}"

    recs = to_records(items)
    state = {"records": recs, "method": method}
    return state, render_quiz_html(recs), f"تم توليد {len(recs)} سؤالًا. أجب عن جميعها ثم اضغط إرسال."
|
|
|
|
| 245 |
|
| 246 |
+
# ------------------ التصحيح (يعيد الدرجة + خريطة الصحة) ------------------
|
| 247 |
def grade(state, answers_json):
    """Grade submitted answers against the quiz stored in *state*.

    Args:
        state: Dict with a ``"records"`` list, or a falsy value when no
            quiz exists.
        answers_json: JSON object text mapping question id to the chosen
            option id; a falsy value is treated as ``"{}"``.

    Returns:
        ``(score_markdown, correctness_json)``. ``correctness_json`` maps
        each question id to true/false. It is ``"{}"`` when the payload
        cannot be read or when any question is unanswered; in those cases
        the first element is the corresponding message.
    """
    unreadable = ("حدث خطأ في قراءة الإجابات.", "{}")
    try:
        user_map = json.loads(answers_json or "{}")
    except (TypeError, ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError; TypeError covers non-text
        # input; RecursionError covers absurdly nested payloads.
        return unreadable
    if not isinstance(user_map, dict):
        # Valid JSON that is not an object ("null", a list, ...) used to
        # crash on .get below.
        return unreadable

    recs = state["records"] if state else []
    total = len(recs)

    # Every question must have a non-empty answer before grading.
    if any(not user_map.get(r["id"]) for r in recs):
        return "⚠️ يجب الإجابة على جميع الأسئلة قبل الإرسال.", "{}"

    correctness = {}
    correct = 0
    for rec in recs:
        qid = rec["id"]
        chosen = user_map.get(qid)
        cor = next((o["id"] for o in rec["options"] if o["is_correct"]), "")
        ok = (chosen == cor)
        correctness[qid] = ok
        if ok:
            correct += 1

    score_text = f"### نتيجتك: **{correct} / {total}**"
    return score_text, json.dumps(correctness, ensure_ascii=False)
|
| 269 |
+
|
| 270 |
+
# =======================================================
|
| 271 |
+
# واجهة
|
| 272 |
+
# =======================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
CSS = """
|
| 274 |
:root{
|
| 275 |
+
--bg:#0e0e11; --panel:#15161a; --card:#1a1b20; --muted:#a7b0be;
|
| 276 |
+
--text:#f6f7fb; --accent:#6ee7b7; --accent2:#34d399; --danger:#ef4444; --border:#262833;
|
| 277 |
}
|
| 278 |
body{direction:rtl; font-family:system-ui,'Cairo','IBM Plex Arabic',sans-serif; background:var(--bg);}
|
| 279 |
+
.gradio-container{max-width:1000px;margin:0 auto;padding:12px 12px 40px;}
|
| 280 |
+
h2.top{color:#eaeaf2;margin:4px 0 16px}
|
| 281 |
+
.panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:14px;box-shadow:0 16px 38px rgba(0,0,0,.35)}
|
| 282 |
+
.small{opacity:.9;color:#d9dee8}
|
| 283 |
+
|
| 284 |
+
.button-primary>button{background:linear-gradient(180deg,var(--accent),var(--accent2));border:none;color:#0b0d10;font-weight:800}
|
| 285 |
+
.button-primary>button:hover{filter:brightness(.95)}
|
| 286 |
+
.upload-like{border:2px dashed #3b3f52;background:#121318;border-radius:12px;padding:10px;color:#cfd5e3}
|
| 287 |
+
|
| 288 |
+
textarea{min-height:120px}
|
| 289 |
+
|
| 290 |
+
/* الامتحان */
|
| 291 |
+
.progress-pill{display:inline-block;background:#0f1116;border:1px solid #2a2d3a;border-radius:999px;padding:6px 12px;color:#cfd5e3;margin:10px 0}
|
| 292 |
+
.q-card{background:var(--card);border:1px solid var(--border);border-radius:14px;padding:14px;margin:12px 0}
|
| 293 |
+
.q-header{display:flex;gap:10px;align-items:center;justify-content:space-between;margin-bottom:6px}
|
| 294 |
+
.q-title{color:#eaeaf2;font-weight:800}
|
| 295 |
+
.q-badge{padding:8px 12px;border-radius:10px;font-weight:700}
|
| 296 |
+
.q-badge.ok{background:#083a2a;color:#b6f4db;border:1px solid #145b44}
|
| 297 |
+
.q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
|
| 298 |
+
|
| 299 |
+
.q-text{color:var(--text);font-size:1.06rem;line-height:1.8;margin:8px 0 12px}
|
| 300 |
.opts{display:flex;flex-direction:column;gap:8px}
|
| 301 |
+
.opt{display:flex;gap:10px;align-items:center;background:#14161c;border:1px solid #2a2d3a;border-radius:12px;padding:10px}
|
| 302 |
+
.opt input{accent-color:var(--accent2)}
|
| 303 |
+
.opt-letter{display:inline-flex;width:28px;height:28px;border-radius:8px;background:#0f1116;border:1px solid #2a2d3a;align-items:center;justify-content:center;font-weight:800;color:#dfe6f7}
|
| 304 |
+
.opt-text{color:#eaeaf2}
|
|
|
|
| 305 |
"""
|
| 306 |
|
|
|
|
| 307 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 308 |
+
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 309 |
|
| 310 |
+
with gr.Group(elem_classes=["panel"]):
|
| 311 |
+
gr.Markdown("**أدخل نصًا أو ارفع ملفًا، حدّد عدد الأسئلة، ثم اضغط توليد.**\
|
| 312 |
+
<br>يجب الإجابة على <u>جميع</u> الأسئلة قبل الإرسال.", elem_classes=["small"])
|
| 313 |
+
text_area = gr.Textbox(lines=6, placeholder="ألصق هنا مقطع نصي...", label="أدخل نصًا")
|
| 314 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 315 |
+
file_comp = gr.File(label="أو اختر ملف PDF/TXT", file_count="single",
|
| 316 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 317 |
+
with gr.Accordion("خيارات متقدمة (PDF مصوّر)", open=False):
|
| 318 |
trocr_model = gr.Dropdown(
|
| 319 |
choices=[
|
| 320 |
"microsoft/trocr-base-printed",
|
|
|
|
| 328 |
btn_build = gr.Button("توليد الأسئلة", elem_classes=["button-primary"])
|
| 329 |
toast = gr.Markdown("", elem_classes=["small"])
|
| 330 |
|
|
|
|
| 331 |
state = gr.State(None)
|
| 332 |
+
quiz_html = gr.HTML("") # مكان عرض جميع الأسئلة دفعة واحدة
|
| 333 |
btn_submit = gr.Button("إنهاء وإرسال الإجابات", elem_classes=["button-primary"])
|
| 334 |
answers_box = gr.Textbox(visible=False)
|
| 335 |
score_md = gr.Markdown("")
|
| 336 |
+
correctness_box = gr.Textbox(visible=False) # نستقبل فيها خريطة الصحة لتلوين الواجهة
|
| 337 |
|
| 338 |
+
# توليد الامتحان
|
| 339 |
btn_build.click(
|
| 340 |
build_quiz,
|
| 341 |
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom],
|
| 342 |
outputs=[state, quiz_html, toast]
|
| 343 |
)
|
| 344 |
|
| 345 |
+
# JS: جمع الإجابات + المنع إن كان هناك سؤال غير مُجاب
|
| 346 |
js_collect = """
|
| 347 |
function () {
|
| 348 |
+
// عدّاد وإجابات
|
| 349 |
+
const cards = Array.from(document.querySelectorAll('.q-card'));
|
| 350 |
+
const map = {};
|
| 351 |
+
let missing = 0;
|
| 352 |
+
cards.forEach(c => {
|
| 353 |
+
const qid = c.getAttribute('data-qid');
|
| 354 |
+
const chosen = c.querySelector('input[type="radio"]:checked');
|
| 355 |
+
if (!chosen) { missing += 1; }
|
| 356 |
+
map[qid] = chosen ? chosen.value : null;
|
| 357 |
});
|
| 358 |
+
if (missing > 0) {
|
| 359 |
+
// أظهر رسالة منع بسيطة قرب أعلى الاختبار
|
| 360 |
+
let pill = document.querySelector('.progress-pill');
|
| 361 |
+
if (pill) {
|
| 362 |
+
pill.style.borderColor = '#6a1e2b';
|
| 363 |
+
pill.style.color = '#ffd1d6';
|
| 364 |
+
pill.textContent = `لا يمكن الإرسال: ${missing} سؤال/أسئلة بدون إجابة`;
|
| 365 |
+
}
|
| 366 |
+
return ["", ""]; // لا نرسل شيئًا للتصحيح
|
| 367 |
+
}
|
| 368 |
+
return [JSON.stringify(map), "go"]; // go = سمح بالإرسال
|
| 369 |
}
|
| 370 |
"""
|
| 371 |
|
| 372 |
+
# عند الضغط Submit:
|
| 373 |
+
# 1) اجمع الإجابات (JS). إذا لم يجب على الجميع، لن نرسل للتصحيح.
|
| 374 |
btn_submit.click(
|
| 375 |
+
None, inputs=None, outputs=[answers_box, correctness_box], js=js_collect
|
| 376 |
+
).then(
|
| 377 |
+
# 2) صحّح فقط إذا وُجدت إجابات (answers_box غير فارغ)
|
| 378 |
+
grade, inputs=[state, answers_box], outputs=[score_md, correctness_box]
|
| 379 |
).then(
|
| 380 |
+
# 3) لون الواجهة بالصح/الخطأ (Correct!/Incorrect.) بدون تعليل
|
| 381 |
+
None, inputs=[correctness_box], outputs=None,
|
| 382 |
+
js="""
|
| 383 |
+
(correctness_json) => {
|
| 384 |
+
if (!correctness_json) return;
|
| 385 |
+
let okmap = {};
|
| 386 |
+
try { okmap = JSON.parse(correctness_json); } catch(e){ return; }
|
| 387 |
+
Object.entries(okmap).forEach(([qid, ok]) => {
|
| 388 |
+
const badge = document.getElementById('b_'+qid);
|
| 389 |
+
if (!badge) return;
|
| 390 |
+
badge.hidden = false;
|
| 391 |
+
if (ok) {
|
| 392 |
+
badge.classList.remove('err'); badge.classList.add('ok');
|
| 393 |
+
badge.textContent = 'Correct!';
|
| 394 |
+
} else {
|
| 395 |
+
badge.classList.remove('ok'); badge.classList.add('err');
|
| 396 |
+
badge.textContent = 'Incorrect.';
|
| 397 |
+
}
|
| 398 |
+
});
|
| 399 |
+
const el = document.querySelector('.progress-pill');
|
| 400 |
+
if (el) { el.style.borderColor = '#2a2d3a'; el.style.color = '#cfd5e3'; }
|
| 401 |
+
}
|
| 402 |
+
"""
|
| 403 |
)
|
| 404 |
|
| 405 |
if __name__ == "__main__":
|