# -*- coding: utf-8 -*-
# Question Generator — Final Publishable Build (Lite/Full)
# Fixed pages + a real per-question Submit + keeps the input page's dimensions from shifting
# "Fill-in-the-blank" mode + "direct comprehension" mode (mT5) with fallbacks, difficulty levels,
# BM25, strong distractor filtering, and paragraph-level diversification.
import os, json, uuid, random, unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Optional

from PIL import Image
from pypdf import PdfReader
import fitz  # PyMuPDF
import regex as re2
import yake
import gradio as gr

# ------------------ General settings ------------------
random.seed(42)

DEFAULT_NUM_QUESTIONS = 6
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
DEFAULT_TROCR_ZOOM = 2.6

QUESTION_MODES = ["فراغ", "فهم مباشر"]      # fill-in-the-blank, direct comprehension
DIFFICULTY_MODES = ["سهل", "متوسط", "صعب"]  # easy, medium, hard

# BM25 (optional)
try:
    from rank_bm25 import BM25Okapi
    _HAS_BM25 = True
except Exception:
    _HAS_BM25 = False
# ------------------ OCR (lazy loading) ------------------
_OCR = {}

def get_ocr(model_id: str):
    try:
        from transformers import pipeline
        import torch
        dev = 0 if torch.cuda.is_available() else -1
        if model_id not in _OCR:
            _OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
        return _OCR[model_id]
    except Exception:
        # Safe fallback: a callable that returns empty text
        return lambda im: [{"generated_text": ""}]
# ------------------ PDF/TXT → text ------------------
def extract_text_with_pypdf(path: str) -> str:
    reader = PdfReader(path)
    out = []
    for p in reader.pages:
        try:
            t = p.extract_text() or ""
        except Exception:
            t = ""
        out.append(t)
    return "\n".join(out).strip()

def pdf_to_images(path: str, zoom: float = 2.5) -> List[Image.Image]:
    doc = fitz.open(path)
    M = fitz.Matrix(zoom, zoom)
    imgs = []
    for pg in doc:
        pix = pg.get_pixmap(matrix=M, alpha=False)
        imgs.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    doc.close()
    return imgs

def extract_text_with_ocr(path: str, model_id: str, zoom: float) -> str:
    ocr = get_ocr(model_id)
    parts = []
    for i, img in enumerate(pdf_to_images(path, zoom=zoom), start=1):
        try:
            out = ocr(img)
            txt = out[0].get("generated_text", "").strip() if out else ""
        except Exception:
            txt = ""
        parts.append(f"--- [Page {i}] ---\n{txt}")
    return "\n\n".join(parts).strip()

def is_good(t: str, min_chars=250, min_alpha=0.15) -> bool:
    if len(t) < min_chars:
        return False
    alnum = sum(ch.isalnum() for ch in t)
    return (alnum / max(1, len(t))) >= min_alpha

def file_to_text(path: str, model_id=DEFAULT_TROCR_MODEL, zoom=DEFAULT_TROCR_ZOOM) -> Tuple[str, str]:
    ext = Path(path).suffix.lower()
    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read(), "plain text"
    raw = extract_text_with_pypdf(path)
    if is_good(raw):
        return raw, "embedded (pypdf)"
    return extract_text_with_ocr(path, model_id, zoom), "OCR (TrOCR)"
# ------------------ Arabic text cleanup ------------------
AR_DIAC = r"[ًٌٍَُِّْ]"

def strip_headers(t: str) -> str:
    out = []
    for ln in t.splitlines():
        if re2.match(r"^\s*--- \[Page \d+\] ---\s*$", ln): continue
        if re2.match(r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$", ln): continue
        if re2.match(r"^\s*[-–—_*]{3,}\s*$", ln): continue
        out.append(ln)
    return "\n".join(out)

def norm_ar(t: str) -> str:
    t = unicodedata.normalize("NFKC", t)
    t = re2.sub(r"[ـ]", "", t)              # remove tatweel
    t = re2.sub(AR_DIAC, "", t)             # strip diacritics
    t = re2.sub(r"[إأآا]", "ا", t)          # unify alef variants
    t = re2.sub(r"[يى]", "ي", t)            # unify yaa / alef maqsura
    t = re2.sub(r"\s+", " ", t)
    t = re2.sub(r'(\p{L})\1{2,}', r'\1', t)  # collapse letters repeated 3+ times
    t = re2.sub(r'(\p{L})\1', r'\1', t)      # collapse doubled letters (OCR artifacts)
    return t.strip()

def postprocess(raw: str) -> str:
    t = strip_headers(raw).replace("\r", "\n")
    t = re2.sub(r"\n{3,}", "\n\n", t)
    t = re2.sub(r"\d+\s*[\[\(][^\]\)]*[\]\)]", " ", t)  # drop numbered citation markers
    t = re2.sub(r"\[\d+\]", " ", t)
    return norm_ar(t)
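# Illustrative example (added, not from the original source):
#   norm_ar("الكِتَابُ الجَدِيدُ")  ->  "الكتاب الجديد"
# i.e. diacritics stripped, alef/yaa variants unified, and whitespace collapsed.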
# ------------------ Question structure ------------------
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())

@dataclass
class MCQ:
    id: str
    question: str
    choices: List[str]
    answer_index: int

def split_sents(t: str) -> List[str]:
    s = [x.strip() for x in SENT_SPLIT.split(t) if x.strip()]
    return [x for x in s if len(x) >= 25]
# ====== (1) Keyphrases (YAKE) ======
def yake_keywords(t: str, k: int = 260) -> List[str]:
    phrases = []
    seen = set()
    for n in [3, 2, 1]:
        try:
            ex = yake.KeywordExtractor(lan='ar', n=n, top=k)
            pairs = ex.extract_keywords(t)
        except Exception:
            pairs = []
        for w, _ in pairs:
            w = re2.sub(r"\s+", " ", w.strip())
            if not w or w in seen:
                continue
            if re2.match(r"^[\p{P}\p{S}\d_]+$", w):
                continue
            if 2 <= len(w) <= 40:
                phrases.append(w)
                seen.add(w)
    return phrases

def good_kw(kw: str) -> bool:
    return kw and len(kw) >= 2 and kw not in AR_STOP and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
# ====== POS/NER (optional) ======
_HAS_CAMEL = False
try:
    from camel_tools.morphology.analyzer import Analyzer
    from camel_tools.ner import NERecognizer
    _HAS_CAMEL = True
    _AN = Analyzer.builtin_analyzer()
    _NER = NERecognizer.pretrained()
except Exception:
    _HAS_CAMEL = False

NER_TAGS = {"PER", "LOC", "ORG", "MISC"}

def ar_pos(word: str) -> str:
    if not _HAS_CAMEL:
        # Heuristic fallback when camel_tools is unavailable
        if re2.match(r"^(في|على|الى|إلى|من|عن|حتى|ثم|بل|لكن|أو|و)$", word): return "PART"
        if re2.match(r"^[\p{N}]+$", word): return "NUM"
        if re2.search(r"(ة|ات|ون|ين|ان)$", word): return "NOUN"
        return "X"
    try:
        ana = _AN.analyze(word)
        if not ana:
            return "X"
        from collections import Counter
        pos_candidates = [a.get('pos', 'X') for a in ana]
        return Counter(pos_candidates).most_common(1)[0][0] if pos_candidates else "X"
    except Exception:
        return "X"

def is_named_entity(token: str) -> bool:
    if not _HAS_CAMEL:
        return False
    try:
        tag = _NER.predict_sentence([token])[0]
        return tag in NER_TAGS
    except Exception:
        return False

def is_clean_sentence(s: str) -> bool:
    if not (60 <= len(s) <= 240): return False
    if re2.search(r"https?://|www\.", s): return False
    if re2.search(r"\d{2,}", s): return False
    return True

def safe_keyword(k: str) -> bool:
    if not good_kw(k): return False
    if is_named_entity(k): return False
    if ar_pos(k) in {"PRON", "PART"}: return False
    return True
# ====== Embeddings / Masking / Cross-Encoder (optional) ======
_EMB = None
def get_embedder():
    global _EMB
    if _EMB is None:
        try:
            from sentence_transformers import SentenceTransformer
            _EMB = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        except Exception:
            _EMB = False
    return _EMB

def nearest_terms(target: str, pool: List[str], k: int = 24) -> List[Tuple[str, float]]:
    emb = get_embedder()
    if not emb:
        return []
    cand = [w for w in pool if w != target and len(w) >= 2 and not re2.match(r"^[\p{P}\p{S}\d_]+$", w)]
    if not cand:
        return []
    vecs = emb.encode([target] + cand, normalize_embeddings=True)
    t, C = vecs[0], vecs[1:]
    import numpy as np
    sims = (C @ t)
    idx = np.argsort(-sims)[:k]
    return [(cand[i], float(sims[i])) for i in idx]

_MLM = None
def get_masker():
    global _MLM
    if _MLM is None:
        try:
            from transformers import pipeline
            _MLM = pipeline("fill-mask", model="aubmindlab/bert-base-arabertv02")
        except Exception:
            _MLM = False
    return _MLM

def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 18) -> List[str]:
    masker = get_masker()
    if not masker:
        return []
    masked = sentence_with_blank.replace("_____", masker.tokenizer.mask_token)
    try:
        outs = masker(masked, top_k=max(25, k + 7))
        cands = []
        for o in outs:
            tok = o["token_str"].strip()
            if tok and tok != correct and len(tok) >= 2 and not re2.match(r"^[\p{P}\p{S}\d_]+$", tok):
                cands.append(tok)
        uniq, seen = [], set()
        for w in cands:
            if w not in seen:
                uniq.append(w); seen.add(w)
        return uniq[:k]
    except Exception:
        return []

_CE = None
def get_cross_encoder():
    global _CE
    if _CE is None:
        try:
            from sentence_transformers import CrossEncoder
            _CE = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        except Exception:
            _CE = False
    return _CE

def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
    ce = get_cross_encoder()
    if not ce or not candidates:
        return candidates
    pairs = [(sentence_with_blank.replace("_____", c), c) for c in candidates]
    try:
        scores = ce.predict([p[0] for p in pairs])
        ranked = [c for _, c in sorted(zip(scores, [p[1] for p in pairs]), key=lambda x: -x[0])]
        return ranked
    except Exception:
        return candidates
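# Note (added observation): cross-encoder/ms-marco-MiniLM-L-6-v2 is trained on English MS MARCO,
# so its scores over Arabic candidates are best-effort only; the try/except above keeps this
# ranking step strictly optional.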
# --------- Distractor helper utilities ---------
def word_tokens(s: str) -> List[str]:
    s = norm_ar(s)
    return re2.findall(r"\p{L}+", s)

def token_set(s: str) -> set:
    return set([t for t in word_tokens(s) if t not in AR_STOP])

def jaccard(a: str, b: str) -> float:
    A, B = token_set(a), token_set(b)
    if not A or not B: return 0.0
    return len(A & B) / max(1, len(A | B))

def is_sub_or_super(a: str, b: str) -> bool:
    A, B = norm_ar(a), norm_ar(b)
    return (A in B) or (B in A)

def appears_as_long_fragment_in_sentence(w: str, sentence: str) -> bool:
    toks = word_tokens(w)
    if len(toks) < 3:
        return False
    return re2.search(rf"(?<!\p{{L}}){re2.escape(norm_ar(w))}(?!\p{{L}})", norm_ar(sentence)) is not None

def choice_length_ok(w: str) -> bool:
    n = len(word_tokens(w))
    return 1 <= n <= 6

def paragraph_index_map(text: str, sentences: List[str]) -> dict:
    paras = [norm_ar(p) for p in re2.split(r"\n{2,}", text) if p.strip()]
    mapping = {}
    for i, s in enumerate(sentences):
        ns = norm_ar(s)
        pid = None
        for j, p in enumerate(paras):
            if ns and ns in p:
                pid = j; break
        mapping[s] = pid if pid is not None else -1
    return mapping

def looks_like_title_fragment(s: str) -> bool:
    return ":" in s and s.index(":") < max(10, len(s) // 6)

def is_nouny_phrase(w: str) -> bool:
    toks = word_tokens(w)
    if not (1 <= len(toks) <= 4): return False
    if re2.search(r"(يفعل|تفعل|يشهد|تقوم|يمكن|قد|سوف)$", w): return False
    return True
def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
    if looks_like_title_fragment(sentence):
        parts = sentence.split(":", 1)
        sentence = parts[1] if len(parts) > 1 else sentence
    try:
        ex = yake.KeywordExtractor(lan='ar', n=3, top=24)
        pairs = ex.extract_keywords(sentence)
    except Exception:
        pairs = []
    cands = []
    for w, _ in pairs:
        w = re2.sub(r"\s+", " ", w.strip())
        if not w or not good_kw(w) or not safe_keyword(w):
            continue
        if not is_nouny_phrase(w):
            continue
        if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
            continue
        freq_weight = global_text.count(w)
        cands.append((w, len(w) + 0.7 * freq_weight))
    if not cands:
        toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
        toks = [t for t in toks if is_nouny_phrase(t)]
        toks.sort(key=len, reverse=True)
        return toks[0] if toks else None
    cands.sort(key=lambda x: -x[1])
    return cands[0][0]

def similarity_caps(difficulty: str):
    """Upper bound on embedding similarity allowed between a distractor and the correct answer."""
    if difficulty == "سهل":
        return 0.88
    if difficulty == "صعب":
        return 0.95
    return 0.92

def tokenize_ar(s: str) -> List[str]:
    s = norm_ar(s)
    toks = re2.findall(r"\p{L}+", s)
    return [t for t in toks if len(t) >= 2 and t not in AR_STOP]

def bm25_build(sentences: List[str]):
    if not _HAS_BM25 or not sentences:
        return None, []
    corpus_tokens = [tokenize_ar(s) for s in sentences]
    bm = BM25Okapi(corpus_tokens)
    return bm, corpus_tokens

def bm25_candidates(correct: str, sentences: List[str], bm, corpus_tokens, top: int = 20) -> List[str]:
    if not bm: return []
    q = tokenize_ar(correct)
    scores = bm.get_scores(q)
    idxs = sorted(range(len(scores)), key=lambda i: -scores[i])[:min(top, len(scores))]
    pool = set()
    for i in idxs:
        for tok in corpus_tokens[i]:
            if tok != correct and good_kw(tok):
                pool.add(tok)
    return list(pool)
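# Illustrative usage sketch (hypothetical variable names, not part of the original flow):
#   sentences = split_sents(text)
#   bm, corp = bm25_build(sentences)
#   pool = bm25_candidates("الاحتباس الحراري", sentences, bm, corp, top=10)
# `pool` then holds topically related tokens that feed the distractor filters below.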
def typo_like_variants(answer: str, k: int = 4) -> List[str]:
    """Surface-form distractors: add/remove the definite article, ي/ى swap, ة/ه swap, drop one letter."""
    a = norm_ar(answer)
    vars = set()
    if a.startswith("ال"):
        vars.add(a[2:])
    else:
        vars.add("ال" + a)
    vars.add(a.replace("ي", "ى"))
    vars.add(a.replace("ى", "ي"))
    vars.add(a.replace("ة", "ه"))
    vars.add(a.replace("ه", "ة"))
    if len(a) > 5:
        mid = len(a) // 2
        vars.add(a[:mid] + a[mid + 1:])
    out = [v for v in vars if v and norm_ar(v) != norm_ar(a)]
    return out[:k]
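# Rough example (added; set order is not guaranteed): typo_like_variants("المدرسة") may yield
# variants such as "مدرسة", "المدرسه", and "المرسة" (article dropped, ة→ه swap, one letter removed).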
# ====== Smart distractors ======
def pos_compatible(a: str, b: str) -> bool:
    pa, pb = ar_pos(a), ar_pos(b)
    if "X" in (pa, pb):
        return True
    return pa == pb

def length_close(a: str, b: str) -> bool:
    return abs(len(a) - len(b)) <= max(6, len(b) // 2)

def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
                      all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
    base: List[str] = []
    # (0) surface-form distractors first
    base.extend(typo_like_variants(correct, k=4))
    # (a) semantic neighbours
    base.extend([w for w, _ in nearest_terms(correct, phrase_pool, k=24)])
    # (b) fill-mask candidates
    for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=18):
        if w not in base:
            base.append(w)
    # (c) BM25 candidates
    if all_sentences:
        bm, corp = bm25_build(all_sentences)
        for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
            if w not in base:
                base.append(w)
    # Strict filtering
    clean: List[str] = []
    for w in base:
        w = (w or "").strip()
        if not w or w == correct:
            continue
        if not choice_length_ok(w):
            continue
        if appears_as_long_fragment_in_sentence(w, sentence):
            continue
        if is_named_entity(w):
            continue
        if not pos_compatible(w, correct):
            continue
        if not length_close(w, correct):
            continue
        if is_sub_or_super(w, correct):
            continue
        if jaccard(w, correct) >= 0.5:
            continue
        clean.append(w)
    # Optional re-ranking + semantic-proximity filter
    clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k * 4, k)]
    cap = similarity_caps(difficulty)
    try:
        emb = get_embedder()
        if emb and clean:
            vecs = emb.encode([correct] + clean, normalize_embeddings=True)
            c, others = vecs[0], vecs[1:]
            import numpy as np
            sims = others @ c
            filtered = [w for w, s in zip(clean, sims) if s < cap]
            if len(filtered) >= k:
                clean = filtered
    except Exception:
        pass
    # Final dedup
    out, seen = [], set()
    for w in clean:
        if w in seen:
            continue
        seen.add(w); out.append(w)
        if len(out) >= k:
            break
    # Top up if needed
    if len(out) < k:
        extras = [w for w in phrase_pool
                  if w not in out and w != correct and choice_length_ok(w)
                  and not appears_as_long_fragment_in_sentence(w, sentence)
                  and not is_sub_or_super(w, correct)
                  and jaccard(w, correct) < 0.5]
        out.extend(extras[:(k - len(out))])
    if len(out) < k:
        out.extend(["…"] * (k - len(out)))  # will be rejected later if we cannot complete 4 options
    return out[:k]
# ====== mT5 (optional) ======
_MT5 = {"tok": None, "model": None, "ok": False}

def get_mt5():
    if _MT5["tok"] is not None or _MT5["model"] is not None or _MT5["ok"]:
        return _MT5["tok"], _MT5["model"], _MT5["ok"]
    try:
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        _MT5["tok"] = AutoTokenizer.from_pretrained("google/mt5-small")
        _MT5["model"] = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
        _MT5["ok"] = True
    except Exception:
        _MT5["tok"] = None; _MT5["model"] = None; _MT5["ok"] = False
    return _MT5["tok"], _MT5["model"], _MT5["ok"]

def parse_json_block(s: str) -> Optional[dict]:
    try:
        return json.loads(s)
    except Exception:
        pass
    m = re2.search(r"\{.*\}", s, flags=re2.DOTALL)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            return None
    return None
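# Illustrative behaviour (added): parse_json_block('نص قبل {"question": "س؟", "choices": ["أ","ب","ج","د"], "answer_index": 0} نص بعد')
# returns the embedded dict even when the model wraps the JSON in extra text.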
def comp_prompt(sentence: str) -> str:
    # Arabic instruction prompt asking the model for one comprehension MCQ returned as JSON
    return (
        "أنت منشئ أسئلة متعددة الخيارات باللغة العربية.\n"
        "من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.\n"
        "أعد فقط JSON بهذا الشكل:\n"
        "{\n"
        "\"question\": \"...\",\n"
        "\"choices\": [\"...\",\"...\",\"...\",\"...\"],\n"
        "\"answer_index\": 0\n"
        "}\n\n"
        f"الجملة: {sentence}"
    )

def gen_one_comp_q(sentence: str, tok, model, max_new_tokens=128) -> Optional[MCQ]:
    try:
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        inp = tok(comp_prompt(sentence), return_tensors="pt").to(device)
        out = model.generate(
            **inp,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.8,
            top_p=0.9,
            num_return_sequences=1,
            eos_token_id=tok.eos_token_id
        )
        text = tok.decode(out[0], skip_special_tokens=True)
        data = parse_json_block(text) or {}
        q = str(data.get("question", "")).strip()
        choices = data.get("choices", [])
        ai = data.get("answer_index", 0)
        if not q or not isinstance(choices, list) or len(choices) < 4:
            return None
        choices = [str(c).strip() for c in choices][:4]
        ai = ai if isinstance(ai, int) and 0 <= ai < 4 else 0
        return MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ai)
    except Exception:
        return None
def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
    tok, model, ok = get_mt5()
    if not ok:
        return make_mcqs(text, n, difficulty=difficulty)
    sents_all = split_sents(text)
    sents = [s for s in sents_all if is_clean_sentence(s)] or sents_all[:]
    if not sents:
        return make_mcqs(text, n, difficulty=difficulty)

    # Merge short sentences into useful chunks
    def make_chunks(sents, max_len=220):
        chunks = []
        i = 0
        while i < len(sents):
            cur = sents[i]
            j = i + 1
            while j < len(sents) and len(cur) + 1 + len(sents[j]) <= max_len:
                cur = cur + " " + sents[j]
                j += 1
            chunks.append(cur)
            i = j
        return chunks

    candidates = sents[:] + make_chunks(sents, max_len=220)
    random.shuffle(candidates)
    items: List[MCQ] = []
    tried = 0
    for s in candidates:
        if len(items) >= n: break
        mcq = gen_one_comp_q(s, tok, model)
        tried += 1
        if mcq:
            q = re2.sub(r"\s+", " ", mcq.question).strip()
            if not (12 <= len(q) <= 220):
                continue
            choices = [re2.sub(r"\s+", " ", c).strip() for c in mcq.choices]
            seen = set(); clean = []
            for c in choices:
                if c and c not in seen:
                    seen.add(c); clean.append(c)
            clean = (clean + ["…", "…", "…", "…"])[:4]
            ai = mcq.answer_index if isinstance(mcq.answer_index, int) and 0 <= mcq.answer_index < 4 else 0
            items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=clean, answer_index=ai))
        if tried >= n * 12:
            break
    if not items:
        return make_mcqs(text, n, difficulty=difficulty)
    return items[:n]
# ------------------ Fill-in-the-blank question generator (final) ------------------
def make_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
    all_sents = split_sents(text)
    sents = [s for s in all_sents if is_clean_sentence(s)] or all_sents[:]
    if not sents:
        raise ValueError("النص قصير أو غير صالح.")
    keyphrases = yake_keywords(text, k=260)
    keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]

    # Map each keyphrase to the first sentence that contains it as a whole word
    sent_for: dict = {}
    for s in sents:
        for kp in keyphrases:
            if kp in sent_for:
                continue
            if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
                sent_for[kp] = s
        if len(sent_for) >= n * 5:
            break

    para_map = paragraph_index_map(text, sents)
    used_sentences: set = set()
    items: List[MCQ] = []
    MAX_PER_PARA = 2
    para_count: dict = {}

    def add_item_from_pair(sentence: str, kp: str) -> bool:
        nonlocal items, used_sentences, para_count
        pid = para_map.get(sentence, -1)
        if para_count.get(pid, 0) >= MAX_PER_PARA:
            return False
        if not re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", sentence):
            return False
        q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", sentence, count=1)
        pool = [x for x in keyphrases if x != kp] or keyphrases[:]
        ch = smart_distractors(kp, pool, sentence, k=3,
                               all_sentences=all_sents, difficulty=difficulty) + [kp]
        choices, seen = [], set()
        for c in ch:
            c = (c or "").strip()
            if not c or c in seen:
                continue
            if not choice_length_ok(c):
                continue
            if appears_as_long_fragment_in_sentence(c, sentence):
                continue
            if is_sub_or_super(c, kp) or jaccard(c, kp) >= 0.5:
                continue
            seen.add(c); choices.append(c)
        if kp not in choices:
            choices.append(kp); seen.add(kp)
        if len(choices) < 4:
            return False
        choices = choices[:4]
        random.shuffle(choices)
        ans = choices.index(kp)
        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ans))
        used_sentences.add(sentence)
        para_count[pid] = para_count.get(pid, 0) + 1
        return True

    # First pass: diversify across paragraphs
    for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
        if len(items) >= n: break
        s = sent_for[kp]
        if s in used_sentences:
            continue
        _ = add_item_from_pair(s, kp)

    def fill_from_sentences(candidates: List[str]):
        for s in candidates:
            if len(items) >= n: break
            if s in used_sentences:
                continue
            kp = None
            for kpp, ss in sent_for.items():
                if ss == s:
                    kp = kpp; break
            if kp is None:
                kp = best_keyword_in_sentence(s, text)
            if not kp:
                continue
            _ = add_item_from_pair(s, kp)

    if len(items) < n:
        remaining_new_para = [s for s in sents if para_count.get(para_map.get(s, -1), 0) < MAX_PER_PARA]
        fill_from_sentences(remaining_new_para)
    if len(items) < n:
        leftovers = [s for s in sents if s not in used_sentences]
        fill_from_sentences(leftovers)
    if not items:
        raise RuntimeError("تعذّر توليد أسئلة.")
    return items[:n]
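# Minimal usage sketch (illustrative; `sample_text` is a hypothetical variable holding a few
# Arabic paragraphs of source material):
#   items = make_mcqs(postprocess(sample_text), n=4, difficulty="متوسط")
#   for it in items:
#       print(it.question, it.choices, it.answer_index)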
# ------------------ Convert to display records ------------------
def clean_option_text(t: str) -> str:
    t = (t or "").strip()
    t = re2.sub(AR_DIAC, "", t)
    t = re2.sub(r"\s+", " ", t)
    t = re2.sub(r"^[\p{P}\p{S}_-]+|[\p{P}\p{S}_-]+$", "", t)
    # Trim to a reasonable length
    t = re2.sub(r"^(.{,60})(?:\s.*)?$", r"\1", t)
    return t or "…"

def to_records(items: List[MCQ]) -> List[dict]:
    recs = []
    for it in items:
        opts = []
        used = set()
        for i, lbl in enumerate(["A", "B", "C", "D"]):
            txt = (it.choices[i] if i < len(it.choices) else "…")
            txt = clean_option_text(txt.replace(",", "،").replace("?", "؟").replace(";", "؛"))
            if txt in used:
                txt = f"{txt}{i+1}"
            used.add(txt)
            opts.append({"id": lbl, "text": txt, "is_correct": (i == it.answer_index)})
        recs.append({"id": it.id, "question": it.question.strip(), "options": opts})
    return recs
# ------------------ Quiz page (HTML only) ------------------
def render_quiz_html(records: List[dict]) -> str:
    parts = []
    for i, rec in enumerate(records, start=1):
        qid = rec["id"]
        qtxt = rec["question"]
        cor = next((o["id"] for o in rec["options"] if o["is_correct"]), "")
        opts_html = []
        for o in rec["options"]:
            lid, txt = o["id"], o["text"]
            opts_html.append(f"""
            <label class="opt" data-letter="{lid}">
              <input type="radio" name="q_{qid}" value="{lid}">
              <span class="opt-letter">{lid}</span>
              <span class="opt-text">{txt}</span>
            </label>
            """)
        parts.append(f"""
        <div class="q-card" data-qid="{qid}" data-correct="{cor}">
          <div class="q-header">
            <div class="q-title">السؤال {i}</div>
            <div class="q-badge" id="b_{qid}" hidden></div>
          </div>
          <div class="q-text">{qtxt}</div>
          <div class="opts">{''.join(opts_html)}</div>
          <div class="q-actions">
            <button class="q-submit">Submit</button>
            <span class="q-note" id="n_{qid}"></span>
          </div>
        </div>
        """)
    return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
# ------------------ Build the quiz and switch pages ------------------
def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
    text_area = (text_area or "").strip()
    if not text_area and not file_path:
        return "", gr.update(visible=True), gr.update(visible=False), "🛈 الصق نصًا أو ارفع ملفًا أولًا."
    raw = text_area if text_area else file_to_text(file_path, model_id=model_id, zoom=float(zoom))[0]
    cleaned = postprocess(raw)
    used_mode = mode
    try:
        if mode == "فهم مباشر":
            tok, model, ok = get_mt5()
            if ok:
                items = make_comp_mcqs(cleaned, n=int(n), difficulty=difficulty)
            else:
                items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
                used_mode = "فراغ (fallback)"
        else:
            items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
    except Exception:
        items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
        used_mode = "فراغ (fallback)"
    recs = to_records(items)
    warn = f"نمط مُستخدَم: **{used_mode}** — عدد الأسئلة: {len(items)}"
    return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), warn
# ------------------ CSS ------------------
CSS = """
:root{
  --bg:#0e0e11; --panel:#15161a; --card:#1a1b20; --muted:#a7b0be;
  --text:#f6f7fb; --accent:#6ee7b7; --accent2:#34d399; --danger:#ef4444; --border:#262833;
}
body{direction:rtl; font-family:system-ui,'Cairo','IBM Plex Arabic',sans-serif; background:var(--bg);}
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
h2.top{color:#eaeaf2;margin:6px 0 16px}
/* Input page has a fixed height so its dimensions never shift */
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
  box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
.small{opacity:.9;color:#d9dee8}
/* Hide the file preview */
[data-testid="file"] .file-preview, [data-testid="file"] .file-preview * { display:none !important; }
[data-testid="file"] .grid-wrap { display:block !important; }
.upload-like{border:2px dashed #3b3f52;background:#121318;border-radius:12px;padding:12px;color:#cfd5e3;min-height:90px}
.button-primary>button{background:linear-gradient(180deg,var(--accent),var(--accent2));border:none;color:#0b0d10;font-weight:800}
.button-primary>button:hover{filter:brightness(.95)}
textarea{min-height:120px}
/* Quiz page */
.q-card{background:var(--card);border:1px solid var(--border);border-radius:14px;padding:14px;margin:12px 0}
.q-header{display:flex;gap:10px;align-items:center;justify-content:space-between;margin-bottom:6px}
.q-title{color:#eaeaf2;font-weight:800}
.q-badge{padding:8px 12px;border-radius:10px;font-weight:700}
.q-badge.ok{background:#083a2a;color:#b6f4db;border:1px solid #145b44}
.q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
.q-text{color:#eaeaf2;font-size:1.06rem;line-height:1.8;margin:8px 0 12px}
.opts{display:flex;flex-direction:column;gap:8px}
.opt{display:flex;gap:10px;align-items:center;background:#14161c;border:1px solid #2a2d3a;border-radius:12px;padding:10px;transition:background .15s,border-color .15s}
.opt input{accent-color:var(--accent2)}
.opt-letter{display:inline-flex;width:28px;height:28px;border-radius:8px;background:#0f1116;border:1px solid #2a2d3a;align-items:center;justify-content:center;font-weight:800;color:#dfe6f7}
.opt-text{color:#eaeaf2}
.opt.ok{background:#0f2f22;border-color:#1b6a52}
.opt.err{background:#3a0d14;border-color:#6a1e2b}
.q-actions{display:flex;gap:10px;align-items:center;margin-top:10px}
.q-actions .q-submit{
  background:#2dd4bf;border:none;color:#0b0d10;font-weight:800;border-radius:10px;padding:8px 14px;cursor:pointer;
}
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
.q-note{color:#ffd1d6}
.q-note.warn{color:#ffd1d6}
"""
# ------------------ JS: wire up Submit + highlight the correct answer ------------------
ATTACH_LISTENERS_JS = """
() => {
  if (window.__q_submit_bound_multi2) { return 'already'; }
  window.__q_submit_bound_multi2 = true;
  document.addEventListener('click', function(e){
    if (!e.target || !e.target.classList || !e.target.classList.contains('q-submit')) return;
    const card = e.target.closest('.q-card');
    if (!card) return;
    const qid = card.getAttribute('data-qid');
    const correct = card.getAttribute('data-correct');
    const note = document.getElementById('n_'+qid);
    const badge = document.getElementById('b_'+qid);
    const chosen = card.querySelector('input[type="radio"]:checked');
    if (!chosen) {
      if (note){ note.textContent = 'اختر إجابة أولاً'; note.className='q-note warn'; }
      return;
    }
    const chosenLabel = chosen.closest('.opt');
    if (chosen.value === correct) {
      chosenLabel.classList.add('ok');
      if (badge){ badge.hidden=false; badge.className='q-badge ok'; badge.textContent='Correct!'; }
      card.querySelectorAll('input[type="radio"]').forEach(i => i.disabled = true);
      e.target.disabled = true;
      if (note) note.textContent = '';
      const qNode = card.querySelector('.q-text');
      if (qNode){
        const full = qNode.textContent || qNode.innerText || '';
        const correctText = [...card.querySelectorAll('.opt')].find(o =>
          o.querySelector('input').value === correct
        )?.querySelector('.opt-text')?.textContent || '';
        if (full && correctText && full.includes('_____')){
          const highlighted = full.replace('_____', `<mark style="background:#2dd4bf22;border:1px solid #2dd4bf55;border-radius:6px;padding:0 4px">${correctText}</mark>`);
          qNode.innerHTML = highlighted;
        }
      }
      return;
    }
    chosenLabel.classList.add('err');
    if (badge){ badge.hidden=false; badge.className='q-badge err'; badge.textContent='Incorrect.'; }
    if (note) note.textContent = '';
  });
  return 'wired-multi2';
}
"""
# ------------------ Gradio UI ------------------
with gr.Blocks(title="Question Generator", css=CSS) as demo:
    gr.Markdown("<h2 class='top'>Question Generator</h2>")

    page1 = gr.Group(visible=True, elem_classes=["input-panel"])
    with page1:
        gr.Markdown("اختر **أحد** الخيارين ثم اضغط الزر.", elem_classes=["small"])
        text_area = gr.Textbox(lines=6, placeholder="ألصق نصك هنا...", label="لصق نص")
        file_comp = gr.File(label="أو ارفع ملف (PDF / TXT)", file_count="single",
                            file_types=[".pdf", ".txt"], type="filepath", elem_classes=["upload-like"])
        num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
        mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
        difficulty_radio = gr.Radio(choices=DIFFICULTY_MODES, value="متوسط", label="درجة الصعوبة")
        with gr.Accordion("خيارات PDF المصوّر (اختياري)", open=False):
            trocr_model = gr.Dropdown(
                choices=[
                    "microsoft/trocr-base-printed",
                    "microsoft/trocr-large-printed",
                    "microsoft/trocr-base-handwritten",
                    "microsoft/trocr-large-handwritten",
                ],
                value=DEFAULT_TROCR_MODEL, label="نموذج TrOCR"
            )
            trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="Zoom OCR")
        btn_build = gr.Button("Generate Questions", elem_classes=["button-primary"])
        warn = gr.Markdown("", elem_classes=["small"])

    page2 = gr.Group(visible=False)
    with page2:
        quiz_html = gr.HTML("")

    js_wired = gr.Textbox(visible=False)

    btn_build.click(
        build_quiz,
        inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio, difficulty_radio],
        outputs=[quiz_html, page1, page2, warn]
    ).then(
        None, inputs=None, outputs=[js_wired], js=ATTACH_LISTENERS_JS
    )

if __name__ == "__main__":
    demo.queue().launch()