Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
# صفحتان ثابتتان + Submit لكل سؤال يعمل فعليًا + منع تغيّر أبعاد صفحة الإدخال
|
| 3 |
|
| 4 |
-
import os, json, uuid, random, unicodedata
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import List, Tuple, Optional
|
|
@@ -93,7 +93,7 @@ def norm_ar(t:str)->str:
|
|
| 93 |
t = re2.sub(AR_DIAC, "", t)
|
| 94 |
t = re2.sub(r"[إأآا]", "ا", t)
|
| 95 |
t = re2.sub(r"[يى]", "ي", t)
|
| 96 |
-
t = re2.sub(r"\s+", " ", t)
|
| 97 |
t = re2.sub(r'(\p{L})\1{2,}', r'\1', t)
|
| 98 |
t = re2.sub(r'(\p{L})\1', r'\1', t)
|
| 99 |
return t.strip()
|
|
@@ -138,7 +138,6 @@ def yake_keyphrases(t: str, top_k: int = 180) -> List[str]:
|
|
| 138 |
continue
|
| 139 |
if 2 <= len(w) <= 42:
|
| 140 |
phrases.append(w); seen.add(w)
|
| 141 |
-
# إزالة العبارات التي هي جزء من أطول
|
| 142 |
phrases_sorted = sorted(phrases, key=lambda x: (-len(x), x))
|
| 143 |
kept=[]
|
| 144 |
for p in phrases_sorted:
|
|
@@ -191,7 +190,6 @@ def mlm_fill(sentence_with_blank: str, correct: str, k: int = 20) -> List[str]:
|
|
| 191 |
tok = o["token_str"].strip()
|
| 192 |
if tok and tok != correct and len(tok) >= 2 and not re2.match(r"^[\p{P}\p{S}\d_]+$", tok):
|
| 193 |
cands.append(tok)
|
| 194 |
-
# فريد مع الحفاظ على الترتيب
|
| 195 |
seen=set(); uniq=[]
|
| 196 |
for w in cands:
|
| 197 |
if w not in seen:
|
|
@@ -269,102 +267,158 @@ def sentence_score(s: str) -> float:
|
|
| 269 |
bonus = 0.2 if ("،" in s or ":" in s) else 0.0
|
| 270 |
return base + bonus + penalties
|
| 271 |
|
| 272 |
-
#
|
| 273 |
-
def rank_by_sentence_coherence(sentence_with_blank: str, correct: str, candidates: List[str], topk: int=3) -> List[str]:
|
| 274 |
-
emb = get_embedder()
|
| 275 |
-
if not emb or not candidates:
|
| 276 |
-
return candidates[:topk]
|
| 277 |
-
filled = [sentence_with_blank.replace("_____", c) for c in candidates]
|
| 278 |
-
ref = sentence_with_blank.replace("_____", correct)
|
| 279 |
-
vecs = embed_texts([ref] + filled)
|
| 280 |
-
if vecs is None:
|
| 281 |
-
return candidates[:topk]
|
| 282 |
-
import numpy as np
|
| 283 |
-
ref_vec = vecs[0]
|
| 284 |
-
cand_vecs = vecs[1:]
|
| 285 |
-
sims = cand_vecs @ ref_vec
|
| 286 |
-
order = list(reversed(sorted(range(len(candidates)), key=lambda i: sims[i])))
|
| 287 |
-
ranked = [candidates[i] for i in order]
|
| 288 |
-
return ranked[:topk]
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
stats = {}
|
| 294 |
-
for t in toks:
|
| 295 |
-
tt = norm_ar(t)
|
| 296 |
-
if not good_kw(tt):
|
| 297 |
-
continue
|
| 298 |
-
stats[tt] = stats.get(tt, 0) + 1
|
| 299 |
-
top = [w for w,_ in sorted(stats.items(), key=lambda kv: -kv[1])]
|
| 300 |
-
return top[:limit]
|
| 301 |
-
|
| 302 |
-
# ================== (NEW) موازنة الطول والتطويل ==================
|
| 303 |
-
|
| 304 |
-
# كاش صغير لعبارة الصحيحة لاستخدامها أثناء التطويل
|
| 305 |
ref_phrase_cache = {}
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
def word_len(s: str) -> int:
|
| 308 |
return len([w for w in re2.split(r"\s+", s.strip()) if w])
|
| 309 |
|
| 310 |
-
def within_ratio(cand: str, target_len: int, tol: float = 0.
|
| 311 |
L = word_len(cand)
|
| 312 |
return (target_len*(1-tol) <= L <= target_len*(1+tol))
|
| 313 |
|
| 314 |
-
# قوالب عربية عامة للتطويل عند غياب الـMLM أو فشل توقع مناسب
|
| 315 |
-
GENERIC_PREFIXES = ["تقنيات", "مجال", "أنظمة", "تطبيقات", "مفاهيم", "ممارسات", "نماذج", "آليات"]
|
| 316 |
-
GENERIC_SUFFIXES = ["الذكية", "التعليمية", "الحديثة", "المتقدمة", "المبتكرة", "الرقمية"]
|
| 317 |
-
|
| 318 |
def shape_phrase_like(ref: str, cand: str) -> str:
|
| 319 |
-
"""مواءمة التعريف/التنكير لتقارب الشكل العام."""
|
| 320 |
return with_same_definiteness(ref, cand)
|
| 321 |
|
| 322 |
def try_mlm_expand(cand: str, sentence_with_blank: str, target_len: int) -> Optional[str]:
|
| 323 |
-
"""توسيع المشتّت عبر MLم بإضافة كلمة قبل/بعد ليقترب الطول من الصحيحة."""
|
| 324 |
masker = get_masker()
|
| 325 |
if not masker:
|
| 326 |
return None
|
| 327 |
-
trials = [
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
for
|
| 332 |
try:
|
| 333 |
-
outs = masker(
|
| 334 |
except Exception:
|
| 335 |
continue
|
| 336 |
for o in outs:
|
| 337 |
tok = o["token_str"].strip()
|
| 338 |
-
if not
|
| 339 |
continue
|
| 340 |
-
if
|
|
|
|
|
|
|
| 341 |
phrase = f"{tok} {cand}"
|
| 342 |
else:
|
|
|
|
|
|
|
| 343 |
phrase = f"{cand} {tok}"
|
| 344 |
-
|
| 345 |
-
if within_ratio(phrase, target_len) and norm_ar(phrase) != norm_ar(ref_phrase_cache.get("correct","")):
|
| 346 |
return phrase
|
| 347 |
return None
|
| 348 |
|
| 349 |
def fallback_expand(cand: str, target_len: int) -> str:
|
| 350 |
-
|
| 351 |
-
for p in GENERIC_PREFIXES:
|
| 352 |
phrase = f"{p} {cand}"
|
| 353 |
-
if within_ratio(phrase, target_len):
|
| 354 |
-
return phrase
|
| 355 |
-
for sfx in
|
| 356 |
phrase = f"{cand} {sfx}"
|
| 357 |
-
if within_ratio(phrase, target_len):
|
| 358 |
-
return phrase
|
| 359 |
-
|
| 360 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
# --- (J) مشتّتات ذكية تضمن دائمًا ≥3 خيارات فعلية + موازنة الطول ---
|
| 363 |
def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank: str, backup_terms: List[str], k: int = 3) -> List[str]:
|
| 364 |
target = correct.strip()
|
| 365 |
-
ref_phrase_cache["correct"] = target
|
| 366 |
|
| 367 |
-
# 1) مصادر متعددة
|
| 368 |
neigh = nearest_terms(target, phrase_pool, k=48)
|
| 369 |
mlm = mlm_fill(sentence_with_blank, target, k=24)
|
| 370 |
|
|
@@ -379,7 +433,6 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank:
|
|
| 379 |
if w not in seen:
|
| 380 |
seen.add(w); raw_pool.append(w)
|
| 381 |
|
| 382 |
-
# 2) إن لم يكفِ، أضف من backup_terms (من النص كله)
|
| 383 |
for w in backup_terms:
|
| 384 |
if len(raw_pool) >= max(60, k*10): break
|
| 385 |
if not w or norm_ar(w) == norm_ar(target):
|
|
@@ -389,7 +442,6 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank:
|
|
| 389 |
if w not in seen:
|
| 390 |
seen.add(w); raw_pool.append(w)
|
| 391 |
|
| 392 |
-
# 3) فلترة POS إن توفّر
|
| 393 |
filtered = []
|
| 394 |
for w in raw_pool:
|
| 395 |
if same_pos(target, w):
|
|
@@ -399,39 +451,35 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank:
|
|
| 399 |
if not filtered:
|
| 400 |
filtered = raw_pool[:max(24, k*6)]
|
| 401 |
|
| 402 |
-
# 4) موازنة الطول (أساسي): اجعل المشتّت قريب طولًا من الصحيحة
|
| 403 |
target_words = word_len(target)
|
| 404 |
shaped = []
|
| 405 |
for w in filtered:
|
| 406 |
-
cand = shape_phrase_like(target, w)
|
| 407 |
-
if within_ratio(cand, target_words, tol=0.
|
| 408 |
-
shaped.append(cand)
|
| 409 |
continue
|
| 410 |
-
# جرّب توسيع بالـMLM
|
| 411 |
expanded = try_mlm_expand(cand, sentence_with_blank, target_words)
|
| 412 |
-
if expanded and within_ratio(expanded, target_words, tol=0.
|
| 413 |
-
shaped.append(expanded)
|
| 414 |
continue
|
| 415 |
-
# fallback بقوالب عامة
|
| 416 |
fb = fallback_expand(cand, target_words)
|
| 417 |
-
|
|
|
|
| 418 |
|
| 419 |
-
# إزالة أي تطويل خرج متطابقًا مع الصحيحة بعد التطبيع
|
| 420 |
shaped = [s for s in shaped if norm_ar(s) != norm_ar(target)]
|
| 421 |
|
| 422 |
-
|
| 423 |
-
|
|
|
|
| 424 |
|
| 425 |
-
# 6) خذ أفضل k؛ وإن لم يكفِ، أكمل من shaped ثم filtered ثم raw_pool ثم backup_terms
|
| 426 |
out = []
|
| 427 |
for src in [ranked, shaped, filtered, raw_pool, backup_terms]:
|
| 428 |
for w in src:
|
| 429 |
if len(out) >= k: break
|
| 430 |
-
if w and norm_ar(w) != norm_ar(target) and w not in out:
|
| 431 |
out.append(w)
|
| 432 |
if len(out) >= k: break
|
| 433 |
|
| 434 |
-
# ضمان العدد بدون أي placeholders
|
| 435 |
if len(out) < k:
|
| 436 |
while len(out) < k and ranked:
|
| 437 |
out.append(ranked[len(out) % len(ranked)])
|
|
@@ -440,6 +488,9 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank:
|
|
| 440 |
|
| 441 |
# ------------------ مُولِّد الأسئلة ------------------
|
| 442 |
def make_mcqs(text:str, n:int=6)->List[MCQ]:
|
|
|
|
|
|
|
|
|
|
| 443 |
sents = split_sents(text)
|
| 444 |
if not sents:
|
| 445 |
raise ValueError("النص قصير أو غير صالح.")
|
|
@@ -450,7 +501,6 @@ def make_mcqs(text:str, n:int=6)->List[MCQ]:
|
|
| 450 |
freq = [w for w,_ in sorted(((t, text.count(t)) for t in tokens), key=lambda x:-x[1])]
|
| 451 |
keyphrases = freq[:160]
|
| 452 |
|
| 453 |
-
# مصطلحات احتياطية عامة من النص لضمان تعبئة المشتتات دائمًا
|
| 454 |
backup_terms = harvest_backup_terms(text, limit=400)
|
| 455 |
|
| 456 |
kp2best_sent = {}
|
|
@@ -479,12 +529,18 @@ def make_mcqs(text:str, n:int=6)->List[MCQ]:
|
|
| 479 |
|
| 480 |
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
|
| 481 |
|
| 482 |
-
# مشتّتات ذكية تضمن دائمًا ≥3 ومتوازنة الطول
|
| 483 |
pool = [x for x in keyphrases if x != kp]
|
| 484 |
distracts = smart_distractors(kp, pool, q, backup_terms, k=3)
|
| 485 |
|
| 486 |
ch = distracts + [kp]
|
| 487 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
ans = ch.index(kp)
|
| 489 |
|
| 490 |
items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
|
|
@@ -501,7 +557,6 @@ def to_records(items:List[MCQ])->List[dict]:
|
|
| 501 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 502 |
txt=(it.choices[i] if i<len(it.choices) else "—").strip()
|
| 503 |
txt=txt.replace(",", "،").replace("?", "؟").replace(";", "؛")
|
| 504 |
-
# منع أي Placeholder يظهر للمستخدم
|
| 505 |
if txt == "—" or not txt:
|
| 506 |
txt = "خيار"
|
| 507 |
opts.append({"id":lbl,"text":txt or "خيار","is_correct":(i==it.answer_index)})
|
|
@@ -587,7 +642,7 @@ textarea{min-height:120px}
|
|
| 587 |
.q-badge.ok{background:#0f2f22;color:#b6f4db;border:1px solid #145b44}
|
| 588 |
.q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
|
| 589 |
|
| 590 |
-
.q-text{color
|
| 591 |
.opts{display:flex;flex-direction:column;gap:8px}
|
| 592 |
.opt{display:flex;gap:10px;align-items:center;background:#14161c;border:1px solid #2a2d3a;border-radius:12px;padding:10px;transition:background .15s,border-color .15s}
|
| 593 |
.opt input{accent-color:var(--accent2)}
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
# صفحتان ثابتتان + Submit لكل سؤال يعمل فعليًا + منع تغيّر أبعاد صفحة الإدخال
|
| 3 |
|
| 4 |
+
import os, json, uuid, random, unicodedata, difflib
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import List, Tuple, Optional
|
|
|
|
| 93 |
t = re2.sub(AR_DIAC, "", t)
|
| 94 |
t = re2.sub(r"[إأآا]", "ا", t)
|
| 95 |
t = re2.sub(r"[يى]", "ي", t)
|
| 96 |
+
t = re2.sub(r"\س+", " ", t) if False else re2.sub(r"\s+", " ", t)
|
| 97 |
t = re2.sub(r'(\p{L})\1{2,}', r'\1', t)
|
| 98 |
t = re2.sub(r'(\p{L})\1', r'\1', t)
|
| 99 |
return t.strip()
|
|
|
|
| 138 |
continue
|
| 139 |
if 2 <= len(w) <= 42:
|
| 140 |
phrases.append(w); seen.add(w)
|
|
|
|
| 141 |
phrases_sorted = sorted(phrases, key=lambda x: (-len(x), x))
|
| 142 |
kept=[]
|
| 143 |
for p in phrases_sorted:
|
|
|
|
| 190 |
tok = o["token_str"].strip()
|
| 191 |
if tok and tok != correct and len(tok) >= 2 and not re2.match(r"^[\p{P}\p{S}\d_]+$", tok):
|
| 192 |
cands.append(tok)
|
|
|
|
| 193 |
seen=set(); uniq=[]
|
| 194 |
for w in cands:
|
| 195 |
if w not in seen:
|
|
|
|
| 267 |
bonus = 0.2 if ("،" in s or ":" in s) else 0.0
|
| 268 |
return base + bonus + penalties
|
| 269 |
|
| 270 |
+
# ================== (NEW) جودة المشتِّتات والتطويل ==================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
+
# كاش نص كامل لتحسين تقييم الجودة
|
| 273 |
+
global_full_text_cache = ""
|
| 274 |
+
# كاش عبارة صحيحة لتجنّب التطابق بعد التطويل
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
ref_phrase_cache = {}
|
| 276 |
|
| 277 |
+
ADJ_WHITELIST = {"التعليمية","الذكية","الرقمية","الافتراضية","التكيفية","الحديثة","المتقدمة"}
|
| 278 |
+
NOUN_PREFIXES = {"مجال","تقنيات","أنظمة","مفاهيم","نماذج","ممارسات","آليات","تطبيقات"}
|
| 279 |
+
|
| 280 |
+
def is_arabic_word(w:str)->bool:
    """Tell whether ``w`` consists solely of Arabic-script characters.

    The pattern requires at least one character, so an empty string yields
    False.  NOTE(review): ``\\p{Arabic}`` needs a Unicode-property-aware
    engine; ``re2`` is presumably the third-party ``regex`` module -- confirm
    against the file's imports.
    """
    arabic_only = re2.match(r"^[\p{Arabic}]+$", w)
    return arabic_only is not None
|
| 282 |
+
|
| 283 |
+
def clean_spaces(s:str)->str:
    """Normalise spacing in ``s``.

    Whitespace runs are collapsed to a single space and the ends are trimmed;
    afterwards a standalone "ال" followed by whitespace and another standalone
    "ال" is reduced to a single "ال".
    """
    collapsed = re2.sub(r"\s+", " ", s).strip()
    return re2.sub(r"\bال\s+ال\b", "ال", collapsed)
|
| 287 |
+
|
| 288 |
+
def bad_token(w:str)->bool:
    """Return True when ``w`` is unusable as a word of a distractor phrase.

    A token is rejected if ``is_arabic_word`` refuses it, or if its length
    falls outside the inclusive range 2..18 characters.
    """
    if not is_arabic_word(w):
        return True
    return not (2 <= len(w) <= 18)
|
| 290 |
+
|
| 291 |
+
def looks_weird(phrase:str)->bool:
    """Heuristically flag a phrase that should not be shown as an answer option.

    Returns True when the phrase:
      * has no words at all,
      * contains a word rejected by ``bad_token``,
      * repeats any word (adjacent or not), or
      * contains more than two consecutive words whose POS tag, as reported
        by ``phrase_pos``, starts with "ADJ".
    Otherwise returns False.
    """
    words = [piece for piece in re2.split(r"\s+", phrase.strip()) if piece]
    if not words:
        return True
    if any(bad_token(word) for word in words):
        return True

    # A single distinct-count comparison covers both the adjacent-repeat and
    # the repeat-anywhere cases.
    if len(set(words)) != len(words):
        return True

    # ``phrase_pos`` may return a falsy value; treat that as "no tag".
    tags = [phrase_pos(word) or "" for word in words]
    adj_run = 0
    for tag in tags:
        if not tag.startswith("ADJ"):
            adj_run = 0
            continue
        adj_run += 1
        if adj_run > 2:
            return True
    return False
|
| 310 |
+
|
| 311 |
+
def quality_score(phrase:str, sentence:str, full_text:str)->float:
    """Score a candidate phrase between 0.0 and 1.0.

    The phrase is first normalised with ``clean_spaces``; anything that
    ``looks_weird`` scores 0.0.  Otherwise the score is
    ``0.3 + 0.1 * hits + bonus`` capped at 1.0, where ``hits`` counts the
    distinct words of the phrase that occur (as substrings) in ``full_text``
    and ``bonus`` is 0.2 when the first word's POS tag starts with "N" and the
    second word is either absent/untagged or tagged "ADJ...".

    ``sentence`` is accepted for interface compatibility but is not used.
    """
    cleaned = clean_spaces(phrase)
    if looks_weird(cleaned):
        return 0.0

    words = cleaned.split()
    found = 0
    for word in set(words):
        if word in full_text:  # substring test against the whole text
            found += 1

    head_tag = phrase_pos(words[0]) or ""
    next_tag = phrase_pos(words[1]) if len(words) > 1 else ""
    if head_tag.startswith("N") and (not next_tag or next_tag.startswith("ADJ")):
        bonus = 0.2
    else:
        bonus = 0.0
    return min(1.0, 0.3 + 0.1*found + bonus)
|
| 321 |
+
|
| 322 |
def word_len(s: str) -> int:
    """Return the number of whitespace-separated words in ``s``."""
    count = 0
    for piece in re2.split(r"\s+", s.strip()):
        if piece:  # an empty input still produces one empty piece
            count += 1
    return count
|
| 324 |
|
| 325 |
+
def within_ratio(cand: str, target_len: int, tol: float = 0.15) -> bool:
    """Check that ``cand`` has roughly ``target_len`` words.

    The word count of ``cand`` (per ``word_len``) must lie in the inclusive
    interval ``[target_len*(1-tol), target_len*(1+tol)]``.
    """
    n_words = word_len(cand)
    lower = target_len*(1-tol)
    upper = target_len*(1+tol)
    return lower <= n_words <= upper
|
| 328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
def shape_phrase_like(ref: str, cand: str) -> str:
    """Return ``cand`` reshaped to resemble ``ref``.

    Delegates entirely to ``with_same_definiteness(ref, cand)``.
    NOTE(review): presumably this aligns the Arabic definite article of
    ``cand`` with that of ``ref`` -- confirm against that helper.
    """
    reshaped = with_same_definiteness(ref, cand)
    return reshaped
|
| 331 |
|
| 332 |
def try_mlm_expand(cand: str, sentence_with_blank: str, target_len: int) -> Optional[str]:
    """Lengthen ``cand`` by one model-predicted word so it nears ``target_len`` words.

    Two trials are made on ``sentence_with_blank``: one with the mask token
    placed before ``cand`` and one with it placed after.  A predicted token is
    accepted only if it is an Arabic word and belongs to ``NOUN_PREFIXES``
    (when placed before) or ``ADJ_WHITELIST`` (when placed after).  The first
    resulting phrase that is within 15% of ``target_len`` words, differs from
    the cached correct answer after ``norm_ar`` normalisation, and does not
    ``looks_weird`` is returned.

    Args:
        cand: The distractor phrase to expand.
        sentence_with_blank: Question text containing the ``_____`` blank.
        target_len: Desired word count (that of the correct answer).

    Returns:
        The expanded phrase, or None when no masker is available or no
        prediction qualifies.

    NOTE(review): ``get_masker()`` is assumed to return a Hugging Face
    fill-mask pipeline (it is called as ``masker(text, top_k=...)`` and
    exposes ``tokenizer.mask_token``) -- confirm.
    """
    masker = get_masker()
    if not masker:
        return None

    mask = masker.tokenizer.mask_token
    # Loop-invariant: normalised form of the correct answer to avoid echoing it.
    correct_norm = norm_ar(ref_phrase_cache.get("correct",""))

    # Carry the placement explicitly.  Testing whether the *sentence* starts
    # with the mask token only works when the blank opens the sentence, and
    # otherwise mislabels the "before" trial as an "after" trial.
    trials = (
        (True, f"{mask} {cand}"),
        (False, f"{cand} {mask}"),
    )
    for prepend, filler in trials:
        # Fill only the first blank so the model input has exactly one mask.
        masked = sentence_with_blank.replace("_____", filler, 1)
        try:
            outs = masker(masked, top_k=12)
        except Exception:
            # Best-effort: a failing trial is skipped, not fatal.
            continue
        for o in outs:
            tok = o["token_str"].strip()
            if not is_arabic_word(tok):
                continue
            if prepend:
                if tok not in NOUN_PREFIXES:
                    continue
                phrase = f"{tok} {cand}"
            else:
                if tok not in ADJ_WHITELIST:
                    continue
                phrase = f"{cand} {tok}"
            phrase = clean_spaces(phrase)
            if (
                within_ratio(phrase, target_len, tol=0.15)
                and norm_ar(phrase) != correct_norm
                and not looks_weird(phrase)
            ):
                return phrase
    return None
|
| 361 |
|
| 362 |
def fallback_expand(cand: str, target_len: int) -> str:
    """Lengthen ``cand`` with a generic prefix or suffix, without any model.

    Candidates are ``"<prefix> cand"`` for each entry of ``NOUN_PREFIXES``
    followed by ``"cand <suffix>"`` for each entry of ``ADJ_WHITELIST``.  The
    first candidate whose word count is within 15% of ``target_len`` is
    returned; if none qualifies, the candidate whose word count is closest to
    ``target_len`` is returned.  The result is passed through
    ``clean_spaces``.

    Both constants are sets, whose iteration order for strings changes from
    one interpreter run to the next; they are therefore traversed in sorted
    order so the same input always yields the same expansion.
    """
    candidates = [f"{p} {cand}" for p in sorted(NOUN_PREFIXES)]
    candidates += [f"{cand} {sfx}" for sfx in sorted(ADJ_WHITELIST)]

    for phrase in candidates:
        if within_ratio(phrase, target_len, tol=0.15):
            return clean_spaces(phrase)

    # Nothing fits the tolerance: take the nearest length (first one on ties).
    closest = min(candidates, key=lambda ph: abs(word_len(ph) - target_len))
    return clean_spaces(closest)
|
| 374 |
+
|
| 375 |
+
# --- (H*) ترتيب المرشّحات بالانسجام + الجودة + منع التشابه ---
|
| 376 |
+
def rank_by_sentence_coherence(sentence_with_blank: str, correct: str, candidates: List[str], topk: int=3, full_text: str="") -> List[str]:
    """Rank distractor candidates and return up to ``topk`` dissimilar ones.

    Each candidate gets ``0.7 * coherence + 0.3 * quality``:
      * ``coherence`` is the dot product between the embedding of the sentence
        filled with the candidate and the embedding of the sentence filled
        with ``correct``; it is 0.0 when no embedder is available or
        ``embed_texts`` returns None.
      * ``quality`` comes from ``quality_score``.
    Candidates are then taken in descending score order, skipping any whose
    ``difflib.SequenceMatcher`` ratio against an already kept candidate is
    0.90 or higher.

    Args:
        sentence_with_blank: Question text containing the ``_____`` blank.
        correct: The correct answer used to build the reference sentence.
        candidates: Distractor phrases to rank.
        topk: Maximum number of phrases to return.
        full_text: Whole source text, forwarded to ``quality_score``.

    Returns:
        The selected candidates, best first; an empty list when
        ``candidates`` is empty.
    """
    emb = get_embedder()
    if not candidates:
        return []

    coherence = {}
    if emb:
        ref = sentence_with_blank.replace("_____", correct)
        filled = [sentence_with_blank.replace("_____", c) for c in candidates]
        vecs = embed_texts([ref] + filled)
        if vecs is not None:
            # Row 0 is the reference sentence; the remaining rows are candidates.
            sims = vecs[1:] @ vecs[0]
            for i, c in enumerate(candidates):
                coherence[c] = float(sims[i])

    # Blend the two signals once per candidate.
    scores = {
        c: 0.7*coherence.get(c, 0.0) + 0.3*quality_score(c, sentence_with_blank, full_text)
        for c in candidates
    }
    ranked = sorted(candidates, key=scores.__getitem__, reverse=True)

    kept = []
    for c in ranked:
        # Drop near-duplicates of anything already selected.
        if all(difflib.SequenceMatcher(None, c, x).ratio() < 0.90 for x in kept):
            kept.append(c)
        if len(kept) >= topk:
            break
    return kept[:topk]
|
| 404 |
+
|
| 405 |
+
# --- (I) حصاد مصطلحات احتياطية عالية التكرار من النص كله ---
|
| 406 |
+
def harvest_backup_terms(text: str, limit: int = 400) -> List[str]:
    """Collect the most frequent acceptable terms of ``text`` as backup distractors.

    Tokens (a letter followed by at least one letter, digit, underscore or
    hyphen) are normalised with ``norm_ar``; those rejected by ``good_kw`` are
    dropped.  The remaining terms are returned most-frequent first, ties kept
    in order of first appearance, truncated to ``limit`` entries.
    """
    counts = {}
    for raw in re2.findall(r"[\p{L}][\p{L}\p{N}_\-]{1,}", text):
        term = norm_ar(raw)
        if good_kw(term):
            counts[term] = counts.get(term, 0) + 1

    # Stable sort on negated count keeps first-seen order among equal counts.
    ordered = sorted(counts, key=lambda term: -counts[term])
    return ordered[:limit]
|
| 416 |
|
| 417 |
# --- (J) مشتّتات ذكية تضمن دائمًا ≥3 خيارات فعلية + موازنة الطول ---
|
| 418 |
def smart_distractors(correct: str, phrase_pool: List[str], sentence_with_blank: str, backup_terms: List[str], k: int = 3) -> List[str]:
|
| 419 |
target = correct.strip()
|
| 420 |
+
ref_phrase_cache["correct"] = target
|
| 421 |
|
|
|
|
| 422 |
neigh = nearest_terms(target, phrase_pool, k=48)
|
| 423 |
mlm = mlm_fill(sentence_with_blank, target, k=24)
|
| 424 |
|
|
|
|
| 433 |
if w not in seen:
|
| 434 |
seen.add(w); raw_pool.append(w)
|
| 435 |
|
|
|
|
| 436 |
for w in backup_terms:
|
| 437 |
if len(raw_pool) >= max(60, k*10): break
|
| 438 |
if not w or norm_ar(w) == norm_ar(target):
|
|
|
|
| 442 |
if w not in seen:
|
| 443 |
seen.add(w); raw_pool.append(w)
|
| 444 |
|
|
|
|
| 445 |
filtered = []
|
| 446 |
for w in raw_pool:
|
| 447 |
if same_pos(target, w):
|
|
|
|
| 451 |
if not filtered:
|
| 452 |
filtered = raw_pool[:max(24, k*6)]
|
| 453 |
|
|
|
|
| 454 |
target_words = word_len(target)
|
| 455 |
shaped = []
|
| 456 |
for w in filtered:
|
| 457 |
+
cand = shape_phrase_like(target, w)
|
| 458 |
+
if within_ratio(cand, target_words, tol=0.15) and not looks_weird(cand):
|
| 459 |
+
shaped.append(clean_spaces(cand))
|
| 460 |
continue
|
|
|
|
| 461 |
expanded = try_mlm_expand(cand, sentence_with_blank, target_words)
|
| 462 |
+
if expanded and within_ratio(expanded, target_words, tol=0.15) and not looks_weird(expanded):
|
| 463 |
+
shaped.append(clean_spaces(expanded))
|
| 464 |
continue
|
|
|
|
| 465 |
fb = fallback_expand(cand, target_words)
|
| 466 |
+
if not looks_weird(fb):
|
| 467 |
+
shaped.append(clean_spaces(fb))
|
| 468 |
|
|
|
|
| 469 |
shaped = [s for s in shaped if norm_ar(s) != norm_ar(target)]
|
| 470 |
|
| 471 |
+
ranked = rank_by_sentence_coherence(
|
| 472 |
+
sentence_with_blank, target, shaped, topk=max(k, 12), full_text=global_full_text_cache
|
| 473 |
+
)
|
| 474 |
|
|
|
|
| 475 |
out = []
|
| 476 |
for src in [ranked, shaped, filtered, raw_pool, backup_terms]:
|
| 477 |
for w in src:
|
| 478 |
if len(out) >= k: break
|
| 479 |
+
if w and norm_ar(w) != norm_ar(target) and w not in out and not looks_weird(w):
|
| 480 |
out.append(w)
|
| 481 |
if len(out) >= k: break
|
| 482 |
|
|
|
|
| 483 |
if len(out) < k:
|
| 484 |
while len(out) < k and ranked:
|
| 485 |
out.append(ranked[len(out) % len(ranked)])
|
|
|
|
| 488 |
|
| 489 |
# ------------------ مُولِّد الأسئلة ------------------
|
| 490 |
def make_mcqs(text:str, n:int=6)->List[MCQ]:
|
| 491 |
+
global global_full_text_cache
|
| 492 |
+
global_full_text_cache = text
|
| 493 |
+
|
| 494 |
sents = split_sents(text)
|
| 495 |
if not sents:
|
| 496 |
raise ValueError("النص قصير أو غير صالح.")
|
|
|
|
| 501 |
freq = [w for w,_ in sorted(((t, text.count(t)) for t in tokens), key=lambda x:-x[1])]
|
| 502 |
keyphrases = freq[:160]
|
| 503 |
|
|
|
|
| 504 |
backup_terms = harvest_backup_terms(text, limit=400)
|
| 505 |
|
| 506 |
kp2best_sent = {}
|
|
|
|
| 529 |
|
| 530 |
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)
|
| 531 |
|
|
|
|
| 532 |
pool = [x for x in keyphrases if x != kp]
|
| 533 |
distracts = smart_distractors(kp, pool, q, backup_terms, k=3)
|
| 534 |
|
| 535 |
ch = distracts + [kp]
|
| 536 |
+
|
| 537 |
+
# ترتيب غير عشوائي: تدوير حتمي لموضع الصحيحة
|
| 538 |
+
# 1) ضع الصحيحة مؤقتًا في النهاية
|
| 539 |
+
ch_sorted = sorted(ch, key=lambda c: c != kp)
|
| 540 |
+
# 2) تدوير بناءً على رقم السؤال (طول القائمة الحالية) وهاش العبارة
|
| 541 |
+
rot = (len(items) + (hash(kp) & 3)) % 4
|
| 542 |
+
ch = ch_sorted[-rot:] + ch_sorted[:-rot]
|
| 543 |
+
|
| 544 |
ans = ch.index(kp)
|
| 545 |
|
| 546 |
items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
|
|
|
|
| 557 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 558 |
txt=(it.choices[i] if i<len(it.choices) else "—").strip()
|
| 559 |
txt=txt.replace(",", "،").replace("?", "؟").replace(";", "؛")
|
|
|
|
| 560 |
if txt == "—" or not txt:
|
| 561 |
txt = "خيار"
|
| 562 |
opts.append({"id":lbl,"text":txt or "خيار","is_correct":(i==it.answer_index)})
|
|
|
|
| 642 |
.q-badge.ok{background:#0f2f22;color:#b6f4db;border:1px solid #145b44}
|
| 643 |
.q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
|
| 644 |
|
| 645 |
+
.q-text{color:#الtext;font-size:1.06rem;line-height:1.8;margin:8px 0 12px}
|
| 646 |
.opts{display:flex;flex-direction:column;gap:8px}
|
| 647 |
.opt{display:flex;gap:10px;align-items:center;background:#14161c;border:1px solid #2a2d3a;border-radius:12px;padding:10px;transition:background .15s,border-color .15s}
|
| 648 |
.opt input{accent-color:var(--accent2)}
|