Spaces:

Leen172
/

Question_generator

Sleeping

App Files Files Community

Leen172 committed on Oct 28

Commit

254fb45

verified ·

1 Parent(s): 739131d

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -34

app.py CHANGED Viewed

@@ -21,8 +21,6 @@ import regex as re2
 import yake
 from tqdm import tqdm
-# ملاحظة: سنستورد torch/transformers داخل الدوال (تحميل كسول) لسرعة الإقلاع.
 # =========================
 # إعدادات عامة
 # =========================
@@ -32,7 +30,7 @@ DEFAULT_NUM_QUESTIONS = 8
 DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"  # أسرع من large
 DEFAULT_TROCR_ZOOM = 2.8
-# كاش بسيط للـ OCR pipeline
 _OCR_PIPE = {}
 def _get_ocr_pipeline(model_id: str):
     """تحميل كسول + كاش لنموذج TrOCR."""
@@ -144,13 +142,15 @@ def normalize_arabic(text: str) -> str:
     text = re2.sub(r"[إأآا]", "ا", text)
     text = re2.sub(r"[يى]", "ي", text)
     text = re2.sub(r"\s+", " ", text)
     return text.strip()
 def arabic_ocr_fixes(text: str) -> str:
     fixes = {
         " الصطناعي": " الاصطناعي",
         "صطناعي": "اصطناعي",
-        "التعل م": "التعلم",
         "الذكاء الاصطناعيي": "الذكاء الاصطناعي",
         "ذكاء صطناعي": "ذكاء اصطناعي",
         "الذكاء الاصطناعي.": "الذكاء الاصطناعي.",
@@ -158,6 +158,8 @@ def arabic_ocr_fixes(text: str) -> str:
         " مع غني": " غني",
         "مع غني ": " غني ",
         " غير المشبعة": " غيرُ المشبعة",
     }
     for wrong, right in fixes.items():
         text = text.replace(wrong, right)
@@ -218,7 +220,15 @@ def split_sentences(text: str) -> List[str]:
     sents = [s.strip() for s in SENT_SPLIT.split(text) if s.strip()]
     return [s for s in sents if len(s) >= 25]
 def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
     cand = []
     for w in pool:
         if not w:
@@ -226,11 +236,13 @@ def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
         w2 = w.strip()
         if w2 == correct.strip():
             continue
-        if len(w2) < 3:
             continue
-        if w2 in AR_STOP:
             continue
-        cand.append(w2)
     random.shuffle(cand)
     out = []
@@ -239,7 +251,7 @@ def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
         if len(out) == k:
             break
-    fillers = ["—", "-", "—-"]
     while len(out) < k:
         out.append(random.choice(fillers))
     return out
@@ -261,6 +273,8 @@ def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
     sent_for_kw = {}
     for s in sentences:
         for kw in keywords:
             if re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for_kw:
                 sent_for_kw[kw] = s
@@ -271,6 +285,8 @@ def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
     for kw in pool_iter:
         if len(items) >= n:
             break
         s = sent_for_kw[kw]
         if s in used_sents:
             continue
@@ -323,7 +339,7 @@ def is_bad_choice(txt: str) -> bool:
         return True
     return False
-def build_json_records(items: List[MCQ], lang: str, source_pdf: str, method: str):
     json_data = []
     letters = ["A", "B", "C", "D"]
     for it in items:
@@ -353,14 +369,15 @@ def build_json_records(items: List[MCQ], lang: str, source_pdf: str, method: str
                 "lang": lang,
                 "normalized": True,
                 "source_pdf": source_pdf,
-                "extraction_method": method
             }
         }
         json_data.append(record)
     return json_data
 # =========================
-# 7) الدالة الرئيسية (تتعامل مع Filepath من Gradio)
 # =========================
 def process_pdf(pdf_file_path,
                 num_questions=DEFAULT_NUM_QUESTIONS,
@@ -370,27 +387,37 @@ def process_pdf(pdf_file_path,
     logs = []
     try:
         if not pdf_file_path:
-            return {}, None, "يرجى رفع ملف PDF أولاً."
         # pdf_file_path قد يكون str أو NamedString -> خذه كمسار
         src_path = str(pdf_file_path)
-        # اسم ملف مناسب
         name_guess = getattr(pdf_file_path, "name", "") if hasattr(pdf_file_path, "name") else ""
-        filename = Path(name_guess).name or Path(src_path).name or "input.pdf"
         if not Path(filename).suffix:
-            filename += ".pdf"
-        workdir = tempfile.mkdtemp(prefix="mcq_")
-        pdf_path = os.path.join(workdir, filename)
-        shutil.copy(src_path, pdf_path)
-        logs.append(f"تم نسخ الملف إلى: {pdf_path}")
-        # 1) استخراج النص
-        raw_text, out_txt_path, method = pdf_to_txt(
-            pdf_path=pdf_path,
-            ocr_model=trocr_model,
-            ocr_zoom=float(trocr_zoom)
-        )
         logs.append(f"طريقة الاستخراج: {method}")
         # 2) تنظيف/تطبيع
@@ -403,7 +430,9 @@ def process_pdf(pdf_file_path,
         logs.append(f"تم توليد {len(items)} سؤالاً.")
         # 4) بناء JSON
-        json_records = build_json_records(items, lang=lang, source_pdf=Path(filename).name, method=method)
         json_str = json.dumps(json_records, ensure_ascii=False, indent=2)
         # 5) حفظ ملف JSON للتنزيل
@@ -423,15 +452,15 @@ def process_pdf(pdf_file_path,
 # =========================
 import gradio as gr
-with gr.Blocks(title="PDF → MCQ JSON (Arabic YAKE / TrOCR)") as demo:
-    gr.Markdown("## تحويل PDF إلى أسئلة اختيار من متعدد وإرجاع JSON جاهز للواجهة")
     with gr.Row():
         inp_pdf = gr.File(
-            label="ارفع PDF",
             file_count="single",
-            file_types=[".pdf"],
-            type="filepath",  # مهم: يُعيد مسار الملف
         )
         with gr.Column():
             num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
@@ -444,7 +473,7 @@ with gr.Blocks(title="PDF → MCQ JSON (Arabic YAKE / TrOCR)") as demo:
                     "microsoft/trocr-large-handwritten",
                 ],
                 value=DEFAULT_TROCR_MODEL,
-                label="موديل TrOCR"
             )
     btn = gr.Button("تشغيل المعالجة", variant="primary")
@@ -459,6 +488,5 @@ with gr.Blocks(title="PDF → MCQ JSON (Arabic YAKE / TrOCR)") as demo:
     )
 # ملاحظة: Spaces تتعرف تلقائياً على المتغير "demo".
-# لو شغّلت محلياً:
 if __name__ == "__main__":
     demo.queue().launch()

 import yake
 from tqdm import tqdm
 # =========================
 # إعدادات عامة
 # =========================
 DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"  # أسرع من large
 DEFAULT_TROCR_ZOOM = 2.8
+# كاش بسيط للـ OCR pipeline (تحميل كسول)
 _OCR_PIPE = {}
 def _get_ocr_pipeline(model_id: str):
     """تحميل كسول + كاش لنموذج TrOCR."""
     text = re2.sub(r"[إأآا]", "ا", text)
     text = re2.sub(r"[يى]", "ي", text)
     text = re2.sub(r"\s+", " ", text)
+    # إزالة التكرار الزائد للحروف (مثل جذرياا -> جذريا)
+    text = re2.sub(r'(\p{L})\1{2,}', r'\1', text)  # أكثر من مرتين
+    text = re2.sub(r'(\p{L})\1', r'\1', text)      # التكرار المتبقي
     return text.strip()
 def arabic_ocr_fixes(text: str) -> str:
     fixes = {
         " الصطناعي": " الاصطناعي",
         "صطناعي": "اصطناعي",
         "الذكاء الاصطناعيي": "الذكاء الاصطناعي",
         "ذكاء صطناعي": "ذكاء اصطناعي",
         "الذكاء الاصطناعي.": "الذكاء الاصطناعي.",
         " مع غني": " غني",
         "مع غني ": " غني ",
         " غير المشبعة": " غيرُ المشبعة",
+        "الااصطناعي": "الاصطناعي",
+        "وشخصياا": "وشخصياً",
     }
     for wrong, right in fixes.items():
         text = text.replace(wrong, right)
     sents = [s.strip() for s in SENT_SPLIT.split(text) if s.strip()]
     return [s for s in sents if len(s) >= 25]
+def _is_good_kw(kw: str) -> bool:
+    if not kw or len(kw) < 3: return False
+    if kw in AR_STOP: return False
+    if re2.match(r"^[\p{P}\p{S}\d_]+$", kw): return False
+    return True
 def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
+    """ملهيات أقرب طولياً للسياق."""
+    target_len = len(correct.strip())
     cand = []
     for w in pool:
         if not w:
         w2 = w.strip()
         if w2 == correct.strip():
             continue
+        if len(w2) < 3 or w2 in AR_STOP:
             continue
+        if re2.match(r"^[\p{P}\p{S}\d_]+$", w2):
             continue
+        # تقارب طولي
+        if abs(len(w2) - target_len) <= 3:
+            cand.append(w2)
     random.shuffle(cand)
     out = []
         if len(out) == k:
             break
+    fillers = ["—", "— —", "—-"]
     while len(out) < k:
         out.append(random.choice(fillers))
     return out
     sent_for_kw = {}
     for s in sentences:
         for kw in keywords:
+            if not _is_good_kw(kw):
+                continue
             if re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for_kw:
                 sent_for_kw[kw] = s
     for kw in pool_iter:
         if len(items) >= n:
             break
+        if not _is_good_kw(kw):
+            continue
         s = sent_for_kw[kw]
         if s in used_sents:
             continue
         return True
     return False
+def build_json_records(items: List[MCQ], lang: str, source_pdf: str, method: str, num_questions: int):
     json_data = []
     letters = ["A", "B", "C", "D"]
     for it in items:
                 "lang": lang,
                 "normalized": True,
                 "source_pdf": source_pdf,
+                "extraction_method": method,
+                "num_questions": int(num_questions),
             }
         }
         json_data.append(record)
     return json_data
 # =========================
+# 7) الدالة الرئيسية (دعم PDF و TXT)
 # =========================
 def process_pdf(pdf_file_path,
                 num_questions=DEFAULT_NUM_QUESTIONS,
     logs = []
     try:
         if not pdf_file_path:
+            return {}, None, "يرجى رفع ملف PDF/TXT أولاً."
         # pdf_file_path قد يكون str أو NamedString -> خذه كمسار
         src_path = str(pdf_file_path)
         name_guess = getattr(pdf_file_path, "name", "") if hasattr(pdf_file_path, "name") else ""
+        filename = Path(name_guess).name or Path(src_path).name or "input"
+        workdir = tempfile.mkdtemp(prefix="mcq_")
+        # تأكد من الامتداد
+        ext = Path(filename).suffix.lower()
+        if ext not in [".pdf", ".txt"]:
+            # حاول تخمين نوعه، افتراض PDF
+            ext = ".pdf"
         if not Path(filename).suffix:
+            filename += ext
+        local_path = os.path.join(workdir, filename)
+        shutil.copy(src_path, local_path)
+        logs.append(f"تم نسخ الملف إلى: {local_path}")
+        # 1) استخراج النص بحسب النوع
+        if ext == ".txt":
+            with open(local_path, "r", encoding="utf-8", errors="ignore") as f:
+                raw_text = f.read()
+            method = "plain text (no PDF)"
+        else:
+            raw_text, out_txt_path, method = pdf_to_txt(
+                pdf_path=local_path,
+                ocr_model=trocr_model,
+                ocr_zoom=float(trocr_zoom)
+            )
         logs.append(f"طريقة الاستخراج: {method}")
         # 2) تنظيف/تطبيع
         logs.append(f"تم توليد {len(items)} سؤالاً.")
         # 4) بناء JSON
+        json_records = build_json_records(
+            items, lang=lang, source_pdf=Path(filename).name, method=method, num_questions=num_questions
+        )
         json_str = json.dumps(json_records, ensure_ascii=False, indent=2)
         # 5) حفظ ملف JSON للتنزيل
 # =========================
 import gradio as gr
+with gr.Blocks(title="PDF/TXT → MCQ JSON (Arabic YAKE / TrOCR)") as demo:
+    gr.Markdown("## تحويل PDF/TXT إلى أسئلة اختيار من متعدد وإرجاع JSON جاهز للواجهة")
     with gr.Row():
         inp_pdf = gr.File(
+            label="ارفع PDF أو TXT",
             file_count="single",
+            file_types=[".pdf", ".txt"],
+            type="filepath",  # يُعيد مسار الملف
         )
         with gr.Column():
             num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
                     "microsoft/trocr-large-handwritten",
                 ],
                 value=DEFAULT_TROCR_MODEL,
+                label="موديل TrOCR (للـ PDF المصوّر)"
             )
     btn = gr.Button("تشغيل المعالجة", variant="primary")
     )
 # ملاحظة: Spaces تتعرف تلقائياً على المتغير "demo".
 if __name__ == "__main__":
     demo.queue().launch()