Spaces:

pnnbao-ump
/

MedCrab

Running on Zero

App Files Files Community

pnnbao-ump committed 13 days ago

Commit

2945c35

1 Parent(s): 8f62483

first init

Browse files

Files changed (8) hide show

README.md +47 -7
app.py +471 -0
medcrab/__init__.py +3 -0
medcrab/__pycache__/__init__.cpython-312.pyc +0 -0
medcrab/__pycache__/medcrab.cpython-312.pyc +0 -0
medcrab/medcrab.py +150 -0
readme_md.md +61 -0
requirements.txt +11 -0

README.md CHANGED Viewed

@@ -1,14 +1,54 @@
 ---
-title: MedCrab
-emoji: 🏃
-colorFrom: yellow
-colorTo: purple
 sdk: gradio
-sdk_version: 6.0.1
 app_file: app.py
-pinned: false
 license: cc-by-nc-4.0
 short_description: Medical PDF Translator
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: MedCrab Translation
+emoji: 🦀
+colorFrom: purple
+colorTo: blue
 sdk: gradio
+sdk_version: 5.0.0
 app_file: app.py
+pinned: true
 license: cc-by-nc-4.0
 short_description: Medical PDF Translator
 ---
+# 🦀 MedCrab Translation
+Ứng dụng quét OCR tài liệu y khoa và dịch trực tiếp sang tiếng Việt với hiệu ứng streaming.
+## Tính năng
+- 📄 Hỗ trợ PDF và hình ảnh
+- 🔍 OCR chính xác với DeepSeek-OCR
+- 🦀 Dịch y khoa chuyên sâu với MedCrab
+- ⚡ Streaming real-time
+- 🎨 Giao diện thân thiện
+## Sử dụng
+1. Tải lên file PDF hoặc hình ảnh y khoa
+2. Chọn số trang (nếu là PDF)
+3. Chọn chế độ OCR
+4. Nhấn "Quét OCR + Dịch tiếng Việt"
+## Chế độ OCR
+- **Crab**: Chế độ cân bằng (khuyên dùng)
+- **Base**: Chế độ nhanh
+## Yêu cầu GPU
+Space này cần GPU để chạy. Hugging Face cung cấp GPU miễn phí với giới hạn thời gian sử dụng.
+## License
+Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
+This work is licensed under CC BY-NC 4.0. You are free to:
+- Share: copy and redistribute the material
+- Adapt: remix, transform, and build upon the material
+Under the following terms:
+- Attribution: You must give appropriate credit
+- NonCommercial: You may not use the material for commercial purposes
+See: https://creativecommons.org/licenses/by-nc/4.0/

app.py ADDED Viewed

	@@ -0,0 +1,471 @@

+import gradio as gr
+from transformers import AutoModel, AutoTokenizer
+from medcrab import MedCrabTranslator
+import torch
+import os
+import sys
+import tempfile
+import shutil
+from PIL import Image, ImageOps
+import fitz
+import re
+import time
+from threading import Thread
+from queue import Queue
+from io import StringIO, BytesIO
+import spaces
+# ==================== DEEPSEEK OCR SETUP ====================
+# Hub id of the OCR checkpoint; tokenizer and model below are loaded once, at import time.
+OCR_MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'
+print("🔄 Loading OCR model...")
+# trust_remote_code=True lets transformers run Python code shipped in the model repository.
+ocr_tokenizer = AutoTokenizer.from_pretrained(OCR_MODEL_NAME, trust_remote_code=True)
+# First attempt uses Flash Attention 2; if that load raises ImportError or
+# ValueError, the same checkpoint is loaded again with eager attention.
+try:
+    ocr_model = AutoModel.from_pretrained(
+        OCR_MODEL_NAME,
+        attn_implementation='flash_attention_2',
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        use_safetensors=True
+    )
+    print("✅ Using Flash Attention 2")
+except (ImportError, ValueError):
+    print("⚠️ Flash Attention 2 not available, using eager attention")
+    ocr_model = AutoModel.from_pretrained(
+        OCR_MODEL_NAME,
+        attn_implementation='eager',
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        use_safetensors=True
+    )
+ocr_model = ocr_model.eval()
+# Per-mode keyword arguments that ocr_process_image() forwards to ocr_model.infer().
+# The keys are also the choices offered by the "mode" dropdown in the UI.
+MODEL_CONFIGS = {
+    "Crab": {"base_size": 1024, "image_size": 640, "crop_mode": True},
+    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
+}
+# ==================== MEDCRAB TRANSLATOR SETUP ====================
+# Only the placeholder is created here: despite the message printed below, the
+# translator object is built later, on the first call to init_translator().
+print("🦀 Loading MedCrab translator...")
+translator = None
+def init_translator():
+    global translator
+    if translator is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        translator = MedCrabTranslator(device=device)
+        print(f"✅ MedCrab translator loaded on {device}")
+# ==================== TEXT CLEANING FUNCTIONS ====================
+def clean_mathrm(text):
+    """Chuyển đổi LaTeX sang HTML với subscript/superscript chỉ trong môi trường toán học"""
+    if not text:
+        return ""
+    def process_math_block(match):
+        math_content = match.group(1)
+        math_content = re.sub(r'\\mathrm\{([^}]*)\}', r'\1', math_content)
+        math_content = re.sub(r'\^\{([^}]+)\}', r'<sup>\1</sup>', math_content)
+        math_content = re.sub(r'\^([A-Za-z0-9+\-]+)', r'<sup>\1</sup>', math_content)
+        math_content = re.sub(r'_\{([^}]+)\}', r'<sub>\1</sub>', math_content)
+        math_content = re.sub(r'_([A-Za-z0-9+\-]+)', r'<sub>\1</sub>', math_content)
+        replacements = {
+            r'\times': '×', r'\pm': '±', r'\div': '÷', r'\cdot': '·',
+            r'\approx': '≈', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
+            r'\rightarrow': '→', r'\leftarrow': '←',
+            r'\Rightarrow': '⇒', r'\Leftarrow': '⇐',
+        }
+        for latex_cmd, unicode_char in replacements.items():
+            math_content = math_content.replace(latex_cmd, unicode_char)
+        return math_content
+    text = re.sub(r'\\\((.+?)\\\)', process_math_block, text, flags=re.DOTALL)
+    def process_bracket_block(m):
+        class FakeMatch:
+            def __init__(self, content):
+                self.content = content
+            def group(self, n):
+                return self.content
+        content = process_math_block(FakeMatch(m.group(1)))
+        return '[' + content + ']'
+    text = re.sub(r'\\\[(.+?)\\\]', process_bracket_block, text, flags=re.DOTALL)
+    text = re.sub(r'\\mathrm\{([^}]*)\}', r'\1', text)
+    text = text.replace(r'\%', '%')
+    lines = text.split('\n')
+    cleaned_lines = [re.sub(r'[ \t]+', ' ', line).strip() for line in lines]
+    text = '\n'.join(cleaned_lines)
+    return text.strip()
+def clean_output(text, include_images=False, remove_labels=False):
+    if not text:
+        return ""
+    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
+    matches = re.findall(pattern, text, re.DOTALL)
+    img_num = 0
+    for match in matches:
+        if '<|ref|>image<|/ref|>' in match[0]:
+            if include_images:
+                text = text.replace(match[0], f'\n\n**[Figure {img_num + 1}]**\n\n', 1)
+                img_num += 1
+            else:
+                text = text.replace(match[0], '', 1)
+        else:
+            if remove_labels:
+                text = text.replace(match[0], '', 1)
+            else:
+                text = text.replace(match[0], match[1], 1)
+    text = clean_mathrm(text)
+    return text.strip()
+# ==================== OCR FUNCTIONS ====================
+@spaces.GPU
+def ocr_process_image(image, mode="Crab"):
+    if image is None:
+        return "Error: Upload image"
+    if image.mode in ('RGBA', 'LA', 'P'):
+        image = image.convert('RGB')
+    image = ImageOps.exif_transpose(image)
+    config = MODEL_CONFIGS[mode]
+    prompt = "<image>\n<|grounding|>Convert the document to markdown."
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
+    image.save(tmp.name, 'JPEG', quality=95)
+    tmp.close()
+    out_dir = tempfile.mkdtemp()
+    stdout = sys.stdout
+    sys.stdout = StringIO()
+    ocr_model.infer(
+        tokenizer=ocr_tokenizer,
+        prompt=prompt,
+        image_file=tmp.name,
+        output_path=out_dir,
+        base_size=config["base_size"],
+        image_size=config["image_size"],
+        crop_mode=config["crop_mode"]
+    )
+    result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
+                        if not any(s in l for s in ['image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size'])]).strip()
+    sys.stdout = stdout
+    os.unlink(tmp.name)
+    shutil.rmtree(out_dir, ignore_errors=True)
+    if not result:
+        return "No text detected"
+    markdown = clean_output(result, True, True)
+    return markdown
+def ocr_process_pdf(path, mode, page_num):
+    doc = fitz.open(path)
+    total_pages = len(doc)
+    if page_num < 1 or page_num > total_pages:
+        doc.close()
+        return f"Invalid page number. PDF has {total_pages} pages."
+    page = doc.load_page(page_num - 1)
+    pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
+    img = Image.open(BytesIO(pix.tobytes("png")))
+    doc.close()
+    return ocr_process_image(img, mode)
+def ocr_process_file(path, mode, page_num):
+    if not path:
+        return "Error: Upload file"
+    if path.lower().endswith('.pdf'):
+        return ocr_process_pdf(path, mode, page_num)
+    else:
+        return ocr_process_image(Image.open(path), mode)
+# ==================== TRANSLATION FUNCTIONS ====================
+def split_by_sentences(text: str, max_words: int = 100) -> list[tuple[str, int]]:
+    """Split text into translation-sized chunks, remembering line breaks.
+
+    The text is processed line by line. A line of at most ``max_words``
+    whitespace-separated words becomes one chunk. A longer line is split
+    after ``.``, ``!`` or ``?`` and its sentences are packed greedily up to
+    ``max_words``; a single sentence that is still too long is split again
+    at commas.
+
+    Args:
+        text: text to split.
+        max_words: word budget per chunk. A comma-free piece longer than
+            this is kept whole, so chunks can exceed the budget.
+
+    Returns:
+        A list of ``(chunk_text, newline_count)`` tuples. ``newline_count``
+        is the number of line breaks to emit after the chunk: 0 for pieces
+        in the middle of a split line and for the final line of ``text``,
+        1 for the end of any other line, plus one for every blank line that
+        follows. Blank lines before the first chunk are dropped.
+    """
+    def count_words(t):
+        return len(t.strip().split())
+    chunks = []
+    lines = text.split('\n')
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        empty_count = 0
+        if not line.strip():
+            # Consume the whole run of blank lines and credit it to the
+            # previous chunk's newline count (nothing to credit if none yet).
+            while i < len(lines) and not lines[i].strip():
+                empty_count += 1
+                i += 1
+            if chunks:
+                prev_text, prev_newlines = chunks[-1]
+                chunks[-1] = (prev_text, prev_newlines + empty_count)
+            continue
+        line = line.strip()
+        # Judged by index only: trailing blank lines after the last text line
+        # make this False for that text line.
+        is_last_line = (i == len(lines) - 1)
+        if count_words(line) <= max_words:
+            # Short line: emit as-is.
+            chunks.append((line, 0 if is_last_line else 1))
+            i += 1
+            continue
+        # Long line: split after sentence-ending punctuation followed by whitespace.
+        sentences = re.split(r'(?<=[.!?])\s+', line)
+        current_chunk = ""
+        current_words = 0
+        for sent_idx, sentence in enumerate(sentences):
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            sentence_words = count_words(sentence)
+            if sentence_words > max_words:
+                # Oversized sentence: flush what is pending, then pack its
+                # comma-separated parts greedily.
+                if current_chunk:
+                    chunks.append((current_chunk.strip(), 0))
+                    current_chunk = ""
+                    current_words = 0
+                sub_parts = re.split(r',\s*', sentence)
+                temp_chunk = ""
+                temp_words = 0
+                for part in sub_parts:
+                    part_words = count_words(part)
+                    if temp_words + part_words > max_words and temp_chunk:
+                        # The comma at this boundary is not re-added to either side.
+                        chunks.append((temp_chunk.strip(), 0))
+                        temp_chunk = part
+                        temp_words = part_words
+                    else:
+                        if temp_chunk:
+                            temp_chunk += ", " + part
+                        else:
+                            temp_chunk = part
+                        temp_words += part_words
+                # The unflushed tail becomes the pending chunk so following
+                # sentences can still be appended to it.
+                if temp_chunk.strip():
+                    current_chunk = temp_chunk.strip()
+                    current_words = temp_words
+            elif current_words + sentence_words <= max_words:
+                # Sentence fits into the pending chunk.
+                if current_chunk:
+                    current_chunk += " " + sentence
+                else:
+                    current_chunk = sentence
+                current_words += sentence_words
+            else:
+                # Pending chunk is full: emit it and start a new one with this sentence.
+                chunks.append((current_chunk.strip(), 0))
+                current_chunk = sentence
+                current_words = sentence_words
+        # Only the last piece of the line carries the line's newline.
+        if current_chunk.strip():
+            chunks.append((current_chunk.strip(), 0 if is_last_line else 1))
+        i += 1
+    return chunks
+@spaces.GPU
+def translate_chunk(chunk_text):
+    init_translator()
+    return translator.translate(chunk_text, max_new_tokens=2048).strip()
+def streaming_translate(text: str):
+    if not text or not text.strip():
+        yield '<div style="padding:20px; color:#ff6b6b;">⚠️ Vui lòng nhập văn bản tiếng Anh để dịch.</div>'
+        return
+    chunks = split_by_sentences(text, max_words=100)
+    accumulated = ""
+    for i, (chunk_text, newline_count) in enumerate(chunks):
+        try:
+            translated = translate_chunk(chunk_text)
+            if accumulated and not accumulated.endswith('\n'):
+                accumulated += " " + translated
+            else:
+                accumulated += translated
+            chunk_start = len(accumulated) - len(translated)
+            for j in range(len(translated)):
+                current_display = accumulated[:chunk_start + j + 1]
+                html_output = f'<div style="padding:20px; line-height:1.8; font-size:15px; white-space:pre-wrap; font-family:Arial,sans-serif;">{current_display}</div>'
+                yield html_output
+                time.sleep(0.015)
+            if newline_count > 0:
+                actual_newlines = min(newline_count, 2)
+                accumulated += "\n" * actual_newlines
+                html_output = f'<div style="padding:20px; line-height:1.8; font-size:15px; white-space:pre-wrap; font-family:Arial,sans-serif;">{accumulated}</div>'
+                yield html_output
+        except Exception as e:
+            yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi dịch chunk {i+1}: {str(e)}</div>'
+            return
+# ==================== UI HELPER FUNCTIONS ====================
+def load_image(file_path, page_num_str="1"):
+    if not file_path:
+        return None
+    try:
+        try:
+            page_num = int(page_num_str)
+        except (ValueError, TypeError):
+            page_num = 1
+        if file_path.lower().endswith('.pdf'):
+            doc = fitz.open(file_path)
+            page_idx = max(0, min(page_num - 1, len(doc) - 1))
+            page = doc.load_page(page_idx)
+            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
+            img = Image.open(BytesIO(pix.tobytes("png")))
+            doc.close()
+            return img
+        else:
+            return Image.open(file_path)
+    except Exception as e:
+        print(f"Error loading image: {e}")
+        return None
+def get_pdf_page_count(file_path):
+    if not file_path or not file_path.lower().endswith('.pdf'):
+        return 1
+    try:
+        doc = fitz.open(file_path)
+        count = len(doc)
+        doc.close()
+        return count
+    except Exception as e:
+        print(f"Error reading PDF page count: {e}")
+        return 1
+def update_page_info(file_path):
+    if not file_path:
+        return gr.update(label="Số trang (chỉ dùng cho PDF, mặc định: 1)")
+    if file_path.lower().endswith('.pdf'):
+        page_count = get_pdf_page_count(file_path)
+        return gr.update(
+            label=f"Số trang (PDF có {page_count} trang, nhập 1-{page_count})",
+            value="1"
+        )
+    return gr.update(
+        label="Số trang (chỉ dùng cho PDF, mặc định: 1)",
+        value="1"
+    )
+# ==================== COMBINED OCR + TRANSLATION ====================
+def ocr_and_translate_streaming(file_path, mode, page_num_str):
+    """Hàm kết hợp: OCR trước, sau đó dịch streaming"""
+    if not file_path:
+        yield '<div style="padding:20px; color:#ff6b6b;">⚠️ Vui lòng tải file lên trước!</div>'
+        return
+    yield '<div style="padding:20px; color:#4CAF50;">🔍 Đang quét OCR...</div>'
+    try:
+        try:
+            page_num = int(page_num_str)
+        except (ValueError, TypeError):
+            page_num = 1
+        markdown = ocr_process_file(file_path, mode, page_num)
+        if not markdown or markdown.startswith("Error") or markdown.startswith("Invalid"):
+            yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi OCR: {markdown}</div>'
+            return
+    except Exception as e:
+        yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi OCR: {str(e)}</div>'
+        return
+    yield '<div style="padding:20px; color:#2196F3;">🦀 Đang dịch...</div>'
+    time.sleep(0.5)
+    try:
+        yield from streaming_translate(markdown)
+    except Exception as e:
+        yield f'<div style="padding:20px; color:#ff6b6b;">❌ Lỗi dịch: {str(e)}</div>'
+# ==================== GRADIO INTERFACE ====================
+css = """
+footer { visibility: hidden }
+.main-title {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    padding: 15px;
+    border-radius: 10px;
+    text-align: center;
+    margin-bottom: 20px;
+}
+"""
+# Two-column layout: upload/preview/controls on the left, streamed translation on the right.
+with gr.Blocks(theme=gr.themes.Soft(), css=css, title="OCR + Translation") as demo:
+    gr.Markdown("""
+    <div class="main-title">
+    <h1>🦀 MedCrab Translation</h1>
+    <p><b>Quét PDF Y khoa → Dịch trực tiếp sang tiếng Việt (Streaming)</b></p>
+    </div>
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 📤 Tải file lên")
+            # type="filepath" hands the handlers a path string rather than a file object.
+            file_in = gr.File(label="PDF hoặc Hình ảnh", file_types=["image", ".pdf"], type="filepath")
+            input_img = gr.Image(label="Xem trước", type="pil", height=300)
+            # A Textbox, so the page number reaches the handlers as text;
+            # load_image() and ocr_and_translate_streaming() parse it with int().
+            page_input = gr.Textbox(
+                label="Số trang (chỉ dùng cho PDF, mặc định: 1)",
+                value="1",
+                placeholder="Nhập số trang..."
+            )
+            # Choices are the keys of MODEL_CONFIGS, so the selected value can index it directly.
+            mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Crab", label="Chế độ OCR")
+            gr.Markdown("### 🦀 Quét và Dịch")
+            process_btn = gr.Button("🚀 Quét OCR + Dịch tiếng Việt", variant="primary", size="lg")
+        with gr.Column(scale=2):
+            gr.Markdown("### 📄 Kết quả dịch tiếng Việt (Streaming)")
+            # Receives the HTML snippets yielded by ocr_and_translate_streaming().
+            translation_output = gr.HTML(label="", value="")
+    with gr.Accordion("ℹ️ Hướng dẫn sử dụng", open=False):
+        gr.Markdown("""
+        **Quy trình đơn giản:**
+        1. 📤 Tải lên file PDF hoặc hình ảnh y khoa
+        2. 🚀 Nhấn nút "Quét OCR + Dịch tiếng Việt"
+        **Chế độ OCR:**
+        - **Crab**: 1024 base + 640 tiles (Tốt nhất, cân bằng)
+        - **Base**: 1024×1024 (Nhanh hơn)
+        **Lưu ý:** Space này sử dụng GPU miễn phí của Hugging Face, có thể mất vài giây để khởi động.
+        """)
+    # A new file refreshes the preview and resets the page textbox (label and value).
+    file_in.change(load_image, [file_in, page_input], [input_img])
+    file_in.change(update_page_info, [file_in], [page_input])
+    # Editing the page number re-renders the preview for that page.
+    page_input.change(load_image, [file_in, page_input], [input_img])
+    # The handler is a generator, so each yielded snippet replaces the output HTML.
+    process_btn.click(
+        ocr_and_translate_streaming,
+        [file_in, mode, page_input],
+        [translation_output]
+    )
+if __name__ == "__main__":
+    print("🚀 Starting MedCrab Translation on Hugging Face Spaces...")
+    # Enable the request queue (max_size=20) before launching the app.
+    demo.queue(max_size=20).launch()

medcrab/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .medcrab import MedCrabTranslator
2	+
3	+ __all__ = ["MedCrabTranslator"]

medcrab/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (207 Bytes). View file

medcrab/__pycache__/medcrab.cpython-312.pyc ADDED Viewed

Binary file (6.88 kB). View file

medcrab/medcrab.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import re
+class MedCrabTranslator:
+    """
+    Translator class cho MedCrab-1.5B, dịch văn bản y khoa từ tiếng Anh sang tiếng Việt.
+    """
+    def __init__(self, model_name: str = "pnnbao-ump/MedCrab-1.5b", device: str = "cuda"):
+        self.model_name = model_name
+        self.device = device
+        # Load tokenizer và model
+        print(f"Loading model {model_name} on {device} ...")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map="auto" if device == "cuda" else None,
+            torch_dtype=torch.bfloat16
+        )
+        self.model.eval()
+    def _build_prompt(self, text: str) -> str:
+        """Tạo prompt chuẩn cho MedCrab"""
+        return (
+            f"<|user|>: Translate the following medical text from English to Vietnamese:"
+            f"<|ENGLISH_TEXT_START|>{text}<|ENGLISH_TEXT_END|>\n"
+            "<|assistant|>:<|VIETNAMESE_TEXT_START|>"
+        )
+    def translate(self, text: str, max_new_tokens: int = 2048) -> str:
+        """
+        Dịch văn bản tiếng Anh sang tiếng Việt.
+        Args:
+            text: Văn bản tiếng Anh
+            max_new_tokens: số token tối đa sinh ra
+        Returns:
+            Văn bản tiếng Việt
+        """
+        prompt = self._build_prompt(text)
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+        # Lấy EOS token, fallback nếu không có
+        eos_id = self.tokenizer.get_vocab().get("<|VIETNAMESE_TEXT_END|>", self.tokenizer.eos_token_id)
+        # Generate với autocast
+        with torch.no_grad():
+            output_tokens = self.model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                eos_token_id=eos_id,
+                do_sample=False,
+                use_cache=True,
+            )
+        # Bỏ phần prompt
+        input_len = inputs["input_ids"].shape[-1]
+        decoded = self.tokenizer.decode(output_tokens[0, input_len:], skip_special_tokens=True)
+        # Tách token end nếu có
+        if "<|VIETNAMESE_TEXT_END|>" in decoded:
+            decoded = decoded.split("<|VIETNAMESE_TEXT_END|>")[0]
+        # Optional: post-processing spacing thuật ngữ y khoa
+        decoded = self._post_process(decoded)
+        return decoded.strip()
+    def _post_process(self, text: str) -> str:
+        """
+        Sửa spacing cho các thuật ngữ y khoa bị dính.
+        Ví dụ: 'phối tửthụ thể' -> 'phối tử – thụ thể'
+        """
+        text = text.replace("phối tửthụ thể", "phối tử – thụ thể")
+        text = text.replace("miễn dịchchuyển hóa", "miễn dịch – chuyển hóa")
+        text = text.replace("oxy hóakhử", "oxy hóa - khử")
+        # có thể thêm các quy tắc khác
+        return text
+    def _split_into_chunks(self, text: str, max_words: int = 150) -> list[tuple[str, str]]:
+        """
+        Chia văn bản dài thành các đoạn nhỏ, GHI NHỚ dấu phân cách gốc.
+        Return: list[(chunk_text, separator)]
+        """
+        # Tách câu và GIỮ LẠI dấu phân cách
+        pattern = r'([.!?])\n+'
+        parts = re.split(pattern, text)
+        sentences = []
+        i = 0
+        while i < len(parts):
+            if i + 1 < len(parts) and parts[i+1] and parts[i+1] in '.!?':
+                # Ghép câu với dấu câu của nó
+                sentences.append((parts[i] + parts[i+1], parts[i+1]))
+                i += 2
+            elif parts[i].strip():
+                # Đoạn text không có dấu câu đặc biệt hoặc là newline
+                sep = '\n' if '\n' in parts[i] else ' '
+                sentences.append((parts[i].strip(), sep))
+                i += 1
+            else:
+                i += 1
+        chunks = []
+        current_chunk = []
+        word_count = 0
+        last_separator = ' '
+        for sentence, separator in sentences:
+            words_in_sentence = sentence.split()
+            if word_count + len(words_in_sentence) > max_words:
+                if current_chunk:
+                    chunks.append((" ".join(current_chunk), last_separator))
+                current_chunk = words_in_sentence
+                word_count = len(words_in_sentence)
+            else:
+                current_chunk.extend(words_in_sentence)
+                word_count += len(words_in_sentence)
+            last_separator = separator
+        if current_chunk:
+            chunks.append((" ".join(current_chunk), last_separator))
+        return chunks
+    def translate_long_text(self, text: str, max_new_tokens: int = 2048) -> str:
+        """
+        Dịch văn bản dài, GHỮ NGUYÊN dấu phân cách gốc khi ghép.
+        """
+        chunks = self._split_into_chunks(text, max_words=150)
+        translated_parts = []
+        for i, (chunk, separator) in enumerate(chunks):
+            translated = self.translate(chunk, max_new_tokens=max_new_tokens)
+            # Ghép chunk với dấu phân cách phù hợp
+            if i < len(chunks) - 1:  # Không phải chunk cuối
+                if separator == '\n':
+                    translated_parts.append(translated + '\n')
+                elif separator in '.!?':
+                    translated_parts.append(translated + ' ')  # Dấu câu đã có sẵn trong translated
+                else:
+                    translated_parts.append(translated + ' ')
+            else:  # Chunk cuối
+                translated_parts.append(translated)
+        return ''.join(translated_parts)

readme_md.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+title: MedCrab Translation
+emoji: 🦀
+colorFrom: purple
+colorTo: blue
+sdk: gradio
+sdk_version: 5.0.0
+app_file: app.py
+pinned: false
+license: cc-by-nc-4.0
+python_version: "3.10"
+models:
+  - deepseek-ai/DeepSeek-OCR
+tags:
+  - medical
+  - translation
+  - ocr
+  - vietnamese
+---
+# 🦀 MedCrab Translation
+Ứng dụng quét OCR tài liệu y khoa và dịch trực tiếp sang tiếng Việt với hiệu ứng streaming.
+## Tính năng
+- 📄 Hỗ trợ PDF và hình ảnh
+- 🔍 OCR chính xác với DeepSeek-OCR
+- 🦀 Dịch y khoa chuyên sâu với MedCrab
+- ⚡ Streaming real-time
+- 🎨 Giao diện thân thiện
+## Sử dụng
+1. Tải lên file PDF hoặc hình ảnh y khoa
+2. Chọn số trang (nếu là PDF)
+3. Chọn chế độ OCR
+4. Nhấn "Quét OCR + Dịch tiếng Việt"
+## Chế độ OCR
+- **Crab**: Chế độ cân bằng (khuyên dùng)
+- **Base**: Chế độ nhanh
+## Yêu cầu GPU
+Space này cần GPU để chạy. Hugging Face cung cấp GPU miễn phí với giới hạn thời gian sử dụng.
+## License
+Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
+This work is licensed under CC BY-NC 4.0. You are free to:
+- Share: copy and redistribute the material
+- Adapt: remix, transform, and build upon the material
+Under the following terms:
+- Attribution: You must give appropriate credit
+- NonCommercial: You may not use the material for commercial purposes
+See: https://creativecommons.org/licenses/by-nc/4.0/

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+torch==2.6.0
+transformers==4.46.3
+tokenizers==0.20.3
+accelerate
+einops
+addict
+easydict
+torchvision
+flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+PyMuPDF
+hf_transfer