Spaces:
Sleeping
Sleeping
| import logging | |
| import json | |
| import yaml | |
| import gradio as gr | |
| import gradio.themes as themes | |
| from pathlib import Path | |
| from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption | |
| from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions | |
| from docling.utils.export import generate_multimodal_pages | |
| from docling.utils.utils import create_hash | |
| import pandas as pd | |
| import time | |
| import datetime | |
# Module-wide logging: INFO level so the export progress messages are emitted.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# EasyOCR settings: OCR the full page, using the "id" and "en" language codes.
ocr_options = EasyOcrOptions(force_full_page_ocr=True)
ocr_options.lang = ["id", "en"]

# PDF pipeline: OCR enabled, table structure recovery enabled.
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.do_ocr = True
pipeline_options.ocr_options = ocr_options

# Table structure: the ACCURATE TableFormer mode with cell matching switched on.
_table_structure = pipeline_options.table_structure_options
_table_structure.mode = TableFormerMode.ACCURATE
_table_structure.do_cell_matching = True
def _export_tables(conv_res, output_dir):
    """Write each table of the converted document as CSV and HTML; return the paths written."""
    stem = conv_res.input.file.stem
    written = []
    for table_no, table in enumerate(conv_res.document.tables, start=1):
        table_df = table.export_to_dataframe()
        table_csv_filename = output_dir / f"{stem}-table-{table_no}.csv"
        table_html_filename = output_dir / f"{stem}-table-{table_no}.html"

        _log.info(f"Saving CSV table to {table_csv_filename}")
        table_df.to_csv(table_csv_filename)

        _log.info(f"Saving HTML table to {table_html_filename}")
        # Explicit UTF-8: without it the platform default encoding is used, which
        # can fail (or produce mojibake) on non-ASCII table text.
        with table_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html())

        written.extend((table_csv_filename, table_html_filename))
    return written


def _export_pictures(conv_res, output_dir):
    """Save each picture that carries an image as PNG; return the paths written."""
    stem = conv_res.input.file.stem
    written = []
    for picture_no, picture in enumerate(conv_res.document.pictures, start=1):
        if not picture.image:
            _log.warning(f"Skipping picture {picture_no} due to missing image.")
            continue
        picture_image_filename = output_dir / f"{stem}-picture-{picture_no}.png"
        _log.info(f"Saving Picture to {picture_image_filename}")
        picture.image.save(picture_image_filename)
        written.append(picture_image_filename)
    return written


def _collect_multimodal_rows(conv_res):
    """Build one record per page yielded by generate_multimodal_pages."""
    rows = []
    for content_text, content_md, content_dt, page_cells, page_segments, page in generate_multimodal_pages(conv_res):
        try:
            # presumably 72 points per inch times the render scale — TODO confirm
            dpi = page._default_image_scale * 72

            # A page may come without a rendered image; fall back to an empty entry.
            image_width = image_height = 0
            image_bytes = None
            if page.image:
                image_width = page.image.width
                image_height = page.image.height
                image_bytes = page.image.tobytes()

            rows.append({
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                # NOTE(review): page_hash uses page_no - 1 while page_num below uses
                # page_no + 1 — confirm the intended page-number base.
                "page_hash": create_hash(conv_res.input.document_hash + ":" + str(page.page_no - 1)),
                "image": {
                    "width": image_width,
                    "height": image_height,
                    "bytes": image_bytes,
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            })
        except Exception as e:
            # Deliberate best effort: a page that cannot be processed is logged and skipped.
            _log.warning(f"Failed to process page {page.page_no + 1}: {e}")
    return rows


def export_tables_and_figures(conv_res, output_dir):
    """Export tables, pictures, and multimodal page data from a converted document.

    Tables are written as CSV and HTML, pictures as PNG, and the per-page
    multimodal records as a single timestamped Parquet file, all inside
    ``output_dir``.

    Args:
        conv_res: Conversion result providing ``input.file``,
            ``input.document_hash``, ``document.tables`` and
            ``document.pictures``.
        output_dir: Existing directory (a ``pathlib.Path``) that receives the files.

    Returns:
        Resolved paths, as strings, of every file written: tables first, then
        pictures, then the Parquet file.
    """
    start_time = time.time()

    output_files = _export_tables(conv_res, output_dir)
    output_files.extend(_export_pictures(conv_res, output_dir))

    # All page records of this document go into one Parquet file.
    df = pd.json_normalize(_collect_multimodal_rows(conv_res))
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)
    output_files.append(output_filename)

    end_time = time.time() - start_time
    _log.info(f"Tables, figures, and multimodal pages exported in {end_time:.2f} seconds.")
    return [str(file.resolve()) for file in output_files]
def convert_document(input_file):
    """Convert an uploaded document and write all export files under ``scratch/``.

    For each conversion result a sub-directory named after the input file's stem
    is created, holding Markdown, JSON and YAML exports (UTF-8) plus the files
    produced by ``export_tables_and_figures``.

    Args:
        input_file: Path of the document, given either as a ``str`` /
            ``pathlib.Path`` or as an object exposing the path as ``.name``.

    Returns:
        List of resolved path strings of every file written.

    Raises:
        ValueError: If ``input_file`` is ``None`` (nothing was uploaded).
    """
    if input_file is None:
        raise ValueError("No input file provided.")

    # Path-like values are used directly (taking `.name` of a Path would drop the
    # directory); anything else is expected to expose its path as `.name`.
    if isinstance(input_file, (str, Path)):
        input_path = Path(input_file)
    else:
        input_path = Path(input_file.name)

    # Output directory, relative to the current working directory
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)

    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE, InputFormat.DOCX, InputFormat.HTML],
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend)
        },
    )
    conv_results = doc_converter.convert_all([input_path])

    output_files = []
    for res in conv_results:
        stem = res.input.file.stem
        out_path = output_dir / stem
        out_path.mkdir(parents=True, exist_ok=True)

        md_path = out_path / f"{stem}.md"
        json_path = out_path / f"{stem}.json"
        yaml_path = out_path / f"{stem}.yaml"

        # Build the dict export once; it feeds both the JSON and the YAML file.
        doc_dict = res.document.export_to_dict()

        with md_path.open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())
        with json_path.open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc_dict, ensure_ascii=False))
        with yaml_path.open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(doc_dict, allow_unicode=True))

        output_files.extend(str(path.resolve()) for path in (md_path, json_path, yaml_path))

        # Tables, pictures, and multimodal page data
        output_files.extend(export_tables_and_figures(res, out_path))
    return output_files
def gradio_interface(input_file):
    """Gradio callback: run the conversion on the uploaded file and return the output file paths."""
    return convert_document(input_file)
# UI pieces: one uploaded file in, a list of generated files out, teal/slate theme.
_upload_box = gr.File(file_count="single", type="filepath")
_download_box = gr.File(file_count="multiple")
_app_theme = themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate")

# Wire the conversion callback to the components; flagging is turned off.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=_upload_box,
    outputs=_download_box,
    title="Document Conversion with OCR",
    description="Upload your document or image, and get the converted output with OCR and other exports.",
    allow_flagging="never",
    theme=_app_theme,
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()