Spaces:

muhammadsalmanalfaridzi
/

DuckLink

Sleeping

App Files Files Community

DuckLink / app.py

muhammadsalmanalfaridzi

Update app.py

79a92e5 verified 9 months ago

raw

history blame contribute delete

7.31 kB

	import logging
	import json
	import yaml
	import gradio as gr
	import gradio.themes as themes
	from pathlib import Path
	from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
	from docling.datamodel.base_models import InputFormat
	from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
	from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions
	from docling.utils.export import generate_multimodal_pages
	from docling.utils.utils import create_hash
	import pandas as pd
	import time
	import datetime

	# Logging: INFO-level root configuration plus a module-scoped logger.
	logging.basicConfig(level=logging.INFO)
	_log = logging.getLogger(__name__)

	# OCR engine settings: run EasyOCR over the full page with the
	# language codes "id" and "en".
	ocr_options = EasyOcrOptions(force_full_page_ocr=True)
	ocr_options.lang = ["id", "en"]

	# PDF pipeline settings: OCR enabled, table structure recovery enabled.
	pipeline_options = PdfPipelineOptions(do_table_structure=True)
	pipeline_options.do_ocr = True
	pipeline_options.ocr_options = ocr_options

	# Table structure model: match cells and use the more accurate mode.
	_table_options = pipeline_options.table_structure_options
	_table_options.do_cell_matching = True
	_table_options.mode = TableFormerMode.ACCURATE

	# Function to handle document conversion and exports
	def _export_tables(conv_res, output_dir):
	    """Write each table of the converted document as CSV and HTML; return the paths."""
	    stem = conv_res.input.file.stem
	    written = []
	    for table_no, table in enumerate(conv_res.document.tables, start=1):
	        table_df = table.export_to_dataframe()
	        csv_path = output_dir / f"{stem}-table-{table_no}.csv"
	        html_path = output_dir / f"{stem}-table-{table_no}.html"

	        _log.info("Saving CSV table to %s", csv_path)
	        table_df.to_csv(csv_path)

	        _log.info("Saving HTML table to %s", html_path)
	        # Explicit UTF-8 so non-ASCII cell text does not depend on the
	        # platform's default encoding (matches the other text exports).
	        with html_path.open("w", encoding="utf-8") as fp:
	            fp.write(table.export_to_html())

	        written.extend([csv_path, html_path])
	    return written


	def _export_pictures(conv_res, output_dir):
	    """Save each picture that carries image data as PNG; return the paths."""
	    stem = conv_res.input.file.stem
	    written = []
	    for picture_no, picture in enumerate(conv_res.document.pictures, start=1):
	        if not picture.image:
	            _log.warning("Skipping picture %s due to missing image.", picture_no)
	            continue

	        png_path = output_dir / f"{stem}-picture-{picture_no}.png"
	        _log.info("Saving Picture to %s", png_path)
	        # NOTE(review): relies on `picture.image` exposing a `save()` method;
	        # confirm against the installed docling version.
	        picture.image.save(png_path)
	        written.append(png_path)
	    return written


	def _export_multimodal_pages(conv_res, output_dir):
	    """Collect one row per page and write them to a timestamped Parquet file."""
	    rows = []
	    for (
	        content_text,
	        content_md,
	        content_dt,
	        page_cells,
	        page_segments,
	        page,
	    ) in generate_multimodal_pages(conv_res):
	        # Best effort: a page that fails is logged and skipped so the
	        # remaining pages are still exported.
	        try:
	            # Presumably the image scale is relative to 72 dpi - TODO confirm.
	            dpi = page._default_image_scale * 72

	            # Pages without a rendered image get zero dimensions and no bytes.
	            image_width = image_height = 0
	            image_bytes = None
	            if page.image:
	                image_width = page.image.width
	                image_height = page.image.height
	                image_bytes = page.image.tobytes()

	            rows.append({
	                "document": conv_res.input.file.name,
	                "hash": conv_res.input.document_hash,
	                # NOTE(review): the hash uses page_no - 1 while "page_num"
	                # below uses page_no + 1; kept as-is, confirm the intent.
	                "page_hash": create_hash(
	                    conv_res.input.document_hash + ":" + str(page.page_no - 1)
	                ),
	                "image": {
	                    "width": image_width,
	                    "height": image_height,
	                    "bytes": image_bytes,
	                },
	                "cells": page_cells,
	                "contents": content_text,
	                "contents_md": content_md,
	                "contents_dt": content_dt,
	                "segments": page_segments,
	                "extra": {
	                    "page_num": page.page_no + 1,
	                    "width_in_points": page.size.width,
	                    "height_in_points": page.size.height,
	                    "dpi": dpi,
	                },
	            })
	        except Exception as e:
	            _log.warning("Failed to process page %s: %s", page.page_no + 1, e)

	    # Nested dicts are flattened into dotted column names by json_normalize.
	    df = pd.json_normalize(rows)
	    now = datetime.datetime.now()
	    parquet_path = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
	    df.to_parquet(parquet_path)
	    return parquet_path


	def export_tables_and_figures(conv_res, output_dir):
	    """Export tables, pictures, and multimodal pages from a converted document.

	    Args:
	        conv_res: Conversion result; its ``input.file``, ``input.document_hash``
	            and ``document`` attributes are read.
	        output_dir: ``pathlib.Path`` of an existing directory to write into.

	    Returns:
	        A list of absolute path strings, in this order: the CSV and HTML file
	        of each table, the PNG of each picture that has image data, and
	        finally the Parquet file holding the per-page rows.
	    """
	    start_time = time.time()

	    output_files = []
	    output_files.extend(_export_tables(conv_res, output_dir))
	    output_files.extend(_export_pictures(conv_res, output_dir))
	    output_files.append(_export_multimodal_pages(conv_res, output_dir))

	    elapsed = time.time() - start_time
	    _log.info(
	        "Tables, figures, and multimodal pages exported in %.2f seconds.",
	        elapsed,
	    )

	    return [str(file.resolve()) for file in output_files]

	# Main conversion function
	def convert_document(input_file):
	    """Convert one document and write all of its exports to disk.

	    Args:
	        input_file: The document to convert, given as a path string, a
	            ``pathlib.Path``, or an object whose ``name`` attribute holds
	            the path (for example an uploaded temporary file).

	    Returns:
	        A list of absolute path strings: for each conversion result the
	        Markdown, JSON and YAML exports, followed by whatever
	        ``export_tables_and_figures`` produced.

	    Raises:
	        ValueError: If ``input_file`` is ``None``.
	    """
	    if input_file is None:
	        raise ValueError("No input file was provided.")

	    # Plain paths are used directly; `Path.name` would only give the final
	    # component, so Path objects must not go through the `.name` branch.
	    if isinstance(input_file, (str, Path)):
	        input_path = Path(input_file)
	    else:
	        input_path = Path(input_file.name)

	    # Working directory for exports, relative to the current directory.
	    # It is not removed afterwards because the returned paths point into it.
	    output_dir = Path("scratch")
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Only PDFs get the custom OCR/table pipeline; the other allowed
	    # formats are converted with the converter's defaults.
	    doc_converter = DocumentConverter(
	        allowed_formats=[
	            InputFormat.PDF,
	            InputFormat.IMAGE,
	            InputFormat.DOCX,
	            InputFormat.HTML,
	        ],
	        format_options={
	            InputFormat.PDF: PdfFormatOption(
	                pipeline_options=pipeline_options,
	                backend=PyPdfiumDocumentBackend,
	            )
	        },
	    )

	    output_files = []
	    for res in doc_converter.convert_all([input_path]):
	        stem = res.input.file.stem
	        out_path = output_dir / stem
	        out_path.mkdir(parents=True, exist_ok=True)

	        # Build the dict once; both the JSON and YAML exports reuse it.
	        doc_dict = res.document.export_to_dict()
	        text_exports = (
	            (".md", res.document.export_to_markdown()),
	            (".json", json.dumps(doc_dict, ensure_ascii=False)),
	            (".yaml", yaml.safe_dump(doc_dict, allow_unicode=True)),
	        )
	        for suffix, text in text_exports:
	            target = out_path / f"{stem}{suffix}"
	            # UTF-8 keeps the unescaped non-ASCII characters intact.
	            target.write_text(text, encoding="utf-8")
	            output_files.append(str(target.resolve()))

	        # Tables, pictures, and the multimodal Parquet file.
	        output_files.extend(export_tables_and_figures(res, out_path))

	    return output_files

	# Gradio callback: convert the uploaded file and return the exported file paths
	def gradio_interface(input_file):
	    """Run the conversion for the uploaded file and return the output file list."""
	    return convert_document(input_file)

	# Theme for the interface: teal accents on a slate neutral palette.
	_theme = themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate")

	# Gradio interface: one uploaded file in, a list of exported files out.
	iface = gr.Interface(
	    fn=gradio_interface,
	    inputs=gr.File(file_count="single", type="filepath"),
	    outputs=gr.File(file_count="multiple"),
	    title="Document Conversion with OCR",
	    description="Upload your document or image, and get the converted output with OCR and other exports.",
	    theme=_theme,
	    allow_flagging="never",
	)

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()