"""Helper script to go from PDF to PNG ZIP Files we can use in HTML on the LB.""" from __future__ import annotations import zipfile from pathlib import Path from multiprocessing import Pool, cpu_count from pdf2image import convert_from_path ROOT_DIR = Path("./data") DPI = 800 # you can lower this if files are huge / too slow def process_pdf(pdf_path_str: str) -> None: pdf_path = Path(pdf_path_str).resolve() zip_path = pdf_path.with_suffix(".png.zip") print(f"Converting {pdf_path}...") # Convert all pages of the PDF images = convert_from_path(str(pdf_path), dpi=DPI) # Save pages as PNGs (multi-page safe naming) png_path = pdf_path.with_suffix(".png") images[0].save(png_path, "PNG") png_paths = [png_path] # Zip all PNGs into one archive with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf: for png_path in png_paths: zipf.write(png_path, arcname=png_path.name) # Clean up PNGs and original PDF for png_path in png_paths: png_path.unlink(missing_ok=True) pdf_path.unlink(missing_ok=True) def main() -> None: pdf_paths = [str(p) for p in ROOT_DIR.rglob("*.pdf")] if not pdf_paths: print("No PDFs found.") return # Use one process per CPU, but not more than number of PDFs n_procs = min(cpu_count(), len(pdf_paths)) print(f"Found {len(pdf_paths)} PDFs. Using {n_procs} processes.") with Pool(processes=n_procs) as pool: # imap_unordered gives you streaming results + simple progress printing for _ in pool.imap_unordered(process_pdf, pdf_paths): pass if __name__ == "__main__": main()