# %% [markdown]
# ### Step 1: Reading PDF Files

# %%
# Install the PDF rendering dependency once. Kept as a commented magic so the
# file remains valid Python when executed as a plain script.
# %pip install pdf2image

# %%
import os

# Setup directories
pdf_directory = r"F:\Preprocessing"
output_directory = r"F:\Images"
os.makedirs(output_directory, exist_ok=True)

# Poppler path
poppler_path = r"F:\poppler-24.08.0\Library\bin"

# %% [markdown]
# ### Step 2: Convert PDF files to Images

# %%
import os
import sys
import cv2
import numpy as np
from pdf2image import convert_from_path
import glob


def check_poppler():
    """Return True when ``pdftoppm.exe`` exists inside ``poppler_path``."""
    return os.path.exists(os.path.join(poppler_path, "pdftoppm.exe"))


def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF to a JPEG file in ``output_dir``.

    Files are named ``<pdf stem>_page_<n>.jpg`` with ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the generated JPEG files.
        dpi: Rendering resolution passed to ``convert_from_path``.

    Returns:
        The number of images written, or 0 when the conversion failed.
    """
    try:
        pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
        # The stem is the same for every page, so compute it once.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        for i, page in enumerate(pages):
            image_name = f"{stem}_page_{i+1}.jpg"
            image_path = os.path.join(output_dir, image_name)
            page.save(image_path, "JPEG", quality=95)
        return len(pages)  # number of images created
    except Exception as e:
        # Best effort: report the failing file and let the batch continue.
        print(f"✗ Error processing {pdf_path}: {e}")
        return 0


def process_all_pdfs():
    """Convert every ``*.pdf`` in ``pdf_directory`` and print a summary."""
    pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))
    total_images = 0

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return

    for pdf_file in pdf_files:
        num_pages = pdf_to_images(pdf_file, output_directory)
        total_images += num_pages

    print(f"\n✓ Tổng số file PDF: {len(pdf_files)}")
    print(f"✓ Tổng số ảnh đã chuyển đổi: {total_images}")


# MAIN EXECUTION
if __name__ == "__main__":
    print("PDF TO IMAGES CONVERTER")
    print(f"Input directory: {pdf_directory}")
    print(f"Output directory: {output_directory}")
    print(f"Poppler path: {poppler_path}")
    print()

    if not os.path.exists(pdf_directory):
        print(f"✗ Input directory does not exist: {pdf_directory}")
        # sys.exit instead of the site-provided exit() helper, which is not
        # guaranteed to exist in every interpreter configuration.
        sys.exit(1)

    if not check_poppler():
        print("\n❌ Please check Poppler installation:")
        print(f"1. Make sure pdftoppm.exe exists in: {poppler_path}")
        sys.exit(1)

    process_all_pdfs()
    print("\n✓ Processing completed!")

# %% [markdown]
# ### Step 3: Image Preprocessing

# %%
import os
import cv2
import numpy as np
from PIL import Image


def preprocess_image(image_path):
    """Load an image and return a binarised, contrast-enhanced version.

    Steps: convert to grayscale, apply CLAHE, apply an Otsu threshold, then
    run a dilation pass.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A 2-D ``uint8`` NumPy array holding the thresholded image.
    """
    # Open through PIL (the save step below does the same) and close the file
    # handle promptly. Normalising to RGB lets grayscale and palette images
    # pass through the RGB->gray conversion instead of raising.
    with Image.open(image_path) as pil_img:
        img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)

    _, binary = cv2.threshold(
        contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # NOTE: a 1x1 kernel makes this dilation leave the pixels unchanged; it is
    # kept so the pipeline output stays the same.
    kernel = np.ones((1, 1), np.uint8)
    bold_img = cv2.dilate(binary, kernel, iterations=1)

    return bold_img


# Input and output folders
input_folder = r"F:\Images"
output_folder = r"F:\Images_Processed"
os.makedirs(output_folder, exist_ok=True)

# Walk through every image in the input folder
for filename in os.listdir(input_folder):
    if not filename.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    try:
        processed_img = preprocess_image(input_path)

        # Convert back to a PIL image so saving works with Unicode paths
        pil_result = Image.fromarray(processed_img)
        pil_result.save(output_path)

    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"❌ Lỗi xử lý {filename}: {e}")