DamLoan committed on
Commit
298678f
·
verified ·
1 Parent(s): f61200c

Upload preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +125 -0
preprocess.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# %% [markdown]
# ### Step 1: Reading PDF Files

# %%
# Notebook-only dependency install. It is kept as a comment so this file
# still parses as plain Python; run it once in the environment instead:
# %pip install pdf2image

# %%
# `os` is used by this cell, and the file's import cell only appears further
# down, so import it here to keep the cell runnable on its own.
import os

# Setup directories
pdf_directory = r"F:\Preprocessing"   # folder scanned for input PDF files
output_directory = r"F:\Images"       # folder that receives the page images
os.makedirs(output_directory, exist_ok=True)

# Poppler path (folder expected to contain the Poppler binaries)
poppler_path = r"F:\poppler-24.08.0\Library\bin"
15
+
16
+ # %% [markdown]
17
+ # ### Step 2: Convert PDF files to Images
18
+
19
+ # %%
20
+ import os
21
+ import cv2
22
+ import numpy as np
23
+ from pdf2image import convert_from_path
24
+ import glob
25
+
# Check that Poppler is available
def check_poppler():
    """Report whether the Poppler ``pdftoppm.exe`` binary can be found.

    Looks for ``pdftoppm.exe`` directly inside the module-level
    ``poppler_path`` folder.

    Returns:
        bool: ``True`` if that path exists, ``False`` otherwise.
    """
    pdftoppm_binary = os.path.join(poppler_path, "pdftoppm.exe")
    return os.path.exists(pdftoppm_binary)
29
+
# Convert a PDF into images
def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF to a JPEG file in ``output_dir``.

    Pages are written as ``<pdf name without extension>_page_<n>.jpg`` with
    ``n`` starting at 1. Rendering is done by pdf2image's
    ``convert_from_path`` using the module-level ``poppler_path``.

    Args:
        pdf_path: Path to the source PDF file.
        output_dir: Directory that receives the JPEG files; it is created
            if it does not exist yet.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        int: Number of pages converted, or 0 if any step raised. The error
        is printed rather than propagated.
    """
    try:
        # The base name is identical for every page, so compute it once.
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

        # Without this, a missing output directory fails on the first save
        # and the whole PDF is reported as 0 pages.
        os.makedirs(output_dir, exist_ok=True)

        pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
        for page_number, page in enumerate(pages, start=1):
            image_name = f"{pdf_stem}_page_{page_number}.jpg"
            image_path = os.path.join(output_dir, image_name)
            page.save(image_path, "JPEG", quality=95)
        return len(pages)  # number of images created
    except Exception as e:
        print(f"✗ Error processing {pdf_path}: {e}")
        return 0
42
+
# Process every PDF file
def process_all_pdfs():
    """Convert all ``*.pdf`` files found in ``pdf_directory`` and print totals.

    Each file is handed to ``pdf_to_images`` with ``output_directory`` as the
    destination. If no PDF is found, a message is printed and the function
    returns early. Nothing is returned in either case.
    """
    pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return

    # Each call yields the number of images written for that PDF.
    total_images = sum(
        pdf_to_images(pdf_file, output_directory) for pdf_file in pdf_files
    )

    print(f"\n✓ Tổng số file PDF: {len(pdf_files)}")
    print(f"✓ Tổng số ảnh đã chuyển đổi: {total_images}")
58
+
# MAIN EXECUTION
# Prints the configuration, validates the input directory and the Poppler
# installation, then converts every PDF. Exits with status 1 when a
# precondition is not met.
if __name__ == "__main__":
    print("PDF TO IMAGES CONVERTER")
    print(f"Input directory: {pdf_directory}")
    print(f"Output directory: {output_directory}")
    print(f"Poppler path: {poppler_path}")
    print()

    if not os.path.exists(pdf_directory):
        print(f"✗ Input directory does not exist: {pdf_directory}")
        # Raise SystemExit directly: the bare exit() helper is injected by
        # the `site` module for interactive use and is not meant for programs.
        raise SystemExit(1)

    if not check_poppler():
        print("\n❌ Please check Poppler installation:")
        print(f"1. Make sure pdftoppm.exe exists in: {poppler_path}")
        raise SystemExit(1)

    process_all_pdfs()
    print("\n✓ Processing completed!")
78
+
79
+ # %% [markdown]
80
+ # ### Step 3: Image Preprocessing
81
+
82
+ # %%
83
+ import os
84
+ import cv2
85
+ import numpy as np
86
+ from PIL import Image
87
+
def preprocess_image(image_path):
    """Load an image and return a contrast-enhanced, binarised version.

    Pipeline: convert to grayscale, apply CLAHE (clip limit 2.0, 8x8 tiles),
    binarise with Otsu's threshold, then dilate with a 1x1 kernel.

    Args:
        image_path: Path of the image file, opened with PIL.

    Returns:
        numpy.ndarray: Single-channel image produced by the threshold and
        dilation steps.
    """
    # Use a context manager so the underlying file handle is released as
    # soon as the pixel data has been copied into the array.
    with Image.open(image_path) as pil_img:
        # Normalise to 3-channel RGB first: grayscale or palette images
        # would otherwise give a 2-D array that COLOR_RGB2GRAY rejects.
        img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)
    # The threshold argument 0 is ignored: THRESH_OTSU picks it automatically.
    _, binary = cv2.threshold(
        contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    # NOTE(review): a 1x1 kernel makes this dilation a no-op; kept as-is to
    # preserve the output. Confirm whether a larger kernel was intended.
    kernel = np.ones((1, 1), np.uint8)
    bold_img = cv2.dilate(binary, kernel, iterations=1)

    return bold_img
100
+
# Input and output folders
input_folder = r"F:\Images"
output_folder = r"F:\Images_Processed"
os.makedirs(output_folder, exist_ok=True)

# Walk through every image in the input folder
for filename in os.listdir(input_folder):
    # Skip anything that is not one of the supported image types.
    if not filename.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    try:
        processed_img = preprocess_image(input_path)

        # Convert back to a PIL image so it can be saved to a Unicode path
        pil_result = Image.fromarray(processed_img)
        pil_result.save(output_path)

    except Exception as e:
        # Report the failure and carry on with the next file.
        print(f"❌ Lỗi xử lý {filename}: {e}")
121
+
122
+
123
+
124
+
125
+