Spaces:
Running
Running
π general ease of use
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
app.py
CHANGED
|
@@ -72,7 +72,9 @@ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
|
|
| 72 |
if isinstance(pdf_obj, list):
|
| 73 |
pdf_obj = pdf_obj[0]
|
| 74 |
file_path = Path(pdf_obj.name)
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
conversion_stats = convert_PDF_to_Text(
|
| 77 |
file_path,
|
| 78 |
ocr_model=ocr_model,
|
|
@@ -90,7 +92,11 @@ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
|
|
| 90 |
html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
|
| 91 |
html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
|
| 92 |
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
if __name__ == "__main__":
|
|
@@ -125,7 +131,7 @@ if __name__ == "__main__":
|
|
| 125 |
with gr.Column():
|
| 126 |
|
| 127 |
gr.Markdown("## Load Inputs")
|
| 128 |
-
gr.Markdown("Upload your own file
|
| 129 |
gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
|
| 130 |
|
| 131 |
|
|
@@ -135,13 +141,12 @@ if __name__ == "__main__":
|
|
| 135 |
type="file",
|
| 136 |
value= _here / "example_file.pdf",
|
| 137 |
)
|
| 138 |
-
# load_file_button = gr.Button("Load Uploaded File")
|
| 139 |
|
| 140 |
gr.Markdown("---")
|
| 141 |
|
| 142 |
with gr.Column():
|
| 143 |
gr.Markdown("## Convert PDF to Text")
|
| 144 |
-
convert_button = gr.Button("Convert PDF!")
|
| 145 |
out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
|
| 146 |
gr.Markdown("### Output")
|
| 147 |
OCR_text = gr.Textbox(
|
|
@@ -153,11 +158,8 @@ if __name__ == "__main__":
|
|
| 153 |
type="file",
|
| 154 |
interactive=False,
|
| 155 |
)
|
| 156 |
-
# load_file_button.click(
|
| 157 |
-
# fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
|
| 158 |
-
# )
|
| 159 |
|
| 160 |
convert_button.click(
|
| 161 |
-
fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
|
| 162 |
)
|
| 163 |
demo.launch(enable_queue=True)
|
|
|
|
| 72 |
if isinstance(pdf_obj, list):
|
| 73 |
pdf_obj = pdf_obj[0]
|
| 74 |
file_path = Path(pdf_obj.name)
|
| 75 |
+
if not file_path.suffix == ".pdf":
|
| 76 |
+
logging.error(f"File {file_path} is not a PDF file")
|
| 77 |
+
return "File is not a PDF file", None, None
|
| 78 |
conversion_stats = convert_PDF_to_Text(
|
| 79 |
file_path,
|
| 80 |
ocr_model=ocr_model,
|
|
|
|
| 92 |
html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
|
| 93 |
html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
|
| 94 |
|
| 95 |
+
_output_name = f"RESULT_{file_path.stem}_OCR.txt"
|
| 96 |
+
with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
|
| 97 |
+
f.write(converted_txt)
|
| 98 |
+
|
| 99 |
+
return converted_txt, html, _output_name
|
| 100 |
|
| 101 |
|
| 102 |
if __name__ == "__main__":
|
|
|
|
| 131 |
with gr.Column():
|
| 132 |
|
| 133 |
gr.Markdown("## Load Inputs")
|
| 134 |
+
gr.Markdown("Upload your own file & replace the default")
|
| 135 |
gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
|
| 136 |
|
| 137 |
|
|
|
|
| 141 |
type="file",
|
| 142 |
value= _here / "example_file.pdf",
|
| 143 |
)
|
|
|
|
| 144 |
|
| 145 |
gr.Markdown("---")
|
| 146 |
|
| 147 |
with gr.Column():
|
| 148 |
gr.Markdown("## Convert PDF to Text")
|
| 149 |
+
convert_button = gr.Button("Convert PDF!", variant="primary")
|
| 150 |
out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
|
| 151 |
gr.Markdown("### Output")
|
| 152 |
OCR_text = gr.Textbox(
|
|
|
|
| 158 |
type="file",
|
| 159 |
interactive=False,
|
| 160 |
)
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
convert_button.click(
|
| 163 |
+
fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder, text_file]
|
| 164 |
)
|
| 165 |
demo.launch(enable_queue=True)
|