Spaces:
Running
Running
🎨 format
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- pdf2text.py +8 -30
pdf2text.py
CHANGED
|
@@ -14,32 +14,23 @@ logging.basicConfig(
|
|
| 14 |
)
|
| 15 |
|
| 16 |
|
| 17 |
-
import gc
|
| 18 |
import os
|
| 19 |
import pprint as pp
|
| 20 |
import re
|
| 21 |
import shutil
|
| 22 |
import time
|
| 23 |
-
from datetime import datetime
|
| 24 |
-
from os.path import basename,
|
| 25 |
from pathlib import Path
|
| 26 |
-
|
| 27 |
-
import pandas as pd
|
| 28 |
-
import wordninja
|
| 29 |
from cleantext import clean
|
| 30 |
-
from natsort import natsorted
|
| 31 |
-
from tqdm.auto import tqdm
|
| 32 |
-
import os
|
| 33 |
-
import shutil
|
| 34 |
-
from os.path import join
|
| 35 |
-
from spellchecker import SpellChecker
|
| 36 |
from doctr.io import DocumentFile
|
| 37 |
from doctr.models import ocr_predictor
|
| 38 |
from libretranslatepy import LibreTranslateAPI
|
| 39 |
-
from
|
| 40 |
-
import
|
| 41 |
-
from
|
| 42 |
-
|
| 43 |
|
| 44 |
def fast_scandir(dirname):
|
| 45 |
# return all subfolders in a given filepath
|
|
@@ -127,9 +118,6 @@ def corr(
|
|
| 127 |
return s
|
| 128 |
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
def fix_punct_spaces(string):
|
| 134 |
"""
|
| 135 |
fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
|
|
@@ -176,8 +164,6 @@ def clean_OCR(ugly_text: str):
|
|
| 176 |
return fix_punct_spaces(cleaned_text)
|
| 177 |
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
def move2completed(from_dir, filename, new_folder="completed", verbose=False):
|
| 182 |
|
| 183 |
# this is the better version
|
|
@@ -207,7 +193,6 @@ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
|
|
| 207 |
"""
|
| 208 |
|
| 209 |
|
| 210 |
-
|
| 211 |
custom_replace_list = {
|
| 212 |
"t0": "to",
|
| 213 |
"'$": "'s",
|
|
@@ -224,7 +209,6 @@ replace_corr_exceptions = {
|
|
| 224 |
}
|
| 225 |
|
| 226 |
|
| 227 |
-
|
| 228 |
spell = SpellChecker()
|
| 229 |
|
| 230 |
|
|
@@ -278,7 +262,7 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
|
|
| 278 |
return text
|
| 279 |
|
| 280 |
|
| 281 |
-
def cleantxt_ocr(ugly_text, lower=False, lang:str="en") -> str:
|
| 282 |
"""
|
| 283 |
cleantxt_ocr - clean text from OCR
|
| 284 |
|
|
@@ -362,9 +346,6 @@ def result2text(result, as_text=False) -> str or list:
|
|
| 362 |
return "\n".join(full_doc) if as_text else full_doc
|
| 363 |
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
def convert_PDF_to_Text(
|
| 369 |
PDF_file,
|
| 370 |
ocr_model=None,
|
|
@@ -409,7 +390,6 @@ def convert_PDF_to_Text(
|
|
| 409 |
return results_dict
|
| 410 |
|
| 411 |
|
| 412 |
-
|
| 413 |
# @title translation functions
|
| 414 |
|
| 415 |
lt = LibreTranslateAPI("https://translate.astian.org/")
|
|
@@ -447,5 +427,3 @@ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
|
|
| 447 |
if verbose:
|
| 448 |
print("finished translating the document! - ", datetime.now())
|
| 449 |
return out_path
|
| 450 |
-
|
| 451 |
-
|
|
|
|
| 14 |
)
|
| 15 |
|
| 16 |
|
|
|
|
| 17 |
import os
|
| 18 |
import pprint as pp
|
| 19 |
import re
|
| 20 |
import shutil
|
| 21 |
import time
|
| 22 |
+
from datetime import date, datetime
|
| 23 |
+
from os.path import basename, dirname, join
|
| 24 |
from pathlib import Path
|
| 25 |
+
|
|
|
|
|
|
|
| 26 |
from cleantext import clean
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
from doctr.io import DocumentFile
|
| 28 |
from doctr.models import ocr_predictor
|
| 29 |
from libretranslatepy import LibreTranslateAPI
|
| 30 |
+
from natsort import natsorted
|
| 31 |
+
from spellchecker import SpellChecker
|
| 32 |
+
from tqdm.auto import tqdm
|
| 33 |
+
|
| 34 |
|
| 35 |
def fast_scandir(dirname):
|
| 36 |
# return all subfolders in a given filepath
|
|
|
|
| 118 |
return s
|
| 119 |
|
| 120 |
|
|
|
|
|
|
|
|
|
|
| 121 |
def fix_punct_spaces(string):
|
| 122 |
"""
|
| 123 |
fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
|
|
|
|
| 164 |
return fix_punct_spaces(cleaned_text)
|
| 165 |
|
| 166 |
|
|
|
|
|
|
|
| 167 |
def move2completed(from_dir, filename, new_folder="completed", verbose=False):
|
| 168 |
|
| 169 |
# this is the better version
|
|
|
|
| 193 |
"""
|
| 194 |
|
| 195 |
|
|
|
|
| 196 |
custom_replace_list = {
|
| 197 |
"t0": "to",
|
| 198 |
"'$": "'s",
|
|
|
|
| 209 |
}
|
| 210 |
|
| 211 |
|
|
|
|
| 212 |
spell = SpellChecker()
|
| 213 |
|
| 214 |
|
|
|
|
| 262 |
return text
|
| 263 |
|
| 264 |
|
| 265 |
+
def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
|
| 266 |
"""
|
| 267 |
cleantxt_ocr - clean text from OCR
|
| 268 |
|
|
|
|
| 346 |
return "\n".join(full_doc) if as_text else full_doc
|
| 347 |
|
| 348 |
|
|
|
|
|
|
|
|
|
|
| 349 |
def convert_PDF_to_Text(
|
| 350 |
PDF_file,
|
| 351 |
ocr_model=None,
|
|
|
|
| 390 |
return results_dict
|
| 391 |
|
| 392 |
|
|
|
|
| 393 |
# @title translation functions
|
| 394 |
|
| 395 |
lt = LibreTranslateAPI("https://translate.astian.org/")
|
|
|
|
| 427 |
if verbose:
|
| 428 |
print("finished translating the document! - ", datetime.now())
|
| 429 |
return out_path
|
|
|
|
|
|