Spaces:
Running
on
T4
Running
on
T4
Update auditqa/doc_process.py
Browse files - auditqa/doc_process.py +14 -0
auditqa/doc_process.py
CHANGED
|
@@ -15,4 +15,18 @@ def process_pdf():
|
|
| 15 |
docs[file] = PyMuPDFLoader(value).load()
|
| 16 |
except Exception as e:
|
| 17 |
print("Exception: ", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
|
|
|
| 15 |
docs[file] = PyMuPDFLoader(value).load()
|
| 16 |
except Exception as e:
|
| 17 |
print("Exception: ", e)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# text splitter based on the tokenizer of a model of your choosing
|
| 21 |
+
# to make texts fit exactly a transformer's context window size
|
| 22 |
+
# langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
|
| 23 |
+
chunk_size = 256
|
| 24 |
+
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
|
| 25 |
+
AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
|
| 26 |
+
chunk_size=chunk_size,
|
| 27 |
+
chunk_overlap=int(chunk_size / 10),
|
| 28 |
+
add_start_index=True,
|
| 29 |
+
strip_whitespace=True,
|
| 30 |
+
separators=["\n\n", "\n", ".", " ", ""],
|
| 31 |
+
)
|
| 32 |
|