Spaces:
Running
on
T4
Running
on
T4
Update auditqa/doc_process.py
Browse files
- auditqa/doc_process.py +5 -3
auditqa/doc_process.py
CHANGED
|
@@ -45,17 +45,18 @@ def process_pdf():
|
|
| 45 |
categories = list(files.keys())
|
| 46 |
# iterate through 'source'
|
| 47 |
for category in categories:
|
| 48 |
-
print(category)
|
| 49 |
all_documents[category] = []
|
| 50 |
subtypes = list(files[category].keys())
|
| 51 |
# iterate through 'subtype' within the source
|
| 52 |
# example source/category == 'District', has subtypes which is district names
|
| 53 |
for subtype in subtypes:
|
| 54 |
-
print(subtype)
|
| 55 |
for file in files[category][subtype]:
|
| 56 |
|
| 57 |
# create the chunks
|
| 58 |
doc_processed = text_splitter.split_documents(docs[file])
|
|
|
|
| 59 |
|
| 60 |
# add metadata information
|
| 61 |
for doc in doc_processed:
|
|
@@ -69,6 +70,7 @@ def process_pdf():
|
|
| 69 |
# convert list of list to flat list
|
| 70 |
for key, docs_processed in all_documents.items():
|
| 71 |
docs_processed = [item for sublist in docs_processed for item in sublist]
|
|
|
|
| 72 |
all_documents[key] = docs_processed
|
| 73 |
all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
|
| 74 |
all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
|
|
@@ -90,7 +92,7 @@ def process_pdf():
|
|
| 90 |
location=":memory:",
|
| 91 |
collection_name=file,
|
| 92 |
)
|
| 93 |
-
|
| 94 |
print("vector embeddings done")
|
| 95 |
return qdrant_collections
|
| 96 |
|
|
|
|
| 45 |
categories = list(files.keys())
|
| 46 |
# iterate through 'source'
|
| 47 |
for category in categories:
|
| 48 |
+
print("documents splitting in source:",category)
|
| 49 |
all_documents[category] = []
|
| 50 |
subtypes = list(files[category].keys())
|
| 51 |
# iterate through 'subtype' within the source
|
| 52 |
# example source/category == 'District', has subtypes which is district names
|
| 53 |
for subtype in subtypes:
|
| 54 |
+
print("document splitting for subtype:",subtype)
|
| 55 |
for file in files[category][subtype]:
|
| 56 |
|
| 57 |
# create the chunks
|
| 58 |
doc_processed = text_splitter.split_documents(docs[file])
|
| 59 |
+
print("chunks in subtype:",subtype, "are:",len(doc_processed))
|
| 60 |
|
| 61 |
# add metadata information
|
| 62 |
for doc in doc_processed:
|
|
|
|
| 70 |
# convert list of list to flat list
|
| 71 |
for key, docs_processed in all_documents.items():
|
| 72 |
docs_processed = [item for sublist in docs_processed for item in sublist]
|
| 73 |
+
print("length of chunks in source:",source, "are:",len(docs_processed)
|
| 74 |
all_documents[key] = docs_processed
|
| 75 |
all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
|
| 76 |
all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
|
|
|
|
| 92 |
location=":memory:",
|
| 93 |
collection_name=file,
|
| 94 |
)
|
| 95 |
+
print(qdrant_collections)
|
| 96 |
print("vector embeddings done")
|
| 97 |
return qdrant_collections
|
| 98 |
|