Spaces:
Running
on
T4
Running
on
T4
Update auditqa/doc_process.py
Browse files
- auditqa/doc_process.py +10 -2
auditqa/doc_process.py
CHANGED
|
@@ -37,7 +37,15 @@ def process_pdf():
|
|
| 37 |
all_documents = {}
|
| 38 |
categories = list(files.keys())
|
| 39 |
for category in categories:
|
| 40 |
-
all_documents[category] =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
|
|
|
|
| 37 |
all_documents = {}
|
| 38 |
categories = list(files.keys())
|
| 39 |
for category in categories:
|
| 40 |
+
all_documents[category] = []
|
| 41 |
+
subtypes = list(files[category].keys())
|
| 42 |
+
for subtype in subtypes:
|
| 43 |
+
for file in files[category][subtype]:
|
| 44 |
+
doc_processed = text_splitter.split_documents(docs[file])
|
| 45 |
+
for doc in doc_processed:
|
| 46 |
+
doc.metadata["source"] = category
|
| 47 |
+
doc.metadata["subtype"] = subtype
|
| 48 |
+
doc.metadata["year"] = file[-4:]
|
| 49 |
|
| 50 |
+
all_documents[category].append(doc_processed)
|
| 51 |
|