Spaces: Running on T4
Update auditqa/process_chunks.py
Browse files
auditqa/process_chunks.py
CHANGED
|
@@ -50,7 +50,7 @@ def load_chunks():
|
|
| 50 |
# which will be used later for filtering database
|
| 51 |
config = getconfig("./model_params.cfg")
|
| 52 |
|
| 53 |
-
doc_processed = open_file(path_to_data + "
|
| 54 |
chunks_list = []
|
| 55 |
|
| 56 |
for doc in doc_processed:
|
|
@@ -62,7 +62,7 @@ def load_chunks():
|
|
| 62 |
embeddings = HuggingFaceEmbeddings(
|
| 63 |
model_kwargs = {'device': device},
|
| 64 |
show_progress= True,
|
| 65 |
-
encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
|
| 66 |
model_name=config.get('retriever','MODEL')
|
| 67 |
)
|
| 68 |
# placeholder for collection
|
|
@@ -79,11 +79,11 @@ def load_chunks():
|
|
| 79 |
# collection_name='reportsFeb2025',
|
| 80 |
# )
|
| 81 |
|
| 82 |
-
qdrant_collections['
|
| 83 |
chunks_list,
|
| 84 |
embeddings,
|
| 85 |
path="/data/local_qdrant",
|
| 86 |
-
collection_name='
|
| 87 |
)
|
| 88 |
print(qdrant_collections)
|
| 89 |
print("vector embeddings done")
|
|
|
|
| 50 |
# which will be used later for filtering database
|
| 51 |
config = getconfig("./model_params.cfg")
|
| 52 |
|
| 53 |
+
doc_processed = open_file(path_to_data + "docling_chunks.json" )
|
| 54 |
chunks_list = []
|
| 55 |
|
| 56 |
for doc in doc_processed:
|
|
|
|
| 62 |
embeddings = HuggingFaceEmbeddings(
|
| 63 |
model_kwargs = {'device': device},
|
| 64 |
show_progress= True,
|
| 65 |
+
encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE'))),},
|
| 66 |
model_name=config.get('retriever','MODEL')
|
| 67 |
)
|
| 68 |
# placeholder for collection
|
|
|
|
| 79 |
# collection_name='reportsFeb2025',
|
| 80 |
# )
|
| 81 |
|
| 82 |
+
qdrant_collections['docling'] = Qdrant.from_documents(
|
| 83 |
chunks_list,
|
| 84 |
embeddings,
|
| 85 |
path="/data/local_qdrant",
|
| 86 |
+
collection_name='docling',
|
| 87 |
)
|
| 88 |
print(qdrant_collections)
|
| 89 |
print("vector embeddings done")
|