Spaces:

GIZ
/

audit_assistant

Running on T4

App Files Files Community

ppsingh committed on Aug 10, 2024

Commit

f5e5ccb

verified ·

1 Parent(s): 1963b0a

Create process_chunks.py

Browse files

Files changed (1) hide show

auditqa/process_chunks.py +85 -0

auditqa/process_chunks.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import glob
+import os
+from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
+from transformers import AutoTokenizer
+from torch import cuda
+from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
+from langchain_community.vectorstores import Qdrant
+from qdrant_client import QdrantClient
+from auditqa.reports import files, report_list
+from langchain.docstore.document import Document
# Run the embedding model on the GPU when torch reports one, else on the CPU.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

# Base directory for report data; load_chunks reads
# "<path_to_data><name>/<name>.chunks.json" for every report name.
path_to_data = "./reports/"
def open_file(filepath):
    """Read a JSON file and return its parsed contents.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Local import: ``json`` is not among this module's top-level imports,
    # so without it the call below fails with NameError.
    import json

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filepath, encoding="utf-8") as file:
        return json.load(file)
def load_chunks():
    """
    Build the in-memory Qdrant vector store from the pre-chunked report files.

    Walks the ``files`` mapping (source/category -> subtype -> report names),
    reads each report's ``<name>/<name>.chunks.json`` under ``path_to_data``
    and wraps every chunk in a ``Document`` carrying ``source``, ``subtype``,
    ``year``, ``filename`` and ``page`` metadata. All chunks are then embedded
    with "BAAI/bge-large-en-v1.5" into a single collection.

    Note: ``report_list`` is imported by this module but is not used here.

    Returns:
        dict with the single key ``"allreports"``, mapped to the Qdrant store
        that holds the chunks of every source.
    """
    # source -> flat list of Document chunks. 'source' and 'subtype' are the
    # values the UI offers for document selection and later filters on.
    chunks_by_source = {}
    for source, reports_by_subtype in files.items():
        print("documents splitting in source:",source)
        source_chunks = chunks_by_source[source] = []
        # e.g. source 'District' has one subtype per district name
        for subtype, report_names in reports_by_subtype.items():
            print("document splitting for subtype:",subtype)
            for report in report_names:
                # each report lives in its own folder named after the report
                report_dir = path_to_data + report
                raw_chunks = open_file(report_dir + "/" + report + ".chunks.json")
                print("chunks in subtype:",subtype, "are:",len(raw_chunks))
                # attach the metadata used for filtering to every chunk
                tagged = [
                    Document(
                        page_content=chunk['content'],
                        metadata={
                            "source": source,
                            "subtype": subtype,
                            # last four characters of the report name
                            # (presumably the report year - confirm naming)
                            "year": report[-4:],
                            "filename": report,
                            "page": chunk['metadata']['page'],
                        },
                    )
                    for chunk in raw_chunks
                ]
                source_chunks.extend(tagged)

    # report the per-source totals once everything is loaded
    for source, source_chunks in chunks_by_source.items():
        print("length of chunks in source:",source, "are:",len(source_chunks))

    # one combined list holding the chunks of every source, in source order
    chunks_by_source['allreports'] = [
        chunk
        for source_chunks in chunks_by_source.values()
        for chunk in source_chunks
    ]

    # define embedding model
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
    )

    # only the combined collection is embedded and returned
    collection = "allreports"
    qdrant_collections = {}
    print("emebddings for:",collection)
    qdrant_collections[collection] = Qdrant.from_documents(
        chunks_by_source[collection],
        embeddings,
        location=":memory:",
        collection_name=collection,
    )
    print(qdrant_collections)
    print("vector embeddings done")
    return qdrant_collections