Spaces:

GIZ
/

audit_assistant

Running on T4

ppsingh committed on Aug 12, 2024

Commit

458b338

verified ·

1 Parent(s): db69dc4

Update auditqa/process_chunks.py

Files changed (1) hide show

auditqa/process_chunks.py CHANGED Viewed

@@ -9,10 +9,27 @@ from langchain_community.vectorstores import Qdrant
 from qdrant_client import QdrantClient
 from auditqa.reports import files, report_list
 from langchain.docstore.document import Document
 device = 'cuda' if cuda.is_available() else 'cpu'
-path_to_data = "./reports/"
 def open_file(filepath):
     with open(filepath) as file:
         simple_json = json.load(file)
@@ -26,6 +43,7 @@ def load_chunks():
     #  we iterate through the files which contain information about its
     # 'source'=='category', 'subtype', these are used in UI for document selection
     #  which will be used later for filtering database
     all_documents = {}
     categories = list(files.keys())
     # iterate through 'source'
@@ -70,8 +88,8 @@ def load_chunks():
     # define embedding model
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
-        encode_kwargs = {'normalize_embeddings': True},
-        model_name="BAAI/bge-large-en-v1.5"
     )
     # placeholder for collection
     qdrant_collections = {}

 from qdrant_client import QdrantClient
 from auditqa.reports import files, report_list
 from langchain.docstore.document import Document
+import configparser
+# read all the necessary variables
 device = 'cuda' if cuda.is_available() else 'cpu'
+path_to_data = "./reports/"
+##---------------------functions -------------------------------------------##
def getconfig(configfile_path:str):
    """
    Load an INI-style configuration file into a ConfigParser.

    configfile_path: file path of .cfg file

    Returns the populated ``configparser.ConfigParser`` on success. When the
    file cannot be opened or its content cannot be parsed, a warning is
    logged and ``None`` is returned, so callers must check the result
    before calling ``.get`` on it.
    """
    config = configparser.ConfigParser()
    try:
        # Context manager guarantees the handle is closed even if parsing fails.
        with open(configfile_path) as config_file:
            config.read_file(config_file)
    except OSError:
        # Missing or unreadable file: keep the established warning text.
        logging.warning("config file not found")
        return None
    except (configparser.Error, ValueError) as err:
        # File exists but is malformed (bad syntax, undecodable bytes, ...).
        logging.warning("config file could not be parsed: %s", err)
        return None
    return config
 def open_file(filepath):
     with open(filepath) as file:
         simple_json = json.load(file)
     #  we iterate through the files which contain information about its
     # 'source'=='category', 'subtype', these are used in UI for document selection
     #  which will be used later for filtering database
+    config = getconfig("./model_params.cfg")
     all_documents = {}
     categories = list(files.keys())
     # iterate through 'source'
     # define embedding model
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
+        encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
+        model_name=config.get('retriever','MODEL')
     )
     # placeholder for collection
     qdrant_collections = {}