"""Helpers for loading a JSON knowledge base, ranking its entries by cosine
similarity against a query embedding, and assembling chat messages."""

import heapq
import json
import os
from typing import Any, Dict, List, Optional, Tuple

import numpy as np


def load_knowledge_base(file_path: Optional[str] = None) -> dict:
    """Load the knowledge base from a JSON file.

    Args:
        file_path: Path of the JSON file. When ``None``, defaults to
            ``../data/knowledge_base.json`` relative to this module's
            directory.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if file_path is None:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(current_dir, "..", "data", "knowledge_base.json")
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def get_keys_chunks(
    knowledge_base: Dict[Any, Dict[str, Any]],
) -> List[Tuple[Any, str]]:
    """Pair every knowledge-base key with its searchable text chunk.

    The chunk is the document's ``title`` and ``topics`` values formatted into
    one string separated by a space, with surrounding whitespace stripped.
    Missing fields are treated as empty strings.

    Args:
        knowledge_base: Mapping of document key to document fields.

    Returns:
        A list of ``(key, chunk)`` tuples in the mapping's iteration order.
    """
    return [
        (key, f"{doc.get('title', '')} {doc.get('topics', '')}".strip())
        for key, doc in knowledge_base.items()
    ]


def get_docs(doc_keys: List[Any], knowledge_base: Dict[Any, Dict[str, Any]]) -> str:
    """Format the documents for the given keys as a numbered text listing.

    Each entry shows the title, dates and details of one document. Keys that
    are absent from ``knowledge_base`` are still listed, using the placeholder
    values ``Unknown Title`` / ``Unknown Dates`` and empty details.

    Args:
        doc_keys: Keys of the documents to include, in output order.
        knowledge_base: Mapping of document key to document fields.

    Returns:
        The formatted entries, numbered from 1 and separated by a blank line.
    """
    formatted_docs = []
    for i, key in enumerate(doc_keys, start=1):
        doc = knowledge_base.get(key, {})
        title = doc.get("title", "Unknown Title")
        dates = doc.get("dates", "Unknown Dates")
        details_list = doc.get("details", [])
        if isinstance(details_list, list):
            # Coerce each item so non-string details (e.g. numbers coming from
            # JSON) do not make str.join raise TypeError.
            details = " ".join(str(item) for item in details_list)
        else:
            details = str(details_list)
        formatted_docs.append(
            f"{i}. Title: {title}\n"
            f"Dates: {dates}\n"
            f"Details: {details}\n"
        )
    return "\n".join(formatted_docs)


def calc_cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """Calculate the cosine similarity between two vectors.

    A small epsilon is added to each norm so that an all-zero vector yields a
    similarity of 0 instead of a division by zero.

    Args:
        v1: First vector.
        v2: Second vector.

    Returns:
        The dot product of ``v1`` and ``v2`` divided by the product of their
        (epsilon-padded) norms.
    """
    epsilon = 1e-10
    norm_v1 = np.linalg.norm(v1) + epsilon
    norm_v2 = np.linalg.norm(v2) + epsilon
    return np.dot(v1, v2) / (norm_v1 * norm_v2)


def get_top_chunk_keys(
    user_query_encoded: np.ndarray,
    keys_chunksEncoded: List[Tuple[Any, np.ndarray]],
    top_n: int = 3,
) -> List[Any]:
    """Return the keys of the chunks most similar to the query embedding.

    Args:
        user_query_encoded: Embedding of the user query.
        keys_chunksEncoded: ``(key, embedding)`` pairs, one per chunk.
        top_n: Maximum number of keys to return. Values of zero or less
            produce an empty list.

    Returns:
        Up to ``top_n`` keys ordered from most to least similar; chunks with
        equal similarity keep their input order.
    """
    # A negative top_n used directly as a slice bound would silently return
    # "all but the last |n|" keys, so treat non-positive requests as empty.
    if top_n <= 0:
        return []

    keys_cosine_similarities = [
        (key, calc_cosine_similarity(chunk_encoded, user_query_encoded))
        for key, chunk_encoded in keys_chunksEncoded
    ]
    # nlargest is equivalent to sorted(..., reverse=True)[:top_n] without
    # sorting every candidate.
    best = heapq.nlargest(top_n, keys_cosine_similarities, key=lambda x: x[1])
    return [key for key, _ in best]


def get_messages(
    docs: str, user_message: str, chat_log: List[Dict[str, str]]
) -> List[Dict[str, str]]:
    """Build the chat message list: prior history plus the grounded prompt.

    Only the ``role`` and ``content`` fields of each ``chat_log`` entry are
    copied. The final message has role ``user`` and contains the answering
    instructions, the retrieved ``docs`` as context, and the user's query.

    Args:
        docs: Formatted context documents to embed in the prompt.
        user_message: The user's current question.
        chat_log: Previous messages, each with ``role`` and ``content`` keys.

    Returns:
        A new list of ``{"role": ..., "content": ...}`` dictionaries.

    Raises:
        KeyError: If a ``chat_log`` entry lacks ``role`` or ``content``.
    """
    user_message_content = (
        "You are a knowledgeable assistant responsible for answering user "
        "questions about Matthew Schulz using only the provided context. "
        "Instructions: "
        "- Base your answer solely on the information in the “Context” section. "
        "Do not speculate or make assumptions. "
        "- If the question cannot be answered with the given context, respond "
        "with: “I don’t have this information. Please reach out to Matthew "
        "directly for more details.” "
        "- Structure your response clearly and concisely. "
        "- Present facts in chronological order, when relevant. "
        "- Exclude any information that is not directly relevant to the user's "
        "question. "
        f"Context: {docs} "
        f"User Query: {user_message}"
    )

    messages = [
        {"role": entry["role"], "content": entry["content"]} for entry in chat_log
    ]
    messages.append({"role": "user", "content": user_message_content})
    return messages