Spaces:
Sleeping
Sleeping
feat: overhaul model routing — GLM-5.1 primary, strict 4-model fallback chain, purge all stale refs
76e224e | """ | |
| Central configuration for ResearchPilot | |
| RULE: No hardcoded values anywhere else in this codebase. | |
| Every constant lives here. This makes the system easy to | |
| tune without hunting through multiple files. | |
| """ | |
| import os | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
# Load environment variables from the .env file.
# This must happen before anything else in this module reads os.environ.
load_dotenv()
# ------------------------------------------
# PROJECT PATHS
# ------------------------------------------
# Path(__file__)  = config/setting.py
# .parent         = config/
# .parent.parent  = researchpilot/  <- project root
ROOT_DIR = Path(__file__).parent.parent
DATA_DIR = ROOT_DIR / "data"
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
CHUNKS_DIR = DATA_DIR / "chunks"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
LOGS_DIR = ROOT_DIR / "logs"

# Create the working directories at import time if they don't exist, so the
# app works on any machine without manual setup. parents=True also creates
# DATA_DIR implicitly; exist_ok=True makes repeated imports harmless.
for directory in [RAW_DIR, PROCESSED_DIR, CHUNKS_DIR, EMBEDDINGS_DIR, LOGS_DIR]:
    directory.mkdir(parents=True, exist_ok=True)
# ------------------------------------------
# DATA INGESTION SETTINGS
# ------------------------------------------
ARXIV_CATEGORIES = ["cs.LG", "cs.AI", "stat.ML"]  # Machine Learning + AI
MAX_PAPERS_PER_FETCH = 100  # Papers per API call
TOTAL_PAPERS_TARGET = 700  # Total papers to collect
ARXIV_API_DELAY_SECONDS = 3.0  # ArXiv rate limit: be respectful
PDF_DOWNLOAD_TIMEOUT = 30  # Seconds before giving up on a PDF
MAX_DOWNLOAD_RETRIES = 3  # Retry failed downloads N times
# ------------------------------------------
# DOCUMENT PROCESSING SETTINGS
# ------------------------------------------
MIN_TEXT_LENGTH = 500  # Skip papers with fewer than 500 chars
MAX_TEXT_LENGTH = 500_000  # Skip papers larger than 500k chars (corrupted)
# ------------------------------------------
# CHUNKING SETTINGS
# ------------------------------------------
CHUNK_SIZE = 512  # Characters per chunk
CHUNK_OVERLAP = 50  # Overlap between consecutive chunks
MIN_CHUNK_SIZE = 100  # Discard chunks smaller than this
# ------------------------------------------
# EMBEDDING SETTINGS
# ------------------------------------------
EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
EMBEDDING_BATCH_SIZE = 32  # Process N chunks at once
EMBEDDING_DIMENSION = 768  # BGE-base output dimension
# ------------------------------------------
# VECTOR STORE SETTINGS
# ------------------------------------------
QDRANT_COLLECTION_NAME = "research_papers"
# Local storage path. Built from DATA_DIR (ROOT_DIR / "data") rather than
# repeating the "data" literal, so it follows the data directory if that moves.
# Kept as a str, as in the original definition.
QDRANT_PATH = str(DATA_DIR / "qdrant_db")
TOP_K_RETRIEVAL = 20  # Retrieve top 20 candidates
TOP_K_RERANK = 5  # Keep top 5 after reranking
# ------------------------------------------
# LLM SETTINGS
# ------------------------------------------
# Both keys are read from the process environment and are None when unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # Loaded from .env
HF_API_KEY = os.getenv("HF_API_KEY")
if HF_API_KEY:
    # Re-export the key under HF_TOKEN for the rest of the process.
    # NOTE(review): presumably the name Hugging Face client libraries
    # look up - confirm against the client in use.
    os.environ["HF_TOKEN"] = HF_API_KEY
LLM_MODEL_NAME = "zai-org/GLM-5.1"  # Primary model ID
LLM_TEMPERATURE = 0.1  # Low = More factual/consistent
LLM_MAX_TOKENS = 2048  # Max response tokens
# ------------------------------------------
# API SETTINGS
# ------------------------------------------
# NOTE(review): "0.0.0.0" is the all-interfaces address; presumably used as
# the server bind host - confirm at the call site.
API_HOST = "0.0.0.0"
API_PORT = 8000
API_RELOAD = True  # Auto-reload on code change (dev-only)
# ------------------------------------------
# LOGGING SETTINGS
# ------------------------------------------
LOG_LEVEL = "INFO"
LOG_FILE = LOGS_DIR / "researchpilot.log"  # Lives under the logs directory
LOG_ROTATION = "10 MB"  # Create new log file after 10MB
LOG_RETENTION = "7 days"  # Keep logs for 7 days