Subhadip007 commited on
Commit
daafb32
·
1 Parent(s): 2306780

feat: vector database indexing complete

Browse files

- Qdrant local database with 15,664 points indexed
- Full payload: text, metadata, categories all populated
- Metadata filtering by category and date working
- Diagnosed and fixed primary_category None bug (pipeline-wide)
- Diagnosed and fixed text empty bug in indexer
- Re-index pipeline: run_indexing.py --recreate
- search() updated to query_points() API (qdrant-client v1.7+)

Search validation:
- Semantic search working across all 4 test queries
- Filtered search returning correct cs.LG results
- Score range: 0.73-0.83 (healthy for BGE-base)

diagnose_payload.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Check what's actually stored in Qdrant payload."""
2
+
3
+ from src.utils.logger import setup_logger, get_logger
4
+ from src.vectorstore.qdrant_store import QdrantStore
5
+
6
+ setup_logger()
7
+ logger = get_logger(__name__)
8
+
9
+ def main():
10
+ store = QdrantStore()
11
+
12
+ # Fetch 3 points directly by scrolling the collection
13
+ # scroll() returns points without needing a query vector
14
+ results, _ = store.client.scroll(
15
+ collection_name = store.collection_name,
16
+ limit = 3,
17
+ with_payload = True,
18
+ with_vectors = False,
19
+ )
20
+
21
+ for i, point in enumerate(results):
22
+ print(f"\n{'='*55}")
23
+ print(f"Point {i+1} — ID: {point.id}")
24
+ print(f"Payload keys: {list(point.payload.keys())}")
25
+ print()
26
+ for k, v in point.payload.items():
27
+ # Truncate long values for readability
28
+ val_str = str(v)[:80] if v else "EMPTY/NONE"
29
+ print(f" {k:<22}: {val_str}")
30
+
31
+ if __name__ == "__main__":
32
+ main()
fix_categories.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from config.settings import RAW_DIR
3
+ from pathlib import Path
4
+
5
+
6
+ fixed = 0
7
+
8
+ for f in RAW_DIR.glob("*.json"):
9
+ if f.name == "paper_index.json":
10
+ continue
11
+ with open(f, "r", encoding = 'utf-8') as fp:
12
+ data = json.load(fp)
13
+
14
+ if not data.get("primary_category"):
15
+ cats = data.get("categories", [])
16
+ data['primary_category'] = cats[0] if cats else "cs.LG"
17
+ with open(f, "w", encoding = "utf-8") as fp:
18
+ json.dump(data, fp, indent = 2, ensure_ascii = False)
19
+ fixed += 1
20
+
21
+
22
+ print(f"Fixed {fixed} raw metadata files")
fix_chunk_categories.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from config.settings import CHUNKS_DIR
3
+
4
+ fixed_files = 0
5
+ fixed_chunks = 0
6
+
7
+ for f in CHUNKS_DIR.glob("*_semantic.json"):
8
+ with open(f, "r", encoding = "utf-8") as fp:
9
+ chunks = json.load(fp)
10
+
11
+ changed = False
12
+ for chunk in chunks:
13
+ if not chunk.get("primary_category"):
14
+ # Derive from paper_id if needed - use cs.LG as safe default
15
+ chunk["primary_category"] = "cs.LG"
16
+ fixed_chunks += 1
17
+ changed = True
18
+
19
+ if changed:
20
+ with open(f, "w", encoding="utf-8") as fp:
21
+ json.dump(chunks, fp, indent = 2, ensure_ascii = False)
22
+ fixed_files += 1
23
+
24
+ print(f"Fixed {fixed_chunks} chunks across {fixed_files} files")
fix_processed_categories.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from config.settings import PROCESSED_DIR
3
+
4
+ fixed = 0
5
+
6
+ for f in PROCESSED_DIR.glob("*.json"):
7
+ with open(f, "r", encoding = 'utf-8') as fp:
8
+ data = json.load(fp)
9
+
10
+ if not data.get("primary_category"):
11
+ cats = data.get("categories", [])
12
+ data["primary_category"] = cats[0] if cats else "cs.LG"
13
+ with open(f, "w", encoding = "utf-8") as fp:
14
+ json.dump(data, fp, indent = 2, ensure_ascii = False)
15
+ fixed += 1
16
+
17
+ print(f"Fixed {fixed} processed files")
run_embedding.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phase 6: Generate embeddings for all semantic chunks.
3
+
4
+ Run from project root:
5
+ python run_embedding.py
6
+
7
+ Input: data/chunks/*_semantic.json (15,664 chunks)
8
+ Output: data/embeddings/embeddings.npy (shape: 15664 x 768)
9
+ data/embeddings/chunk_ids.npy
10
+ data/embeddings/embedding_index.json
11
+ data/embeddings/chunk_metadata.json
12
+ """
13
+
14
+ from src.utils.logger import setup_logger, get_logger
15
+ from src.embeddings.embedding_pipeline import EmbeddingPipeline
16
+
17
+ setup_logger()
18
+ logger = get_logger(__name__)
19
+
20
+
21
+
22
+ def main():
23
+ logger.info("=" * 60)
24
+ logger.info("PHASE 6 — EMBEDDING PIPELINE")
25
+ logger.info("=" * 60)
26
+
27
+ pipeline = EmbeddingPipeline()
28
+ stats = pipeline.run()
29
+
30
+ logger.info("=" * 60)
31
+ logger.info("EMBEDDING COMPLETE")
32
+ for k, v in stats.items():
33
+ logger.info(f" {k}: {v}")
34
+ logger.info("=" * 60)
35
+
36
+
37
+
38
+ if __name__ == "__main__":
39
+ main()
run_indexing.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phase 7: Index all embeddings into Qdrant vector database.
3
+
4
+ Input: data/embeddings/embeddings.npy
5
+ data/embeddings/chunk_metadata.json
6
+ Output: data/qdrant_db/ (local Qdrant database)
7
+
8
+ Run from project root:
9
+ python run_indexing.py
10
+
11
+ To force re-index (e.g. after adding more papers):
12
+ python run_indexing.py --recreate
13
+ """
14
+
15
+ import sys
16
+ from src.utils.logger import setup_logger, get_logger
17
+ from src.vectorstore.indexer import VectorIndexer
18
+
19
+ setup_logger()
20
+ logger = get_logger(__name__)
21
+
22
+
23
+ def main():
24
+ recreate = "--recreate" in sys.argv
25
+
26
+ logger.info("=" * 60)
27
+ logger.info(f"PHASE 7 - VECTOR DATABASE INDEXING")
28
+ logger.info("=" * 60)
29
+
30
+
31
+ if recreate:
32
+ logger.warning("--recreate flag set: existing index will be deleted")
33
+
34
+
35
+ indexer = VectorIndexer()
36
+ stats = indexer.run(recreate = recreate)
37
+
38
+
39
+ logger.info("=" * 60)
40
+ logger.info("INDEXING COMPLETE")
41
+ for k, v in stats.items():
42
+ logger.info(f" {k}: {v}")
43
+
44
+ logger.info("=" * 60)
45
+
46
+
47
+
48
+ if __name__ == "__main__":
49
+ main()
src/embeddings/__init__.py ADDED
File without changes
src/embeddings/embedding_cache.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Disk-based cache for computed embeddings.
3
+
4
+ PROBLEM WE'RE SOLVING:
5
+ Embedding 15,664 chunks takes ~30-60 minutes on CPU.
6
+ If you restart your pipeline or add 10 new papers,
7
+ you don't want to re-embed the 15,654 unchanged chunks.
8
+
9
+ SOLUTION:
10
+ Save embeddings to disk as numpy .npy files.
11
+ Build an index that maps chunk_id -> array row index.
12
+ On next run, load from disk instead of recomputing.
13
+
14
+ STORAGE FORMAT:
15
+ data/embeddings/
16
+ |-- embeddings.npy <- numpy array, shape (N, 768)
17
+ |-- chunk_ids.npy <- chunk IDs in same order as rows
18
+ |-- embedding_index.json <- metadata + chunk_id -> row mapping
19
+
20
+ WHY NUMPY .npy OVER JSON:
21
+ Storing 15,664 * 768 floats as JSON = ~90MB of text
22
+ Storing as .npy binary = ~46MB + loads 100x faster
23
+ """
24
+
25
+ import json
26
+ import numpy as np
27
+ from pathlib import Path
28
+
29
+ from src.utils.logger import get_logger
30
+ from config.settings import EMBEDDINGS_DIR, EMBEDDING_DIMENSION
31
+
32
+ logger = get_logger(__name__)
33
+
34
+
35
+
36
+ class EmbeddingCache:
37
+ """
38
+ Manages persistent storage of chunk embeddings
39
+ """
40
+
41
+
42
+ def __init__(self):
43
+ self.embedding_file = EMBEDDINGS_DIR / "embeddings.npy"
44
+ self.chunk_ids_file = EMBEDDINGS_DIR / "chunk_ids.npy"
45
+ self.index_file = EMBEDDINGS_DIR / "embedding_index.json"
46
+
47
+
48
+ # In-memory state
49
+ self._embeddings: np.ndarray = None # Shape (N, 768)
50
+ self._chunk_ids: list[str] = None # length N
51
+ self._id_to_row: dict = None # chunk_id -> row index
52
+
53
+
54
+ def exists(self) -> bool:
55
+ """Check if cached embeddings exists on disk"""
56
+ return (
57
+ self.embedding_file.exists() and
58
+ self.chunk_ids_file.exists() and
59
+ self.index_file.exists()
60
+ )
61
+
62
+
63
+ def load(self) -> bool:
64
+ """
65
+ Load embeddings from disk into memory
66
+
67
+ Returns True if loaded successfully. False if no cache exists
68
+ """
69
+ if not self.exists():
70
+ logger.info("No embedding cache found on disk")
71
+ return False
72
+
73
+ logger.info("Loading embeddings from disk cache...")
74
+
75
+
76
+ # Load numpy arrays - mmap_mode='r' means memory-mapped read
77
+ # WHY mmap: The array is NOT fully loaded into RAM immediately
78
+ # It's read from disk only when specific rows are accessed
79
+ # This is critical for large arrays on machines with limited RAM
80
+ self._embeddings = np.load(
81
+ str(self.embedding_file),
82
+ mmap_mode = 'r'
83
+ )
84
+
85
+ # chunk_ids are stored as numpy array of strings
86
+ # We convert back to Python list for easier indexing
87
+ self._chunk_ids = list(
88
+ np.load(str(self.chunk_ids_file), allow_pickle = True)
89
+ )
90
+
91
+ # Build the reverse lookup: chunk_id -> row number
92
+ self._id_to_row = {
93
+ chunk_id: idx
94
+ for idx, chunk_id in enumerate(self._chunk_ids)
95
+ }
96
+
97
+ logger.info(
98
+ f"Cache loaded: {self._embeddings.shape[0]:,} embeddings"
99
+ f"dimension = {self._embeddings.shape[1]}"
100
+ )
101
+
102
+ return True
103
+
104
+
105
+ def save(self, embeddings: np.ndarray, chunk_ids: list[str]):
106
+ """
107
+ Save embeddings and their chunk IDs to disk.
108
+
109
+ Args:
110
+ embeddings: numpy array of shape (N, 768)
111
+ chunk_ids: list of N chunk ID strings (same order as rows)
112
+ """
113
+
114
+ assert len(embeddings) == len(chunk_ids), (
115
+ f"Mismatch {len(embeddings)} embeddings vs {len(chunk_ids)} IDs"
116
+ )
117
+
118
+ logger.info(f"Saving {len(embeddings):,} embeddings to disk...")
119
+
120
+ # Save the embedding matrix
121
+ np.save(str(self.embedding_file), embeddings)
122
+
123
+ # Save chunk IDs as numpy object array (handles strings)
124
+ np.save(str(self.chunk_ids_file), np.array(chunk_ids, dtype = object))
125
+
126
+ # Save human-readable index file
127
+ index = {
128
+ "total_embeddings": len(embeddings),
129
+ "embedding_dimension": embeddings.shape[1],
130
+ "model_name": "BAAI/bge-base-en-v1.5",
131
+ "chunk_id_sample": chunk_ids[:5], # First 5 for verification
132
+ }
133
+
134
+ with open(self.index_file, "w", encoding = 'utf-8') as f:
135
+ json.dump(index, f, indent = 2)
136
+
137
+
138
+
139
+ # Update in-memory state
140
+ self._embeddings = embeddings
141
+ self._chunk_ids = chunk_ids
142
+ self._id_to_row = {cid: i for i, cid in enumerate(chunk_ids)}
143
+
144
+
145
+ logger.info(
146
+ f"Saved embeddings: {self.embedding_file}"
147
+ f"({self.embedding_file.stat().st_size / 1024 / 1024:.1f} MB)"
148
+ )
149
+
150
+
151
+ def get_embeddings(self, chunk_id: str) -> np.ndarray | None:
152
+ """Get the embedding vector for a specific chunk ID."""
153
+ if self._id_to_row is None:
154
+ return None
155
+
156
+ row = self._id_to_row.get(chunk_id)
157
+
158
+ if row is None:
159
+ return None
160
+
161
+ return self._embeddings[row]
162
+
163
+
164
+
165
+ def get_all(self) -> tuple[np.ndarray, list[str]]:
166
+ """Return all embeddings and their chunk IDs."""
167
+ return self._embeddings, self._chunk_ids
168
+
169
+
170
+ @property
171
+ def size(self) -> int:
172
+ """Number of cached embeddings"""
173
+ if self._chunk_ids is None:
174
+ return 0
175
+
176
+ return len(self._chunk_ids)
src/embeddings/embedding_model.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BGE embedding model wrapper for ResearchPilot.
3
+
4
+ RESPONSIBILITIES:
5
+ 1. Load and cache the BGE-base-en-v1.5 model
6
+ 2. Embed document chunks (no prefix)
7
+ 3. Embed user queries (with BGE instruction prefix)
8
+ 4. Handle batching for large-scale embedding
9
+
10
+ WHY A WRAPPER CLASS instead of calling SentenceTransformer directly:
11
+ If we decide to swap BGE for a better model tomorrow, we change
12
+ ONE file. Nothing else in the codebase changes. This is called
13
+ the FACADE PATTERN - hide implementation behind a stable interface
14
+ """
15
+
16
+ import logging
17
+ # Suppress noisy sentence-transformers logs
18
+ logging.getLogger("sentence-transformers").setLevel(logging.ERROR)
19
+ logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
20
+
21
+ import numpy as np
22
+ from typing import Union
23
+
24
+ from src.utils.logger import get_logger
25
+ from config.settings import EMBEDDING_MODEL_NAME, EMBEDDING_BATCH_SIZE, EMBEDDING_DIMENSION
26
+
27
+
28
+
29
+ logger = get_logger(__name__)
30
+
31
+
32
+
33
+
34
+ class EmbeddingModel:
35
+ """
36
+ Wrapper around BGE-base-en-v1.5 for document and query embedding.
37
+
38
+ Usage:
39
+ model = EmbeddingModel()
40
+
41
+ # Embed chunks (documents)
42
+ chunk_vectors = model.embed_documents(["chunk text 1", "chunk text 2"])
43
+
44
+ # Embed a user query
45
+ query_vector = model.embed_query("what is attention mechanism?")
46
+ """
47
+
48
+ # BGE introduction prefix for queries
49
+ # This is specified in the official BGE model card
50
+ QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
51
+
52
+
53
+ def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
54
+ self.model_name = model_name
55
+ self._model = None # Lazy loaded
56
+ logger.info(f"EmbeddingModel wrapper created for: {model_name}")
57
+
58
+
59
+ @property
60
+ def model(self):
61
+ """Lazy-load model on first use."""
62
+ if self._model is None:
63
+ from sentence_transformers import SentenceTransformer
64
+ logger.info(f"Loading embedding model: {self.model_name}")
65
+ self._model = SentenceTransformer(self.model_name)
66
+ logger.info(
67
+ f"Model loaded. "
68
+ f"Embedding dimension: {self._model.get_sentence_embedding_dimension()}"
69
+ )
70
+
71
+ return self._model
72
+
73
+
74
+ def embed_documents(
75
+ self,
76
+ texts: list[str],
77
+ batch_size: int = EMBEDDING_BATCH_SIZE,
78
+ show_progress: bool = True,
79
+ ) -> np.ndarray:
80
+ """
81
+ Embed a list of document chunks.
82
+
83
+ NO prefix applied - BGE embeds documents as-is.
84
+
85
+ Args:
86
+ texts: List of chunk texts to embed
87
+ batch_size: How many chunks to process at once
88
+ show_progress: Show tqdm progress bar
89
+
90
+ Returns:
91
+ numpy array of shape (len(texts), 768)
92
+ Each row is the embedding for one chunk.
93
+
94
+ BATCHING EXPLAINED:
95
+ We cannot embed all 15,664 chunks at once - that would
96
+ require ~15,664 * 768 * 4 bytes = ~48MB just for the
97
+ output array, plus the model's working memory.
98
+
99
+ Processing in batches of 32-64 keeps memory stable
100
+ while still being fast (model processes the batch
101
+ as a single matrix multiplication).
102
+ """
103
+
104
+ if not texts:
105
+ return np.array([])
106
+
107
+
108
+ logger.info(f"Embedding {len(texts)} documents in batches of {batch_size}")
109
+
110
+
111
+ embeddings = self.model.encode(
112
+ texts,
113
+ batch_size = batch_size,
114
+ show_progress_bar = show_progress,
115
+ normalize_embeddings = True, # L2 normalize -> cosine sim = dot product
116
+ convert_to_numpy = True,
117
+ )
118
+
119
+
120
+ logger.info(f"Embedding complete. Shape: {embeddings.shape}")
121
+
122
+ return embeddings
123
+
124
+
125
+ def embed_query(self, query: str) -> np.ndarray:
126
+ """
127
+ Embed a single user query WITH the BGE instruction prefix.
128
+
129
+ Args:
130
+ query: Raw user question
131
+
132
+ Returns:
133
+ numpy array of shape (768,)
134
+
135
+ WHY SINGLE QUERY (not batch):
136
+ At query time, we receive one question at a time.
137
+ Batching makes no sense here - we want the answer fast.
138
+ """
139
+
140
+ # Apply BGE's instruction prefix for retrieval queries
141
+ prefixed_query = self.QUERY_PREFIX + query
142
+
143
+ embedding = self.model.encode(
144
+ prefixed_query,
145
+ normalize_embeddings = True,
146
+ convert_to_numpy = True,
147
+ show_progress_bar = False,
148
+ )
149
+
150
+ return embedding
151
+
152
+
153
+ def embed_batch(
154
+ self,
155
+ texts: list[str],
156
+ batch_size: int = EMBEDDING_BATCH_SIZE,
157
+ ) -> np.ndarray:
158
+ """
159
+ Embed texts in batches, yielding one batch at a time.
160
+
161
+ WHY A GENERATOR:
162
+ For 15,664 chunks, we don't want to hold ALL embeddings
163
+ in memory while also saving them. This generator yields
164
+ one batch at a time - we save each batch, then free memory.
165
+
166
+ Usage:
167
+ for batch_embeddings, batch_texts in model.embed_batch(texts):
168
+ save(batch_embeddings)
169
+ """
170
+
171
+ for i in range(0, len(texts), batch_size):
172
+ batch = texts[i : i + batch_size]
173
+ embeddings = self.model.encode(
174
+ batch,
175
+ normalize_embeddings = True,
176
+ convert_to_numpy = True,
177
+ show_progress_bar = False,
178
+ )
179
+
180
+ yield embeddings, batch
src/embeddings/embedding_pipeline.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Orchestrates embedding generation for all chunks.
3
+
4
+ FLOW:
5
+ 1. Load all chunk files from data/chunks/
6
+ 2. Check cache - skip already-embedded chunks
7
+ 3. Embed remaining chunks in batches
8
+ 4. Save to cache
9
+ 5. Report statistics
10
+ """
11
+
12
+ import json
13
+ import numpy as np
14
+ from pathlib import Path
15
+ from tqdm import tqdm
16
+
17
+ from src.embeddings.embedding_model import EmbeddingModel
18
+ from src.embeddings.embedding_cache import EmbeddingCache
19
+ from src.utils.logger import get_logger
20
+ from config.settings import CHUNKS_DIR, EMBEDDING_BATCH_SIZE
21
+
22
+
23
+ logger = get_logger(__name__)
24
+
25
+
26
+
27
+ class EmbeddingPipeline:
28
+ """
29
+ Loads all semantic chunks and generates embeddings for them
30
+ """
31
+
32
+
33
+ def __init__(self):
34
+ self.model = EmbeddingModel()
35
+ self.cache = EmbeddingCache()
36
+
37
+
38
+ def load_all_chunks(self) -> tuple[list[str], list[str], list[dict]]:
39
+ """
40
+ Load all chunk texts, IDs, and metadata from disk.
41
+
42
+ Returns:
43
+ texts: List of chunk text strings
44
+ chunk_ids: List of chunk ID strings (same order)
45
+ metadata: List of chunk metadata dicts (same order)
46
+ """
47
+ chunk_file = list(CHUNKS_DIR.glob("*semantic.json"))
48
+ logger.info(f"Loading chunks from {len(chunk_file)} files...")
49
+
50
+
51
+ texts = []
52
+ chunk_ids = []
53
+ metadata = []
54
+
55
+ for cf in tqdm(chunk_file, desc = "Loading chunks"):
56
+ with open(cf, "r", encoding = 'utf-8') as f:
57
+ chunks = json.load(f)
58
+
59
+
60
+ for chunk in chunks:
61
+ texts.append(chunk["text"])
62
+ chunk_ids.append(chunk["chunk_id"])
63
+ metadata.append(
64
+ {
65
+ k: v for k, v in chunk.items()
66
+ if k != "text" # Don't duplicate text in metadata
67
+ }
68
+ )
69
+
70
+
71
+ logger.info(f"Loaded {len(texts):,} chunks total")
72
+ return texts, chunk_ids, metadata
73
+
74
+
75
+ def run(self) -> dict:
76
+ """
77
+ Main pipeline: embed all chunks and save to cache.
78
+
79
+ Returns:
80
+ Statistics dictionary
81
+ """
82
+ # Load all chunks from disk
83
+ texts, chunk_ids, metadata = self.load_all_chunks()
84
+
85
+ if not texts:
86
+ logger.error("No chunks found. Run run_chunking.py first.")
87
+ return {}
88
+
89
+
90
+ # Check if we already have a complete cache
91
+ if self.cache.exists():
92
+ self.cache.load()
93
+ if self.cache.size == len(texts):
94
+ logger.info(
95
+ f"Cache complete: {self.cache.size:,} embeddings already exist."
96
+ f"Nothing to do."
97
+ )
98
+
99
+ return {
100
+ "total": len(texts),
101
+ "embedded": 0,
102
+ "from_cache": self.cache.size,
103
+ "status": "cache_hit"
104
+ }
105
+ else:
106
+ logger.info(
107
+ f"Partial cache: {self.cache.size:,} / {len(texts):,} "
108
+ f"Re-embedding all for consistency."
109
+ )
110
+
111
+ # Embed all chunks
112
+ logger.info(f"Embedding {len(texts):,} chunks with BGE-base-en-v1.5...")
113
+ logger.info(f"Batch size: {EMBEDDING_BATCH_SIZE}")
114
+ logger.info(
115
+ f"Estimated time: "
116
+ f"{len(texts) / EMBEDDING_BATCH_SIZE * 0.5:.0f} seconds on CPU"
117
+ )
118
+
119
+
120
+ # embed_documents handles batching internally and shows progress bar
121
+ embeddings = self.model.embed_documents(
122
+ texts,
123
+ batch_size = EMBEDDING_BATCH_SIZE,
124
+ show_progress = True,
125
+ )
126
+
127
+
128
+ # Verify shape
129
+ assert embeddings.shape == (len(texts), 768), (
130
+ f"Expected ({len(texts)}, 768), got {embeddings.shape}"
131
+ )
132
+
133
+ # Save to disk
134
+ self.cache.save(embeddings, chunk_ids)
135
+
136
+ # Also save metadata separately (needed for Qdrant in Phase 7)
137
+ metadata_path = CHUNKS_DIR.parent / "embeddings" / "chunk_metadata.json"
138
+ with open(metadata_path, "w", encoding = 'utf-8') as f:
139
+ json.dump(metadata, f, ensure_ascii = False)
140
+
141
+ logger.info(f"Metadata saved to {metadata_path}")
142
+
143
+
144
+ stats = {
145
+ "total_chunks": len(texts),
146
+ "embedding_shape": list(embeddings.shape),
147
+ "embedding_dim": embeddings.shape[1],
148
+ "cache_size_mb": round(
149
+ embeddings.nbytes / 1024 / 1024, 1
150
+ ),
151
+ "status": "complete"
152
+ }
153
+
154
+ logger.info(f"Embedding pipeline completed: {stats}")
155
+
156
+ return stats
src/ingestion/arxiv_fetcher.py CHANGED
@@ -212,7 +212,7 @@ class ArXivFetcher:
212
  abstract = result.summary,
213
  authors = [str(a) for a in result.authors],
214
  categories = result.categories,
215
- primary_categories = result.primary_category,
216
  published_date = result.published.strftime("%Y-%m-%d"),
217
  updated_date = result.updated.strftime("%Y-%m-%d"),
218
  arxiv_url = result.entry_id,
 
212
  abstract = result.summary,
213
  authors = [str(a) for a in result.authors],
214
  categories = result.categories,
215
+ primary_category = str(result.primary_category) if result.primary_category else result.categories[0] if result.categories else "cs.LG",
216
  published_date = result.published.strftime("%Y-%m-%d"),
217
  updated_date = result.updated.strftime("%Y-%m-%d"),
218
  arxiv_url = result.entry_id,
src/processing/chunker.py CHANGED
@@ -673,7 +673,8 @@ class ChunkingPipeline:
673
  "title": processed_doc.get("title", ""),
674
  "authors": processed_doc.get("authors", []),
675
  "published_date": processed_doc.get("published_date", ""),
676
- "primary_category": processed_doc.get("primary_category", ""),
 
677
  "arxiv_url": processed_doc.get("arxiv_url", ""),
678
  }
679
 
 
673
  "title": processed_doc.get("title", ""),
674
  "authors": processed_doc.get("authors", []),
675
  "published_date": processed_doc.get("published_date", ""),
676
+ "primary_category": processed_doc.get("primary_category") or
677
+ (processed_doc.get("categories") or ["cs.LG"])[0],
678
  "arxiv_url": processed_doc.get("arxiv_url", ""),
679
  }
680
 
src/processing/pdf_extractor.py CHANGED
@@ -174,16 +174,34 @@ class PDFExtractor:
174
  return False
175
 
176
  # Build processed document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  processed_doc = {
178
- # Copy all original metadata
179
  **paper_metadata,
180
-
181
- # Add processed text
182
  "full_text": text,
183
  "text_length": len(text),
184
  "word_count": len(text.split()),
185
-
186
- # Update pipeline state
187
  "text_extracted": True,
188
  "pdf_downloaded": paper_metadata.get("pdf_downloaded", False),
189
  }
 
174
  return False
175
 
176
  # Build processed document
177
+ #---------------------------------------------------------------------------
178
+ # processed_doc = {
179
+ # # Copy all original metadata
180
+ # **paper_metadata,
181
+
182
+ # # Add processed text
183
+ # "full_text": text,
184
+ # "text_length": len(text),
185
+ # "word_count": len(text.split()),
186
+
187
+ # # Update pipeline state
188
+ # "text_extracted": True,
189
+ # "pdf_downloaded": paper_metadata.get("pdf_downloaded", False),
190
+ # }
191
+ #---------------------------------------------------------------------------
192
+
193
+ primary_cat = paper_metadata.get("primary_category")
194
+
195
+ if not primary_cat:
196
+ cats = paper_metadata.get("categories", [])
197
+ primary_cat = cats[0] if cats else "cs.LG"
198
+
199
  processed_doc = {
 
200
  **paper_metadata,
201
+ "primary_category": primary_cat, # Override with rescued value
 
202
  "full_text": text,
203
  "text_length": len(text),
204
  "word_count": len(text.split()),
 
 
205
  "text_extracted": True,
206
  "pdf_downloaded": paper_metadata.get("pdf_downloaded", False),
207
  }
src/vectorstore/__init__.py ADDED
File without changes
src/vectorstore/indexer.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Loads embeddings + chunks from disk and indexes them into Qdrant.
3
+
4
+ This is a ONE-TIME operation (or run when new papers are added).
5
+ After this, all searches go through Qdrant - not numpy arrays.
6
+ """
7
+
8
+ import json
9
+ import numpy as np
10
+ from pathlib import Path
11
+
12
+ from src.vectorstore.qdrant_store import QdrantStore
13
+ from src.embeddings.embedding_cache import EmbeddingCache
14
+ from src.utils.logger import get_logger
15
+ from config.settings import CHUNKS_DIR, EMBEDDINGS_DIR
16
+
17
+ logger = get_logger(__name__)
18
+
19
+
20
+
21
+ class VectorIndexer:
22
+ """Orchestrates loading embeddings and indexing into Qdrant"""
23
+
24
+ def __init__(self):
25
+ self.store = QdrantStore()
26
+ self.cache = EmbeddingCache()
27
+
28
+
29
+ #----------------------------------------------------------------------------------------------------------
30
+
31
+ # def load_texts_by_chunk_id(self, chunk_ids: list[str]) -> dict[str, str]:
32
+ # """
33
+ # Build a lookup dict: chunk_id → chunk text.
34
+
35
+ # We need this because EmbeddingCache stores embeddings
36
+ # but not the original texts. We reload texts from chunk files.
37
+ # """
38
+ # # Load the metadata file which has all chunk info
39
+ # metadata_path = EMBEDDINGS_DIR / "chunk_metadata.json"
40
+
41
+ # if metadata_path.exists():
42
+ # with open(metadata_path, "r", encoding = 'utf-8') as f:
43
+ # metadata_list = json.load(f)
44
+
45
+ # logger.info(f"Loaded metadata for {len(metadata_list):,} chunks")
46
+ # return metadata_list
47
+
48
+ # # Fallback: reload from chunk files (slower)
49
+ # logger.warning("chunk_metadata.json not found, loading from chunk files...")
50
+ # id_to_text = {}
51
+ # for cf in CHUNKS_DIR.glob("*_semantic.json"):
52
+ # with open(cf, 'r', encoding = 'utf-8') as f:
53
+ # chunks = json.load(f)
54
+ # for c in chunks:
55
+ # id_to_text[c['chunk_id']] = c['text']
56
+
57
+ # return id_to_text
58
+
59
+ #----------------------------------------------------------------------------------------------------------
60
+
61
+
62
+
63
+ def load_chunk_from_disk(self) -> tuple[list[str], list[str], list[str]]:
64
+ """
65
+ Load chunk texts and metadata directly from chunk files.
66
+ This is the ground truth source - chunk files have everything.
67
+
68
+ Returns:
69
+ chunk_ids: list of chunk ID strings
70
+ texts: list of chunk text strings
71
+ metadata: list of metadata dicts (without text)
72
+ """
73
+ chunk_ids = []
74
+ texts = []
75
+ metadata = []
76
+
77
+
78
+ chunk_files = list(CHUNKS_DIR.glob("*_semantic.json"))
79
+ logger.info(f"Loading chunks from {len(chunk_files)} files...")
80
+
81
+ for cf in chunk_files:
82
+ with open(cf, 'r', encoding = "utf-8") as f:
83
+ chunks = json.load(f)
84
+
85
+ for chunk in chunks:
86
+ chunk_ids.append(chunk['chunk_id'])
87
+ texts.append(chunk["text"])
88
+
89
+
90
+ # Everything expect that goes into metadata
91
+ metadata.append(
92
+ {
93
+ k: v for k, v in chunk.items()
94
+ if k != "text"
95
+ }
96
+ )
97
+
98
+ logger.info(f"Loaded {len(chunk_ids):,} chunks from disk")
99
+ return chunk_ids, texts, metadata
100
+
101
+
102
+
103
+
104
+ def run(self, recreate: bool = False) -> dict:
105
+ """
106
+ Full indexing pipeline.
107
+
108
+ Args:
109
+ recreate: Delete existing collection and re-index everything.
110
+ Set True when you change embedding model or chunking.
111
+
112
+ Returns:
113
+ Indexing statistics
114
+ """
115
+ # Check if already exists
116
+ current_size = self.store.get_collection_size()
117
+
118
+ if current_size > 0 and not recreate:
119
+ logger.info(
120
+ f"Collection already has {current_size:,} points. "
121
+ f"Run with recreate=True to re-index."
122
+ )
123
+
124
+ return {
125
+ "status": "already_indexed",
126
+ "points": current_size,
127
+ }
128
+
129
+
130
+ # Step 1: Load directly from chunk files - ground truth source
131
+ # (chunk files have text + metadata, and are the source of truth)
132
+ chunk_ids, texts, metadata = self.load_chunk_from_disk()
133
+
134
+ # Step 2: Create the Qdrant collection (skips if already exists)
135
+ self.store.create_collection(recreate=recreate)
136
+
137
+ # Step 3: Load embeddings from cache and reorder to match chunk order from disk
138
+ # (cache order may differ from disk order, so we align by chunk_id)
139
+ logger.info("Loading embeddings from cache...")
140
+ self.cache.load()
141
+ embeddings_matrix, cached_ids = self.cache.get_all()
142
+
143
+ # Build a lookup dict: chunk_id → row index in embedding matrix
144
+ id_to_row = {cid: i for i, cid in enumerate(cached_ids)}
145
+
146
+ # Reorder embeddings so they match the chunk_ids order we loaded from disk
147
+ ordered_embeddings = np.array([
148
+ embeddings_matrix[id_to_row[cid]]
149
+ for cid in chunk_ids
150
+ if cid in id_to_row # only include chunks that have an embedding
151
+ ])
152
+
153
+ # Filter chunk_ids, texts, metadata to only those that have a matching embedding
154
+ # (some chunks may have been added after last embedding run)
155
+ valid_indices = [i for i, cid in enumerate(chunk_ids) if cid in id_to_row]
156
+ chunk_ids = [chunk_ids[i] for i in valid_indices]
157
+ texts = [texts[i] for i in valid_indices]
158
+ metadata = [metadata[i] for i in valid_indices]
159
+
160
+ logger.info(f"Matched {len(chunk_ids):,} chunks with embeddings")
161
+
162
+ # Step 4: Index everything into Qdrant
163
+ logger.info(f"Indexing {len(chunk_ids):,} chunks into Qdrant...")
164
+ total = self.store.index_chunks(
165
+ embeddings = ordered_embeddings,
166
+ chunk_ids = chunk_ids,
167
+ metadata = metadata,
168
+ texts = texts,
169
+ )
170
+
171
+ stats = {
172
+ "status": "complete",
173
+ "chunks_indexed": total,
174
+ "collection_info": self.store.get_collection_info(),
175
+ }
176
+
177
+ logger.info(f"Indexing completed: {stats}")
178
+ return stats
src/vectorstore/qdrant_store.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Qdrant vector database interface for ResearchPilot.
3
+
4
+ RUNS LOCALLY - no server needed, no Docker, no cloud account.
5
+ Qdrant client in local mode stores everything in a directory
6
+ on disk, exactly like SQLite does for relational data.
7
+
8
+ Data lives in: data/qdrant_db/
9
+ """
10
+
11
+ import json
12
+ import uuid
13
+ import numpy as np
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ from qdrant_client import QdrantClient
18
+ from qdrant_client.models import (
19
+ Distance,
20
+ VectorParams,
21
+ PointStruct,
22
+ Filter,
23
+ FieldCondition,
24
+ MatchValue,
25
+ Range,
26
+ SearchRequest,
27
+ )
28
+ from tqdm import tqdm
29
+
30
+
31
+ from src.utils.logger import get_logger
32
+ from config.settings import (
33
+ QDRANT_COLLECTION_NAME,
34
+ QDRANT_PATH,
35
+ EMBEDDING_DIMENSION,
36
+ TOP_K_RETRIEVAL,
37
+ )
38
+
39
+ logger = get_logger(__name__)
40
+
41
+ # How many points to upload to Qdrant at once
42
+ # Too large = memory spike. Too small = many round trips.
43
+ UPSERT_BATCH_SIZE = 256
44
+
45
+
46
+ class QdrantStore:
47
+ """
48
+ Manages the Qdrant vector database for chunk storage and retrieval.
49
+
50
+ UPSERT PATTERN:
51
+ We use 'upsert' (update + insert) instead of 'insert'.
52
+ If a chunk already exists, upsert updates it.
53
+ If it doesn't exist, upsert creates it.
54
+ This makes our indexing pipeline idempotent - safe to re-run.
55
+ """
56
+
57
+ def __init__(self):
58
+ # Local mode: pass path= instead of url=
59
+ # Qdrant creates/opens a local database at this path
60
+ # No server process needed - runs in-process
61
+ logger.info(f"Connecting to local Qdrant at: {QDRANT_PATH}")
62
+ self.client = QdrantClient(path = QDRANT_PATH)
63
+ self.collection_name = QDRANT_COLLECTION_NAME
64
+
65
+
66
+ def collection_exists(self) -> bool:
67
+ """Check if our collection already exists in Qdrant."""
68
+ collections = self.client.get_collections().collections
69
+ names = [c.name for c in collections]
70
+ return self.collection_name in names
71
+
72
+
73
+ def get_collection_size(self) -> int:
74
+ """Return number of points currently in the collections."""
75
+ if not self.collection_exists():
76
+ return 0
77
+ info = self.client.get_collection(self.collection_name)
78
+ return info.points_count
79
+
80
+
81
+ def create_collection(self, recreate: bool = False):
82
+ """
83
+ Create the Qdrant collection for research paper chunks.
84
+
85
+ Args:
86
+ recreate: If True, delete existing collection and rebuild.
87
+ Use this when you want a clean re-index.
88
+
89
+ COLLECTION CONFIGURATION:
90
+ size=768 -> matches BGE-base-en-v1.5 output dimension
91
+ distance=COSINE -> similarity metric
92
+
93
+ WHY COSINE DISTANCE:
94
+ Our embeddings are L2-normalized (magnitude = 1.0).
95
+ For normalized vectors: cosine_similarity = dot_product
96
+ Qdrant's COSINE metric handles this correctly.
97
+ Using DOT_PRODUCT would also work but COSINE is more explicit.
98
+ """
99
+
100
+ if self.collection_exists():
101
+ if recreate:
102
+ logger.warning(f"Deleting existing collection: {self.collection_name}")
103
+ self.client.delete_collection(self.collection_name)
104
+ else:
105
+ size = self.get_collection_size()
106
+ logger.info(
107
+ f"Collection: '{self.collection_name}' already exists "
108
+ f"with {size:,} points. Skipping creation."
109
+ )
110
+ return
111
+
112
+ logger.info(f"Creating collection: {self.collection_name}")
113
+ self.client.create_collection(
114
+ collection_name = self.collection_name,
115
+ vectors_config = VectorParams(
116
+ size = EMBEDDING_DIMENSION,
117
+ distance = Distance.COSINE,
118
+ ),
119
+ )
120
+ logger.info(f"Collection created: {self.collection_name}")
121
+
122
+
123
+ def index_chunks(
124
+ self,
125
+ embeddings: np.ndarray,
126
+ chunk_ids: list[str],
127
+ metadata: list[dict],
128
+ texts: list[str]
129
+ ) -> int:
130
+ """
131
+ Upload embeddings + metadata into Qdrant.
132
+
133
+ Args:
134
+ embeddings: numpy array (N, 768)
135
+ chunk_ids: list of N chunk ID strings
136
+ metadata: list of N metadata dicts
137
+ texts: list of N chunk text strings
138
+
139
+ Returns:
140
+ Number of points successfully indexed
141
+
142
+ QDRANT POINT STRUCTURE:
143
+ Each point needs:
144
+ - id: unique identifier (we use the chunk_id UUID)
145
+ - vector: the embedding as a Python list of floats
146
+ - payload: dict of any metadata we want to store/filter
147
+
148
+ WHY INCLUDE TEXT IN PAYLOAD:
149
+ When we retrieve a point, we need the text to show to the
150
+ user and to send to the LLM. Storing it in the payload
151
+ means ONE database query returns everything we need.
152
+ Alternative would be a separate text lookup - slower and
153
+ more complex.
154
+ """
155
+ assert len(embeddings) == len(chunk_ids) == len(metadata) == len(texts), \
156
+ "All inputs must have the same length"
157
+
158
+ total_indexed = 0
159
+
160
+ # Process in batches to avoid memory spikes
161
+ for batch_start in tqdm(
162
+ range(0, len(embeddings), UPSERT_BATCH_SIZE),
163
+ desc = "Indexing into Qdrant"
164
+ ):
165
+ batch_end = min(batch_start + UPSERT_BATCH_SIZE, len(embeddings))
166
+
167
+ # Build PointStruct objects for this batch
168
+ points = []
169
+ for i in range(batch_start, batch_end):
170
+ # Qdrant requires UUID format for point IDs
171
+ # Our chunk_ids are already UUIDs from Phase 5
172
+ point = PointStruct(
173
+ id = chunk_ids[i],
174
+ vector = embeddings[i].tolist(), # Numpy -> Python List
175
+ payload = {
176
+ # Store ALL metadata in payload for retrieval
177
+ **metadata[i],
178
+ "text": texts[i], # Inlcude chunk text
179
+ }
180
+ )
181
+ points.append(point)
182
+
183
+ # Upsert this batch
184
+ self.client.upsert(
185
+ collection_name = self.collection_name,
186
+ points = points,
187
+ )
188
+ total_indexed += len(points)
189
+
190
+
191
+ logger.info(
192
+ f"Indexing complete. "
193
+ f"Total points in collection: {self.get_collection_size():,}"
194
+ )
195
+ return total_indexed
196
+
197
+
198
+ def search(
199
+ self,
200
+ query_vector: np.ndarray,
201
+ top_k: int = TOP_K_RETRIEVAL,
202
+ filter_category: Optional[str] = None,
203
+ filter_year_gte: Optional[int] = None,
204
+ ) -> list[dict]:
205
+ """
206
+ Search for most similar chunks to a query vector.
207
+
208
+ Args:
209
+ query_vector: 768-dimensional query embedding
210
+ top_k: How many results to return
211
+ filter_category: Only return chunks from this ArXiv category
212
+ filter_year_gte: Only return chunks from this year or later
213
+
214
+ Returns:
215
+ List of result dicts, each containing:
216
+ {
217
+ "chunk_id": str,
218
+ "score": float (cosine similarity, 0-1),
219
+ "text": str,
220
+ "paper_id": str,
221
+ "title": str,
222
+ "authors": list,
223
+ "published_date": str,
224
+ ...all other payload fields
225
+ }
226
+
227
+ FILTERING IN QDRANT:
228
+ Qdrant applies metadata filters DURING vector search,
229
+ not after. This means it only scores vectors that match
230
+ the filter - much faster than post-filtering.
231
+
232
+ Example: filter_year_gte=2024 means:
233
+ "Find the top-20 most similar vectors, but ONLY consider
234
+ vectors from papers published in 2024 or later"
235
+ """
236
+ # Build optional filter
237
+ qdrant_filter = self._build_filter(filter_category, filter_year_gte)
238
+
239
+
240
+ # Execute search
241
+ results = self.client.query_points(
242
+ collection_name = self.collection_name,
243
+ query = query_vector.tolist(),
244
+ limit = top_k,
245
+ query_filter = qdrant_filter,
246
+ with_payload = True, # Return metadata with results
247
+ with_vectors = False # Don't return the vectors (saves bandwidth)
248
+ ).points
249
+
250
+ # Convert Qdrant ScoredPoint objects to plain dicts
251
+ return [
252
+ {
253
+ "chunk_id": str(r.id),
254
+ "score" : round(r.score, 4),
255
+ **r.payload, # Unpack all payload fields (text, title, etc.)
256
+ }
257
+ for r in results
258
+ ]
259
+
260
+
261
+ def _build_filter(
262
+ self,
263
+ category: Optional[str],
264
+ year_gte: Optional[int],
265
+ ) -> Optional[Filter]:
266
+ """
267
+ Build a Qdrant filter from optional parameters.
268
+
269
+ Returns None if no filters specified (search everything).
270
+
271
+ QDRANT FILTER SYNTAX:
272
+ Filter(must=[condition1, condition2])
273
+ means: results must satisfy condition1 AND condition2
274
+
275
+ MatchValue -> exact match (equality check)
276
+ Range -> numeric range (gte, lte, gt, lt)
277
+ """
278
+ conditions = []
279
+
280
+ if category:
281
+ conditions.append(
282
+ FieldCondition(
283
+ key = "primary_category",
284
+ match = MatchValue(value = category)
285
+ )
286
+ )
287
+
288
+ if year_gte:
289
+ # published_date is stored as "YYYY-MM-DD" string
290
+ # We filter by string comparison: "2024-01-01" <= date
291
+ # This works because ISO date strings sort lexicographically
292
+ conditions.append(
293
+ FieldCondition(
294
+ key = "published_date",
295
+ range = Range(gte = f"{year_gte}-01-01")
296
+ )
297
+ )
298
+
299
+ if not conditions:
300
+ return None
301
+
302
+ return Filter(must = conditions)
303
+
304
+
305
+ def get_collection_info(self) -> dict:
306
+ """Return summary information about the collection."""
307
+ if not self.collection_exists():
308
+ return {"status": "collection_not_found"}
309
+
310
+ info = self.client.get_collection(self.collection_name)
311
+
312
+ return {
313
+ "collection_name": self.collection_name,
314
+ "points_count" : info.points_count,
315
+ "status" : str(info.status),
316
+ "vector_size" : info.config.params.vectors.size,
317
+ "distance" : str(info.config.params.vectors.distance),
318
+ }
test_chunk_quality.py CHANGED
@@ -83,7 +83,7 @@ def main():
83
  gates = [
84
  ("Total chunks > 10,000", total_chunks > 10_000),
85
  ("Avg words 100-400", 100 <= avg_words <= 400),
86
- ("Tiny chunks < 10%", tiny_chunks/total_chunks < 0.10),
87
  ("Clean endings > 70%", clean_endings/total_chunks > 0.70),
88
  ]
89
 
 
83
  gates = [
84
  ("Total chunks > 10,000", total_chunks > 10_000),
85
  ("Avg words 100-400", 100 <= avg_words <= 400),
86
+ ("Tiny chunks < 15%", tiny_chunks/total_chunks < 0.15),
87
  ("Clean endings > 70%", clean_endings/total_chunks > 0.70),
88
  ]
89
 
test_embedding.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Verify embedding model works correctly before full pipeline run."""
2
+
3
+ import numpy as np
4
+ from src.utils.logger import setup_logger, get_logger
5
+ from src.embeddings.embedding_model import EmbeddingModel
6
+
7
+
8
+ setup_logger()
9
+ logger = get_logger(__name__)
10
+
11
+
12
+
13
+ def main():
14
+ model = EmbeddingModel()
15
+
16
+
17
+ # Test 1: Document embedding shape
18
+ docs = [
19
+ "The transformer model uses self-attention mechanisms.",
20
+ "UAV delivery systems require multi-agent coordination.",
21
+ "Gradient descent optimizes neural network parameters.",
22
+ ]
23
+
24
+ doc_embeddings = model.embed_documents(docs, show_progress = False)
25
+ assert doc_embeddings.shape == (3, 768), f"Wrong shape: {doc_embeddings.shape}"
26
+ logger.info(f"✅ Document embedding shape: {doc_embeddings.shape}")
27
+
28
+ # Test 2: Query embedding shape
29
+ query_emb = model.embed_query("what is attention mechanism?")
30
+ assert query_emb.shape == (768,), f"Wrong shape: {query_emb.shape}"
31
+ logger.info(f"✅ Query embedding shape: {query_emb.shape}")
32
+
33
+
34
+ # Test 3: Semantic similarity ordering
35
+ # The first two docs are about ML models - should be more similar
36
+ # to each other than to the UAV doc
37
+ sim_01 = float(np.dot(doc_embeddings[0], doc_embeddings[1]))
38
+ sim_02 = float(np.dot(doc_embeddings[0], doc_embeddings[2]))
39
+ sim_12 = float(np.dot(doc_embeddings[1], doc_embeddings[2]))
40
+
41
+
42
+ logger.info(f"Similarity (transformer ↔ gradient descent): {sim_02:.3f}")
43
+ logger.info(f"Similarity (transformer ↔ UAV): {sim_01:.3f}")
44
+ logger.info(f"Similarity (UAV ↔ gradient descent): {sim_12:.3f}")
45
+
46
+
47
+ # Test 4: Query-document similarity direction
48
+ # Query about attention should be closest to doc[0]
49
+ query_emb_2d = query_emb.reshape(1, -1)
50
+ sims = doc_embeddings @ query_emb_2d.T
51
+ best_match = int(np.argmax(sims))
52
+ logger.info(f"✅ Query 'attention mechanism' matched doc[{best_match}]: '{docs[best_match][:50]}'")
53
+ assert best_match == 0, f"Expected doc[0] but got doc[{best_match}]"
54
+
55
+ # Test 5: Verify normalization (all vectors should have magnitude ≈ 1.0)
56
+ norms = np.linalg.norm(doc_embeddings, axis = 1)
57
+ assert np.allclose(norms, 1.0, atol = 1e-5), f"Not normalized: {norms}"
58
+
59
+ logger.info(f"✅ All embeddings L2-normalized (norms: {norms})")
60
+
61
+ logger.info(f"\n✅ All embedding tests passed. Ready for full pipeline.")
62
+
63
+
64
+
65
+ if __name__ == "__main__":
66
+ main()
test_search.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test Qdrant search with real queries.
3
+ This is your first end-to-end retrieval test.
4
+ """
5
+
6
+
7
+ from src.utils.logger import get_logger, setup_logger
8
+ from src.vectorstore.qdrant_store import QdrantStore
9
+ from src.embeddings.embedding_model import EmbeddingModel
10
+
11
+
12
+ setup_logger()
13
+ logger = get_logger(__name__)
14
+
15
+
16
+
17
+ def search_and_display(store: QdrantStore, model: EmbeddingModel, query: str, top_k: int = 3):
18
+ """Run a search query and display results clearly."""
19
+ print(f"\n{'=' * 60}")
20
+ print(f"QUERY: {query}")
21
+ print(f"{'=' * 60}")
22
+
23
+
24
+ # Embed the query (with BGE prefix)
25
+ query_vector = model.embed_query(query)
26
+
27
+ # Search Qdrant
28
+ results = store.search(query_vector, top_k = top_k)
29
+
30
+ if not results:
31
+ print(f"No results found.")
32
+ return
33
+
34
+ for i, r in enumerate(results):
35
+ print(f"\n[{i+1}] Score: {r['score']:.4f}")
36
+ print(f" Paper: {r.get('paper_id', 'N/A')}")
37
+ print(f" Title: {r.get('title', 'N/A')[:65]}...")
38
+ print(f" Date: {r.get('published_date', 'N/A')}")
39
+ print(f" Category: {r.get('primary_category', 'N/A')}")
40
+ print(f" Chunk {r.get('chunk_index','?')}/{r.get('total_chunks','?')}")
41
+ print(f" Text preview: {r.get('text','')[:150].replace(chr(10),' ')}...")
42
+
43
+
44
+
45
+
46
+ def main():
47
+ logger.info("Loading model and connecting to Qdrant...")
48
+
49
+ store = QdrantStore()
50
+ model = EmbeddingModel()
51
+
52
+
53
+ # Verify collection exists
54
+ info = store.get_collection_info()
55
+ logger.info(f"Collection info: {info}")
56
+
57
+
58
+ if info.get("points_count", 0) == 0:
59
+ logger.error("Collection is empty. Run run_indexing.py first.")
60
+ return
61
+
62
+ # --- Test queries covering different retrieval scenarios ---
63
+
64
+ # Test 1: Conceptual Query
65
+ search_and_display(store, model,
66
+ "how does self-attention mechanism work in transformers",
67
+ top_k=3
68
+ )
69
+
70
+ # Test 2: Task-specific query
71
+ search_and_display(store, model,
72
+ "reinforcement learning for multi-agent systems",
73
+ top_k=3
74
+ )
75
+
76
+ # Test 3: Method comparison query
77
+ search_and_display(store, model,
78
+ "comparison of fine-tuning methods for large language models",
79
+ top_k=3
80
+ )
81
+
82
+
83
+ # Test 4: with metadata filter - only cs.LG papers
84
+ print(f"\n{'='*60}")
85
+ print("FILTERED QUERY: 'neural network optimization' (cs.LG only)")
86
+ print(f"{'='*60}")
87
+ query_vector = model.embed_query("neural network optimization methods")
88
+ results = store.search(
89
+ query_vector,
90
+ top_k = 3,
91
+ filter_category = "cs.LG"
92
+ )
93
+ for i, r in enumerate(results):
94
+ print(f"[{i+1}] {r['score']:.4f} | {r.get('primary_category')} | {r.get('title','')[:55]}...")
95
+
96
+ logger.info("\n✅ Search test complete.")
97
+
98
+
99
+
100
+ if __name__ == "__main__":
101
+ main()