File size: 2,800 Bytes
daafb32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Test Qdrant search with real queries.
This is your first end-to-end retrieval test.
"""


from src.utils.logger import get_logger, setup_logger
from src.vectorstore.qdrant_store import QdrantStore
from src.embeddings.embedding_model import EmbeddingModel


# Logging is initialised at import time, then a logger named after this module is fetched.
# NOTE(review): setup_logger() presumably installs the project's handlers/format —
# confirm in src.utils.logger.
setup_logger()
logger = get_logger(__name__)



def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, adding "..." only when it was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


def search_and_display(store: QdrantStore, model: EmbeddingModel, query: str, top_k: int = 3):
    """Embed *query*, search the vector store, and print the hits to stdout.

    Args:
        store: Vector store. ``store.search(query_vector, top_k=...)`` must
            return an iterable of dict-like hits (or something falsy when
            nothing matched).
        model: Embedding model. ``model.embed_query(query)`` supplies the
            query vector passed to the store.
        query: Natural-language query text.
        top_k: Maximum number of hits requested from the store.

    Returns:
        None. Results are only printed.

    Each hit must carry a numeric ``score``. Every other field is optional and
    falls back to a placeholder when missing. ``title`` and ``text`` may also
    be ``None`` without raising.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"QUERY: {query}")
    print(banner)

    # NOTE(review): embed_query is expected to add the BGE query prefix —
    # confirm in EmbeddingModel.
    query_vector = model.embed_query(query)

    # Search Qdrant
    results = store.search(query_vector, top_k=top_k)

    if not results:
        print("No results found.")
        return

    for rank, hit in enumerate(results, start=1):
        # `dict.get(key, default)` only covers a *missing* key; `or` also
        # covers a key stored with a None value, which would break slicing.
        title = hit.get("title") or "N/A"
        # Flatten newlines so the preview stays on a single console line.
        preview = (hit.get("text") or "").replace("\n", " ")

        print(f"\n[{rank}] Score: {hit['score']:.4f}")
        print(f"     Paper: {hit.get('paper_id', 'N/A')}")
        print(f"     Title: {_truncate(title, 65)}")
        print(f"     Date:  {hit.get('published_date', 'N/A')}")
        print(f"     Category: {hit.get('primary_category', 'N/A')}")
        print(f"     Chunk {hit.get('chunk_index', '?')}/{hit.get('total_chunks', '?')}")
        print(f"     Text preview: {_truncate(preview, 150)}")




def main():
    """Run a fixed set of retrieval smoke tests against the Qdrant collection.

    Steps:
        1. Build a ``QdrantStore`` and an ``EmbeddingModel``.
        2. Log the collection info and stop with an error log when
           ``points_count`` is 0 or absent.
        3. Run three unfiltered queries through ``search_and_display``.
        4. Run one query restricted to the ``cs.LG`` category and print a
           compact one-line summary per hit.

    Returns:
        None. All output goes to stdout and the module logger.
    """
    logger.info("Loading model and connecting to Qdrant...")

    store = QdrantStore()
    model = EmbeddingModel()

    # Verify the collection exists and has been populated.
    info = store.get_collection_info()
    logger.info(f"Collection info: {info}")

    if info.get("points_count", 0) == 0:
        logger.error("Collection is empty. Run run_indexing.py first.")
        return

    # --- Test queries covering different retrieval scenarios ---
    unfiltered_queries = (
        # Test 1: conceptual query
        "how does self-attention mechanism work in transformers",
        # Test 2: task-specific query
        "reinforcement learning for multi-agent systems",
        # Test 3: method comparison query
        "comparison of fine-tuning methods for large language models",
    )
    for query in unfiltered_queries:
        search_and_display(store, model, query, top_k=3)

    # Test 4: with metadata filter - only cs.LG papers.
    # The banner is built from the same variables that drive the search so the
    # displayed query can never drift from the one actually embedded.
    filtered_query = "neural network optimization methods"
    category = "cs.LG"
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"FILTERED QUERY: '{filtered_query}' ({category} only)")
    print(banner)

    query_vector = model.embed_query(filtered_query)
    results = store.search(
        query_vector,
        top_k=3,
        filter_category=category,
    )

    if not results:
        print("No results found.")

    for rank, hit in enumerate(results or [], start=1):
        # `or ""` guards against a title stored as None, which cannot be sliced.
        title = hit.get("title") or ""
        shown = title if len(title) <= 55 else f"{title[:55]}..."
        print(f"[{rank}] {hit['score']:.4f} | {hit.get('primary_category')} | {shown}")

    logger.info("\n✅ Search test complete.")



# Script entry point: run the search tests only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()