File size: 3,219 Bytes
385bc37
 
 
 
75361de
385bc37
 
 
 
 
75361de
385bc37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15dacd4
4cba650
 
75361de
4cba650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75361de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from pathlib import Path
from typing import Any, Dict, List

import faiss
import numpy as np

from constants.constants import FAISS_INDEX, PRODUCTS_DF
from src.fireworks.inference import expand_query, get_embedding, rerank_results

_FILE_PATH = Path(__file__).parents[2]


def search_vector(query: str, top_k: int = 5) -> List[Dict[str, any]]:
    """
    Search products using vector embeddings and FAISS for semantic search.

    This is Stage 2: semantic search using vector embeddings to understand
    query meaning and intent beyond exact keyword matching.

    Args:
        query: Search query string
        top_k: Number of top results to return (default: 10)

    Returns:
        List of dictionaries containing product information and scores
    """
    query_embedding = get_embedding(query)
    query_vector = np.array([query_embedding], dtype=np.float32)

    faiss.normalize_L2(query_vector)
    faiss_index = FAISS_INDEX[0]
    distances, indices = faiss_index.search(query_vector, top_k)

    # Convert L2 distances to similarity scores (0-1 range)
    # After normalization, L2 distance = 2 * (1 - cosine_similarity)
    # So cosine_similarity = 1 - (L2_distance / 2)
    similarity_scores = 1 - (distances[0] / 2)

    return [
        {
            "product_name": PRODUCTS_DF.iloc[idx]["Product Name"],
            "description": PRODUCTS_DF.iloc[idx]["Description"],
            "main_category": PRODUCTS_DF.iloc[idx]["MainCategory"],
            "secondary_category": PRODUCTS_DF.iloc[idx]["SecondaryCategory"],
            "score": float(score),
        }
        for idx, score in zip(indices[0], similarity_scores)
    ]


def search_vector_with_expansion(query: str, top_k: int = 5) -> List[Dict[str, any]]:
    """
    Search products using vector embeddings and FAISS for semantic search with query expansion.

    This is Stage 3: semantic search using vector embeddings to understand
    query meaning and intent beyond exact keyword matching, with query expansion.

    Args:
        query: Search query string
        top_k: Number of top results to return (default: 10)

    Returns:
        List of dictionaries containing product information and scores
    """
    expanded_query = expand_query(query)
    print(f"Original: {query}")
    print(f"Expanded: {expanded_query}")
    return search_vector(expanded_query, top_k)


def search_vector_with_reranking(query: str, top_k: int = 5) -> List[Dict[str, any]]:
    """
    Search products using vector embeddings and FAISS for semantic search with reranking.

    This is Stage 4: semantic search using vector embeddings to understand
    query meaning and intent beyond exact keyword matching, with reranking.

    Args:
        query: Search query string
        top_k: Number of top results to return (default: 10)

    Returns:
        List of dictionaries containing product information with preserved cosine scores
    """
    results = search_vector_with_expansion(query, top_k)
    cosine_scores = {r["product_name"]: r["score"] for r in results}
    reranked_results = rerank_results(query=query, results=results)

    for r in reranked_results:
        r["score"] = cosine_scores[r["product_name"]]

    return reranked_results