| from typing import Dict, List, Optional, Union |
|
|
| import numpy as np |
| import requests |
| from mteb import DRESModel |
| from tqdm import tqdm |
|
|
|
|
class SionicEmbeddingModel(DRESModel):
    """Retrieval model wrapper that fetches embeddings from an HTTP endpoint.

    Texts are sent in batches as a JSON body ``{'inputs': [...]}`` to ``url``.
    The ``'embedding'`` field of the JSON response is converted to a
    ``float32`` array with one ``dimension``-sized row per input text.
    """

    def __init__(
        self,
        url: str,
        instruction: Optional[str] = None,
        batch_size: int = 128,
        dimension: int = 2048,
        **kwargs,
    ) -> None:
        """Store the endpoint configuration.

        Args:
            url: Address the embedding requests are POSTed to.
            instruction: Optional text prepended to every query in
                ``encode_queries``. ``None`` means no prefix.
            batch_size: Number of texts sent per request.
            dimension: Length of each embedding vector returned by the
                endpoint; used to reshape the response.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        self.url = url
        self.instruction = instruction
        self.batch_size = batch_size
        self.dimension = dimension

    def get_embeddings(self, queries: List[str]) -> np.ndarray:
        """Embed one batch of texts with a single POST request.

        Args:
            queries: Texts to embed in this request.

        Returns:
            ``float32`` array of shape ``(len(queries), self.dimension)``.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        response = requests.post(self.url, json={'inputs': queries})
        # Surface HTTP failures directly instead of a confusing KeyError or
        # JSON decoding error when the error body lacks an 'embedding' field.
        response.raise_for_status()
        return np.asarray(
            response.json()['embedding'],
            dtype=np.float32,
        ).reshape(len(queries), self.dimension)

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        """Embed queries, prefixing each with the configured instruction.

        Args:
            queries: Query strings to embed.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            ``float32`` array with one row per query.
        """
        # With the default instruction=None an f-string would otherwise
        # prepend the literal text "None" to every query.
        prefix = self.instruction or ''
        return self.encode([f'{prefix}{query}' for query in queries])

    def encode_corpus(self, corpus: List[Union[Dict[str, str], str]], **kwargs) -> np.ndarray:
        """Embed corpus documents.

        Dict documents are flattened to ``"<title> <text>"`` (the title is
        optional) and stripped of surrounding whitespace; plain strings are
        used unchanged. The first element decides which form is assumed for
        the whole corpus.

        Args:
            corpus: Documents, either dicts with a ``'text'`` key and an
                optional ``'title'`` key, or plain strings.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            ``float32`` array with one row per document.
        """
        # Guard the corpus[0] probe below against an empty corpus.
        if not corpus:
            return self.encode([])

        sentences: List[str] = (
            [f"{doc.get('title', '')} {doc['text']}".strip() for doc in corpus]
            if isinstance(corpus[0], dict)
            else corpus
        )
        return self.encode(sentences)

    def encode(self, sentences: List[str], **kwargs) -> np.ndarray:
        """Embed texts in batches of ``self.batch_size``.

        Args:
            sentences: Texts to embed.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            ``float32`` array of shape ``(len(sentences), self.dimension)``;
            an empty ``(0, self.dimension)`` array when ``sentences`` is empty.
        """
        # np.concatenate raises ValueError on an empty list of arrays, so
        # return a correctly shaped empty result without any request.
        if not sentences:
            return np.empty((0, self.dimension), dtype=np.float32)

        return np.concatenate(
            [
                self.get_embeddings(sentences[idx:idx + self.batch_size])
                for idx in tqdm(range(0, len(sentences), self.batch_size), desc='encode')
            ],
            axis=0,
        )
|
|
|
|