Sentence Similarity
sentence-transformers
Safetensors
Hebrew
hebrew
semantic-retrieval
information-retrieval
dense-retrieval
reranking
rrf
competition
Instructions to use HebArabNlpProject/Semantic-Retrieval-2nd-place with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use HebArabNlpProject/Semantic-Retrieval-2nd-place with sentence-transformers:
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("HebArabNlpProject/Semantic-Retrieval-2nd-place")
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]
embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)  # [3, 3]
```
- Notebooks
- Google Colab
- Kaggle
| """ | |
| text_utils.py | |
| Single-source Hebrew normalization & tokenization. | |
| Controls behavior across all scripts. | |
| """ | |
| import re | |
| import unicodedata | |
| from typing import List | |
# One-letter Hebrew prefixes that tok_he may peel off the front of a token.
# Spelled as \u escapes so the letters survive any re-encoding of this file:
# vav, he, bet, lamed, kaf, mem, shin.
# NOTE(review): the literal was mis-encoded in the copy this was restored from;
# only the final entry (shin) was unambiguous. The other six are the remaining
# standard one-letter prefixes -- confirm against a correctly encoded source.
HEB_PREFIXES = (
    "\u05d5",  # vav
    "\u05d4",  # he
    "\u05d1",  # bet
    "\u05dc",  # lamed
    "\u05db",  # kaf
    "\u05de",  # mem
    "\u05e9",  # shin
)
| # Stopword set: the triple-quoted literal below is split on whitespace and the | |
| # resulting words are stored in a set; tok_he drops any token found in it. | |
| # NOTE(review): the words in this literal appear mis-encoded (mojibake) in this | |
| # copy of the file -- restore them from a correctly encoded source; TODO confirm. | |
| STOPWORDS = set(""" | |
| ืืื ืื ืื ืื ืืชื ืืช ืืชื ืืชื ืืฆื ืขื ืขื ืขื ืื ืื ื ืื ื ืืื ืืื ืื ืื ืืฉืจ ืฉื | |
| ืืื ืื ืื ืืืจ ืืืฉืจ ืืื ืืคื ื ืืืืจ ืืื ืขืื ืจืง | |
| ืื ืื ืื ืื ืื ืื ืื ืื ืืื ืืคื ืืื ืืื ืื' | |
| """.split()) | |
| # --- Core Function --- | |
def identity(s: str) -> str:
    """Return *s* unchanged.

    Pass-through normalizer: it gives components that need no text
    normalization the same call signature as the real normalizers.
    """
    return s
# Hebrew cantillation marks and vowel points (nikkud) in U+0591-U+05C7.
# The range skips U+05BE (maqaf), so that character is kept.
_HE_MARKS_RE = re.compile(r"[\u0591-\u05BD\u05BF-\u05C7]")
_WHITESPACE_RE = re.compile(r"\s+")

# Punctuation folded to ASCII. Written as \u escapes so the mapping cannot be
# corrupted by re-encoding the file.
# NOTE(review): gershayim and geresh are certain; the two curly quotes and the
# two dashes were reconstructed from a mis-encoded copy -- confirm the exact
# code points against a correctly encoded source.
_PUNCT_TABLE = str.maketrans({
    "\u05f4": '"',  # Hebrew gershayim
    "\u05f3": "'",  # Hebrew geresh
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    "\u2013": "-",  # en dash
    "\u2014": "-",  # em dash
})


def norm_he(s: str) -> str:
    """Normalize Hebrew text.

    Steps, in order:
      1. Unicode NFKC normalization.
      2. Removal of cantillation marks and vowel points (nikkud).
      3. Folding of Hebrew geresh/gershayim, curly double quotes and
         en/em dashes to their ASCII equivalents.
      4. Collapsing every whitespace run to a single space and trimming
         both ends.

    Args:
        s: Text to normalize. Any falsy value (empty string, ``None``)
            yields ``""``.

    Returns:
        The normalized string.
    """
    if not s:
        return ""
    s = unicodedata.normalize("NFKC", s)
    s = _HE_MARKS_RE.sub("", s)
    # Every mapping is single character -> single character and no output is
    # itself a key, so one translate pass equals the chained replacements.
    s = s.translate(_PUNCT_TABLE)
    return _WHITESPACE_RE.sub(" ", s).strip()
def tok_he(text: str) -> List[str]:
    """Tokenize *text* for lexical (BM25) matching.

    The text is first normalized with ``norm_bm25``. Tokens are maximal runs
    of ASCII letters, ASCII digits, or characters in the Hebrew block
    (U+0590-U+05FF). A token longer than three characters whose first
    character is in ``HEB_PREFIXES`` contributes two entries: the token with
    that first character removed, followed by the token itself. Finally,
    every entry found in ``STOPWORDS`` is dropped.

    Args:
        text: Raw input text.

    Returns:
        Tokens in order of appearance, with prefix-stripped variants placed
        immediately before the token they were derived from.
    """
    normalized = norm_bm25(text)  # the BM25-specific normalizer
    expanded: List[str] = []
    for word in re.findall(r"[A-Za-z0-9\u0590-\u05FF]+", normalized):
        if len(word) > 3 and word[0] in HEB_PREFIXES:
            # Keep both forms so a match works with or without the prefix.
            expanded.append(word[1:])
        expanded.append(word)
    return [word for word in expanded if word not in STOPWORDS]
# --- Component-Specific Assignments ---
# Each component gets its own normalizer name, bound here to one of the
# functions defined above.

# For now, only BM25 gets real normalization.
norm_bm25 = norm_he

# For now, E5, Gemma and BGE inputs are passed through unchanged.
norm_e5_query = identity
norm_e5_passage = identity
norm_gemma_query = identity
norm_gemma_passage = identity
norm_bge_query = identity
norm_bge_passage = identity

# --- General Aliases ---
tokenize = tok_he
normalize = norm_he  # the general-purpose name points at the full normalizer