HebArabNlpProject
/

Semantic-Retrieval-2nd-place

Sentence Similarity

sentence-transformers

semantic-retrieval

information-retrieval

dense-retrieval

Model card Files Files and versions

Semantic-Retrieval-2nd-place / text_utils.py

yarden077's picture

uploading 2nd place model

0f5ecaf verified 11 days ago

history blame contribute delete

2.01 kB

	"""
	text_utils.py
	Single-source Hebrew normalization & tokenization.
	Controls behavior across all scripts.
	"""
	import re
	import unicodedata
	from typing import List

	# Single-letter Hebrew prefixes; tok_he emits an extra prefix-stripped
	# variant for tokens longer than 3 characters that start with one of these.
	HEB_PREFIXES = tuple("והבלכמש")

	# Tokens that tok_he drops from its output.
	# NOTE(review): the last entry contains an apostrophe, which the tok_he
	# token regex never matches, so it looks unreachable - confirm.
	STOPWORDS = set(
	    (
	        "אבל אם או אז אתה את אתם אתן אצל על עד עם אנחנו אני הוא היא הם הן אשר של "
	        "ולא לא כן כבר כאשר לכן לפני לאחר כדי עוד רק "
	        "אל זה זו אך כי גם כל כך בלי לפי וכן וכו וכ'"
	    ).split()
	)


	# --- Core Functions ---

	def identity(s: str) -> str:
	    """Return ``s`` unchanged.

	    Serves as the no-op normalizer for components whose input should be
	    passed through as-is.
	    """
	    return s

	def norm_he(s: str) -> str:
	    """Normalize Hebrew text.

	    Steps, in order:
	      1. Unicode NFKC normalization.
	      2. Removal of code points U+0591-U+05BD and U+05BF-U+05C7 (the
	         "strip nikkud" range; U+05BE, the maqaf, is deliberately skipped).
	      3. Mapping of Hebrew gershayim/geresh and curly double quotes to ASCII
	         quotes, and of en/em dashes to ``-``.
	      4. Collapsing every whitespace run to one space and trimming the ends.

	    Args:
	        s: Text to normalize. Any falsy value (``""``, ``None``) yields ``""``.

	    Returns:
	        The normalized string.

	    NOTE(review): the maqaf (U+05BE) is neither removed nor mapped to ``-``,
	    so maqaf-joined words stay joined - confirm this is intended.
	    """
	    if not s:
	        return ""

	    # One-pass equivalent of the per-character quote/dash replacements.
	    punct_table = str.maketrans({
	        "״": '"',
	        "׳": "'",
	        "”": '"',
	        "“": '"',
	        "–": "-",
	        "—": "-",
	    })

	    text = unicodedata.normalize("NFKC", s)
	    text = re.sub(r"[\u0591-\u05BD\u05BF-\u05C7]", "", text)  # strip nikkud
	    text = text.translate(punct_table)
	    collapsed = re.sub(r"\s+", " ", text)
	    return collapsed.strip()

	def tok_he(text: str) -> List[str]:
	    """Tokenize text for BM25.

	    The input is first passed through ``norm_bm25``; tokens are then the
	    maximal runs of ASCII letters, digits and code points U+0590-U+05FF.
	    For a token longer than 3 characters whose first character is in
	    ``HEB_PREFIXES``, the prefix-stripped form is emitted immediately before
	    the full token. Anything found in ``STOPWORDS`` is dropped.

	    Args:
	        text: Raw input text.

	    Returns:
	        Tokens in document order, with stopwords removed.
	    """
	    normalized = norm_bm25(text)  # BM25-specific normalizer
	    kept: List[str] = []
	    for word in re.findall(r"[A-Za-z0-9\u0590-\u05FF]+", normalized):
	        if len(word) > 3 and word[0] in HEB_PREFIXES:
	            variants = (word[1:], word)  # stripped form first, then full
	        else:
	            variants = (word,)
	        kept.extend(v for v in variants if v not in STOPWORDS)
	    return kept


	# --- Component-Specific Assignments ---

	# BM25 is currently the only component that gets real normalization.
	norm_bm25 = norm_he

	# E5, Gemma and BGE inputs (queries and passages alike) are currently
	# passed through unchanged.
	norm_e5_query = norm_e5_passage = identity
	norm_gemma_query = norm_gemma_passage = identity
	norm_bge_query = norm_bge_passage = identity

	# --- General Aliases ---
	normalize = norm_he  # general-purpose normalize is the Hebrew normalizer
	tokenize = tok_he