Buckets:

MaximoLopezChenlo's picture
download
raw
2.97 kB
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
import json
import os
def vectorize_chunks(input_dir="data/processed/chunks", db_dir="data/chroma_db"):
    """Embed chunked JSON documents into a persistent local ChromaDB collection.

    Reads every ``*.json`` file in *input_dir* (each expected to hold a list of
    chunk dicts with ``header``/``content``/``source``/``page`` keys — TODO
    confirm against the chunking step), prefixes each chunk's content with its
    section header for extra context, and indexes the result into the
    ``clinical_guidelines`` collection using a biomedical embedding model.

    Args:
        input_dir: Directory containing the chunked JSON files.
        db_dir: Directory where the persistent ChromaDB database is stored.
    """
    # Ensure the database directory exists before ChromaDB opens it.
    os.makedirs(db_dir, exist_ok=True)

    print("⏳ Inicializando ChromaDB local...")
    client = chromadb.PersistentClient(path=db_dir)

    # Biomedical sentence-transformer model suited to clinical text.
    model_name = "pritamdeka/S-PubMedBert-MS-MARCO"
    print(f"🧠 Cargando modelo de embeddings: {model_name}...")
    emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)

    # Create (or reopen) the target collection bound to the embedding function.
    collection = client.get_or_create_collection(name="clinical_guidelines", embedding_function=emb_fn)

    if not os.path.exists(input_dir):
        print(f"⚠️ El directorio {input_dir} no existe.")
        return

    json_files = [f for f in os.listdir(input_dir) if f.endswith('.json')]
    if not json_files:
        print(f"⚠️ No se encontraron archivos JSON en {input_dir}.")
        return

    total_chunks = 0
    for file in json_files:
        path = os.path.join(input_dir, file)
        print(f"📄 Indexando {file}...")
        with open(path, 'r', encoding='utf-8') as f:
            try:
                chunks = json.load(f)
            except json.JSONDecodeError:
                # Skip unparsable files but keep indexing the rest.
                print(f"❌ Error al decodificar JSON en {file}")
                continue
        if not chunks:
            continue

        ids = []
        documents = []
        metadatas = []
        for i, chunk in enumerate(chunks):
            # Deterministic per-file ID so the same chunk maps to the same
            # record across runs.
            chunk_id = f"{file.replace('.json', '')}_chunk_{i}"
            ids.append(chunk_id)
            # Prepend the section header so the embedding carries context.
            header = chunk.get("header", "Unknown Header")
            content = chunk.get("content", "")
            doc_text = f"Section: {header}\n\n{content}"
            documents.append(doc_text)
            metadatas.append({
                "source": chunk.get("source", "Unknown"),
                "page": chunk.get("page", -1),
                "header": header
            })

        try:
            # collection.add natively handles batching if needed, so a single
            # call per file is fine at these sizes.
            collection.add(
                documents=documents,
                metadatas=metadatas,
                ids=ids
            )
            total_chunks += len(ids)
        except Exception as e:
            # Best-effort boundary: report the failure and continue with the
            # remaining files rather than aborting the whole run.
            print(f"❌ Error al añadir {file}: {e}")

    print(f"✅ Vectorización completada. {total_chunks} chunks indexados en ChromaDB.")
    print(f"📂 Base de datos guardada en: {db_dir}")
# Script entry point: run the vectorization pipeline with the default paths.
if __name__ == "__main__":
    vectorize_chunks()

Xet Storage Details

Size:
2.97 kB
·
Xet hash:
d0b00eb38411acbd1d14e5f553cf6dc830844b742ab6d9b09fe8aa976ddef823

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.