|
|
import os
import shutil
from typing import Optional

from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
|
|
|
|
|
|
|
|
CHROMA_PATH = 'chroma' |
|
|
|
|
|
def save_to_chroma_db(chunks: list[Document], embedding_model) -> Optional[Chroma]:
    """Persist document chunks into a local ChromaDB store, inserting in batches.

    Any existing database at ``CHROMA_PATH`` is removed first, so the store is
    always rebuilt from scratch.

    Args:
        chunks: Documents to embed and persist.
        embedding_model: Embedding model passed through to Chroma.

    Returns:
        The populated ``Chroma`` instance, or ``None`` when ``chunks`` is empty
        or the database could not be created.
    """
    print(f"Usando modo local de ChromaDB en {CHROMA_PATH}")

    # Guard: Chroma.from_documents cannot build a collection from nothing.
    if not chunks:
        print("No hay chunks para guardar")
        return None

    # Rebuild from scratch: drop any previous local database.
    if os.path.exists(CHROMA_PATH):
        try:
            shutil.rmtree(CHROMA_PATH)
            print(f"Base de datos local existente eliminada: {CHROMA_PATH}")
        except Exception as e:
            # Best-effort: keep going and let Chroma reuse/overwrite the dir.
            print(f"Error eliminando base de datos local: {e}")

    try:
        batch_size = 1000
        total_chunks = len(chunks)
        # Ceiling division, hoisted out of the loop (was recomputed per batch).
        total_batches = (total_chunks + batch_size - 1) // batch_size

        print(f"Procesando {total_chunks} chunks en lotes de {batch_size}...")

        # The first batch creates the persisted collection...
        first_batch = chunks[:batch_size]
        print(f"Procesando primer lote: {len(first_batch)} chunks...")
        db = Chroma.from_documents(
            first_batch,
            persist_directory=CHROMA_PATH,
            embedding=embedding_model,
        )
        print(f"Primer lote completado. Guardado en {CHROMA_PATH}")

        # ...and the remaining batches are appended to it. Failures in one
        # batch are logged and skipped so the rest still gets stored.
        failed_chunks = 0
        for start in range(batch_size, total_chunks, batch_size):
            batch = chunks[start:start + batch_size]
            batch_num = (start // batch_size) + 1
            print(f"Procesando lote {batch_num}/{total_batches}: {len(batch)} chunks...")
            try:
                db.add_documents(batch)
                print(f"Lote {batch_num}/{total_batches} completado")
            except Exception as e:
                failed_chunks += len(batch)
                print(f"Error procesando lote {batch_num}: {e}")
                print("Continuando con siguiente lote...")

        # Report accurately: don't claim full success when batches failed.
        if failed_chunks:
            saved = total_chunks - failed_chunks
            print(f"Procesamiento completado con errores: {saved}/{total_chunks} chunks guardados")
        else:
            print(f"Procesamiento completado: {total_chunks} chunks guardados exitosamente")

        return db

    except Exception as e:
        # Mojibake fixed in these messages ("cr铆tico"/"est茅" -> "crítico"/"esté").
        print(f"Error crítico creando base de datos: {e}")
        print("Verifica que Ollama esté funcionando y el modelo nomic-embed-text esté disponible")
        return None
|
|
|
|
|
def get_chroma_client() -> Optional[Chroma]:
    """Open the existing local ChromaDB store for querying.

    Returns:
        A ``Chroma`` instance connected to ``CHROMA_PATH``, or ``None`` when
        the database does not exist or the connection fails.
    """
    try:
        # Guard clause: nothing to connect to without a persisted database.
        if not os.path.exists(CHROMA_PATH):
            print("Base de datos local no encontrada")
            return None

        # Local import keeps langchain_ollama optional for callers that
        # never query the store.
        from langchain_ollama import OllamaEmbeddings

        embedding_model = OllamaEmbeddings(model="nomic-embed-text")
        db = Chroma(
            persist_directory=CHROMA_PATH,
            embedding_function=embedding_model,
        )
        print(f"Conectado a ChromaDB local en {CHROMA_PATH}")
        return db

    except Exception as e:
        print(f"Error conectando a ChromaDB local: {e}")
        return None