|
|
from langchain_community.document_loaders import PyPDFDirectoryLoader, TextLoader |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from langchain_core.documents import Document |
|
|
import os |
|
|
from typing import List |
|
|
|
|
|
|
|
|
# Folder holding the base knowledge-base files (PDF / .txt / .md) to be chunked.
DOCUMENTS_PATH = 'documents'


# Folder holding contributed files ("aportaciones" = contributions); processed the same way.
APORTACIONES_PATH = 'aportaciones'
|
|
|
|
|
def _load_documents_from(path: str) -> List[Document]:
    """Load every supported file (PDF, .txt, .md) from *path*.

    Returns an empty list when the directory does not exist. Prints a
    per-folder summary of how many PDF and text/markdown files were loaded,
    mirroring the console output of the original inline implementation.
    """
    documents: List[Document] = []
    if not os.path.exists(path):
        # Missing folder is not an error: simply nothing to load.
        return documents

    # List the directory once and reuse the result for both file-type scans.
    entries = os.listdir(path)

    # PDFs are loaded in bulk by the directory loader (one Document per page).
    if any(entry.endswith('.pdf') for entry in entries):
        pdf_documents = PyPDFDirectoryLoader(path).load()
        documents.extend(pdf_documents)
        print(f" ✅ Se cargaron {len(pdf_documents)} documentos PDF de '{path}'")

    # Plain-text and Markdown files are loaded individually as UTF-8.
    text_files = [
        os.path.join(path, entry)
        for entry in entries
        if entry.endswith(('.txt', '.md'))
    ]
    for text_file in text_files:
        documents.extend(TextLoader(text_file, encoding='utf-8').load())

    print(f" ✅ Se cargaron {len(text_files)} archivos de texto/markdown de '{path}'")
    return documents


def chunk_all_documents() -> List[Document]:
    """Load and chunk every document from the two content folders.

    Processes all files in the ``documents`` and ``aportaciones`` folders
    (PDFs plus .txt/.md text files) and splits them into overlapping chunks
    suitable for embedding generation.

    Returns:
        List[Document]: the chunked documents, each carrying a start index
        in its metadata (``add_start_index=True``).
    """
    all_documents: List[Document] = []

    print("📁 Procesando documentos de la carpeta 'documents'...")
    all_documents.extend(_load_documents_from(DOCUMENTS_PATH))

    print("🚀 Procesando documentos de la carpeta 'aportaciones'...")
    all_documents.extend(_load_documents_from(APORTACIONES_PATH))

    print(f"📊 Total de documentos cargados: {len(all_documents)}")

    # 800-char chunks with 100-char overlap keep context continuity between
    # neighbouring chunks; start indices allow locating a chunk in its source.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(all_documents)

    print(f"Se crearon {len(chunks)} chunks de texto")
    return chunks
|
|
|
|
|
|
|
|
def chunk_pdfs() -> List[Document]:
    """Legacy entry point kept for backward compatibility.

    Historically this processed PDFs only; it now delegates to
    chunk_all_documents, which also handles .txt/.md files.
    """
    chunks = chunk_all_documents()
    return chunks