from langchain_community.document_loaders import PyPDFDirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os
from typing import List

# Folders that chunk_all_documents() scans for PDF, .txt and .md files.
# Both are relative paths, so they are resolved against the current working
# directory of the running process.
DOCUMENTS_PATH = 'documents'
APORTACIONES_PATH = 'aportaciones'

def _load_directory(directory: str, label: str) -> List[Document]:
    """Load the PDF, ``.txt`` and ``.md`` files found in *directory*.

    Args:
        directory: Path of the folder to scan.
        label: Folder name shown in the progress messages.

    Returns:
        The loaded documents: those returned by the PDF directory loader
        first, then the text/Markdown files in sorted filename order. An
        empty list when *directory* is not an existing directory.
    """
    # isdir (rather than exists) so a regular file that happens to have the
    # folder's name is skipped instead of making os.listdir raise.
    if not os.path.isdir(directory):
        return []

    # List the folder once and sort it so the load order does not depend on
    # the order in which the filesystem returns entries.
    entries = sorted(os.listdir(directory))
    loaded: List[Document] = []

    # Only run the PDF loader when at least one PDF sits in the folder.
    if any(name.endswith('.pdf') for name in entries):
        pdf_documents = PyPDFDirectoryLoader(directory).load()
        loaded.extend(pdf_documents)
        print(f"  ✅ Se cargaron {len(pdf_documents)} documentos PDF de '{label}'")

    # Text and Markdown files are loaded one by one. Sub-directories whose
    # name ends in .txt/.md are skipped because TextLoader cannot read them.
    candidates = (
        os.path.join(directory, name)
        for name in entries
        if name.endswith(('.txt', '.md'))
    )
    text_files = [path for path in candidates if os.path.isfile(path)]
    for text_file in text_files:
        loaded.extend(TextLoader(text_file, encoding='utf-8').load())

    print(f"  ✅ Se cargaron {len(text_files)} archivos de texto/markdown de '{label}'")
    return loaded


def chunk_all_documents() -> List[Document]:
    """Load every supported file from both source folders and split it into chunks.

    Scans ``DOCUMENTS_PATH`` and ``APORTACIONES_PATH`` for PDF, ``.txt`` and
    ``.md`` files, loads them, and splits the resulting documents into
    overlapping chunks intended for embedding. Progress is reported with
    ``print``. A folder that does not exist is skipped.

    Returns:
        The chunks produced by the text splitter for all loaded documents.
    """
    # (path, label used in messages, banner printed before scanning)
    sources = (
        (DOCUMENTS_PATH, 'documents',
         "📁 Procesando documentos de la carpeta 'documents'..."),
        (APORTACIONES_PATH, 'aportaciones',
         "🚀 Procesando documentos de la carpeta 'aportaciones'..."),
    )

    all_documents: List[Document] = []
    for directory, label, banner in sources:
        # The banner is printed even when the folder is missing.
        print(banner)
        all_documents.extend(_load_directory(directory, label))

    print(f"📊 Total de documentos cargados: {len(all_documents)}")

    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,  # Size of each chunk in characters
        chunk_overlap=100,  # Overlap between chunks in characters
        length_function=len,  # Function to calculate the length of the text
        add_start_index=True,  # Add start index to the chunks
    )

    # Split the documents into chunks
    chunks = text_splitter.split_documents(all_documents)

    print(f"Se crearon {len(chunks)} chunks de texto")
    return chunks

# Kept so that existing callers of the previous entry point keep working.
def chunk_pdfs() -> List[Document]:
    """Backward-compatible alias for ``chunk_all_documents``.

    Despite its name, this function is not limited to PDFs: it delegates
    directly to ``chunk_all_documents`` and returns that call's result
    unchanged.
    """
    chunks = chunk_all_documents()
    return chunks