Spaces:

bgonpin
/

rag

Runtime error

App Files Files Community

rag / src /file_processor.py

bgonpin

Upload folder using huggingface_hub

3949424 verified 2 months ago

raw

history blame contribute delete

3.5 kB

	from langchain_community.document_loaders import PyPDFDirectoryLoader, TextLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_core.documents import Document
	import os
	from typing import List

	# Directories scanned by chunk_all_documents() for PDF, .txt and .md files.
	# Both are relative paths, so they are resolved against the process's
	# current working directory.
	DOCUMENTS_PATH = 'documents'
	APORTACIONES_PATH = 'aportaciones'

	def _load_directory(path: str, label: str) -> List[Document]:
	    """Load the PDF, ``.txt`` and ``.md`` files found in *path*.

	    Args:
	        path: Directory to scan. Text/Markdown files are taken only from
	            its top level; PDFs are delegated to ``PyPDFDirectoryLoader``.
	        label: Folder name shown in the progress messages.

	    Returns:
	        The loaded documents (PDF documents first, then text/Markdown
	        files in sorted file-name order). An empty list when *path* is
	        not an existing directory.
	    """
	    # isdir (rather than exists) so that a regular file with this name is
	    # skipped instead of making os.listdir raise NotADirectoryError.
	    if not os.path.isdir(path):
	        return []

	    # List the directory once and sort it: os.listdir returns entries in
	    # arbitrary order, which would make the order of the text documents
	    # (and therefore of the resulting chunks) vary between runs.
	    entries = sorted(os.listdir(path))
	    loaded: List[Document] = []

	    # Only instantiate the PDF loader when there is at least one PDF.
	    if any(name.endswith('.pdf') for name in entries):
	        pdf_documents = PyPDFDirectoryLoader(path).load()
	        loaded.extend(pdf_documents)
	        print(f" ✅ Se cargaron {len(pdf_documents)} documentos PDF de '{label}'")

	    # Plain-text and Markdown files; the isfile check keeps a
	    # sub-directory whose name ends in .txt/.md from reaching TextLoader.
	    text_files = [
	        os.path.join(path, name)
	        for name in entries
	        if name.endswith(('.txt', '.md'))
	        and os.path.isfile(os.path.join(path, name))
	    ]
	    for text_file in text_files:
	        loaded.extend(TextLoader(text_file, encoding='utf-8').load())

	    print(f" ✅ Se cargaron {len(text_files)} archivos de texto/markdown de '{label}'")
	    return loaded


	def chunk_all_documents(*, chunk_size: int = 800, chunk_overlap: int = 100) -> List[Document]:
	    """Load every supported file from both source folders and split it into chunks.

	    Files are read from ``DOCUMENTS_PATH`` and ``APORTACIONES_PATH`` (PDFs
	    plus ``.txt``/``.md`` files) and the resulting documents are split with
	    a ``RecursiveCharacterTextSplitter`` so they can be embedded.

	    Args:
	        chunk_size: Maximum size of each chunk, in characters.
	        chunk_overlap: Number of characters shared by consecutive chunks.

	    Returns:
	        The list of chunked documents. A folder that does not exist is
	        skipped, so the list is empty when neither folder yields documents.
	    """
	    all_documents: List[Document] = []

	    # Documents from the 'documents' folder
	    print("📁 Procesando documentos de la carpeta 'documents'...")
	    all_documents.extend(_load_directory(DOCUMENTS_PATH, 'documents'))

	    # Documents from the 'aportaciones' folder
	    print("🚀 Procesando documentos de la carpeta 'aportaciones'...")
	    all_documents.extend(_load_directory(APORTACIONES_PATH, 'aportaciones'))

	    print(f"📊 Total de documentos cargados: {len(all_documents)}")

	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,        # size of each chunk in characters
	        chunk_overlap=chunk_overlap,  # overlap between chunks in characters
	        length_function=len,          # measure length in characters
	        add_start_index=True,         # record each chunk's start offset in its metadata
	    )

	    chunks = text_splitter.split_documents(all_documents)

	    print(f"Se crearon {len(chunks)} chunks de texto")
	    return chunks

	# Keep the previous function name for backward compatibility
	def chunk_pdfs() -> List[Document]:
	"""
	Legacy entry point kept so existing callers keep working.

	Despite its name it is not limited to PDFs: it simply delegates to
	chunk_all_documents() and returns that function's result unchanged.
	"""
	return chunk_all_documents()