Spaces:

ShanenThomas
/

PDF_RAG

Sleeping

App Files Files Community

PDF_RAG / app.py

ShanenThomas

Update app.py

77d4f4a verified 4 months ago

raw

history blame contribute delete

8.26 kB


	"""
	BOOK BUDDY — Ask questions about your PDFs

	This file is written with super-simple names and big comments,
	so a kid can read it and understand what’s going on.

	How it works:
	1) We read your PDF and cut it into small text pieces.
	2) We make "numbers" (embeddings) for each piece so we can search fast.
	3) When you ask a question, we find the best pieces and give them
	to a friendly robot model (Mistral) to make a short answer.
	4) We also show which book files we used (sources).
	"""

	import os
	import numpy as np
	import gradio as gr
	from typing import List, Tuple
	from pypdf import PdfReader
	from sentence_transformers import SentenceTransformer
	from huggingface_hub import InferenceClient

	# ====== SETTINGS YOU CAN CHANGE ======
	# Which text-generation model answers the questions; the GEN_MODEL
	# environment variable overrides the default.
	ROBOT_MODEL = os.environ.get("GEN_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")
	# Hugging Face access token, read from the environment (None when unset).
	HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
	# Sentence-embedding model used to turn text pieces into vectors.
	EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
	# Characters per text piece, and how many characters neighbours share.
	PIECE_SIZE, PIECE_OVERLAP = 900, 150
	# How many of the best-matching pieces are sent to the model.
	TOP_K = 4

	# ====== TRY FAISS (fast search). IF NOT, USE SIMPLE NUMPY SEARCH ======
	# USE_FAISS records whether the optional faiss import worked; the rest of
	# the file branches on it to pick the FAISS or the NumPy code path.
	try:
	    import faiss  # fast similarity search
	except Exception:
	    # Any import-time failure switches the app to the NumPy fallback.
	    USE_FAISS = False
	else:
	    USE_FAISS = True

	# ====== GLOBAL MEMORY (lives while the app is running) ======
	# Embedding model that turns text into vectors.
	make_numbers = SentenceTransformer(EMBEDDING_MODEL)
	# Client used to call the text-generation model on Hugging Face.
	client = InferenceClient(model=ROBOT_MODEL, token=HF_TOKEN)
	# Parallel lists: all_files[i] is the file name that all_pieces[i] came from.
	all_pieces: List[str] = []
	all_files: List[str] = []
	# Vector storage: faiss_index is used when FAISS imported successfully,
	# all_vectors (a NumPy matrix) otherwise. Both start out empty.
	faiss_index = all_vectors = None

	# System prompt for the model: answer only from the supplied context,
	# admit when the answer is missing, and cite the source file names.
	ROBOT_RULES = "\n".join(
	    [
	        "You are a helpful assistant. Use the given CONTEXT to answer the QUESTION.",
	        "!!IMPORTANT!! - If the answer is not in the context, Strictly say 'I don't know.', Do not respond any other answers!!!",
	        "Be short and add source filenames at the end like [source: file.pdf].",
	    ]
	)

	# ====== LITTLE HELPER FUNCTIONS ======

	def read_pdf_text(path: str) -> str:
	    """Return the text of every page in the PDF at *path*, joined by newlines.

	    A page whose extraction yields nothing contributes an empty string,
	    so the page count is still reflected in the number of separators.
	    """
	    page_texts = []
	    for page in PdfReader(path).pages:
	        extracted = page.extract_text()
	        page_texts.append(extracted or "")
	    return "\n".join(page_texts)

	def cut_into_pieces(big_text: str, size: int, overlap: int) -> List[str]:
	    """Cut text into small overlapping pieces (like puzzle pieces).

	    Windows of ``size`` characters are taken every ``size - overlap``
	    characters. Each window is stripped of surrounding whitespace, and
	    windows that end up empty are dropped.

	    Args:
	        big_text: The full text to split.
	        size: Maximum number of characters per piece.
	        overlap: Number of characters shared by neighbouring pieces.

	    Returns:
	        The non-empty pieces, in document order.

	    Raises:
	        ValueError: If ``overlap`` is not smaller than ``size``. The window
	            would never move forward, which previously looped forever.
	    """
	    step = size - overlap
	    if step <= 0:
	        raise ValueError("overlap must be smaller than size")

	    windows = (
	        big_text[start:start + size].strip()
	        for start in range(0, len(big_text), step)
	    )
	    return [piece for piece in windows if piece]

	def embed_texts(texts: List[str]) -> np.ndarray:
	    """Encode *texts* with the shared embedding model and return float32 vectors.

	    The encoder is asked for normalized embeddings, which is what lets the
	    search code treat an inner product as cosine similarity.
	    """
	    encoded = make_numbers.encode(
	        texts,
	        convert_to_numpy=True,
	        normalize_embeddings=True,
	    )
	    return np.asarray(encoded, dtype=np.float32)

	def start_memory(dim: int):
	    """Reset the vector store so it is ready for vectors of width *dim*.

	    With FAISS available a fresh inner-product index is created; otherwise
	    both stores are cleared and the NumPy matrix is filled in later by
	    ``add_to_memory``.
	    """
	    global faiss_index, all_vectors
	    # Inner product equals cosine similarity because embeddings are normalized.
	    faiss_index = faiss.IndexFlatIP(dim) if USE_FAISS else None
	    # all_vectors is only ever populated on the NumPy path, so clearing it
	    # here is a no-op when FAISS is in use.
	    all_vectors = None

	def add_to_memory(vectors: np.ndarray):
	    """Append *vectors* to whichever store is active (FAISS index or NumPy matrix)."""
	    global all_vectors
	    if USE_FAISS:
	        faiss_index.add(vectors)
	        return
	    # NumPy fallback: the first batch becomes the matrix, later ones are stacked below it.
	    if all_vectors is None:
	        all_vectors = vectors
	    else:
	        all_vectors = np.vstack([all_vectors, vectors])

	def search_best_pieces(query_vector: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]:
	    """Return ``(scores, indices)`` of the *k* stored pieces closest to the query.

	    ``query_vector`` is indexed as a batch holding one row. On the NumPy
	    path both results get a leading axis of length 1 so they can be
	    consumed the same way as the FAISS search result.
	    """
	    if not USE_FAISS:
	        # Dot product against every stored vector (cosine/IP because normalized).
	        similarity = all_vectors @ query_vector[0]
	        # Negate so argsort yields best-first order, then keep the first k.
	        best = np.argsort(-similarity)[:k]
	        return similarity[best][None, :], best[None, :]
	    return faiss_index.search(query_vector, k)  # returns (distances, indices)

	# ====== MAIN ACTIONS THE BUTTONS CALL ======

	def reset_everything():
	    """Forget the index and all stored pieces, then return a status message for the UI."""
	    global faiss_index, all_vectors, all_pieces, all_files
	    # Drop both possible vector stores.
	    faiss_index = all_vectors = None
	    # Fresh, separate lists for the pieces and their source file names.
	    all_pieces, all_files = [], []
	    return "Cleared! Upload PDFs again and click Build Index."

	def build_memory_from_pdfs(files) -> str:
	    """Read PDFs → cut into pieces → turn to numbers → store them.

	    Args:
	        files: The uploads handed over by the file component. May be
	            ``None`` or empty when nothing was uploaded. Each entry may be
	            a path (``str`` / ``os.PathLike``) or an object exposing the
	            path as ``.name``.

	    Returns:
	        A status message for the UI: how many pieces were indexed, or a
	        hint that no text could be found.
	    """
	    global all_pieces, all_files
	    # Forget the previous build up front, so a failure below can never
	    # leave old pieces paired with a new index or the other way round.
	    all_pieces, all_files = [], []
	    uploads = files or []  # clicking Build Index with no upload passes None

	    # 1) read + cut (into locals; published only once indexing succeeded)
	    new_pieces: List[str] = []
	    new_files: List[str] = []
	    for f in uploads:
	        # Accept both plain paths and file-like wrappers carrying .name.
	        path = os.fspath(f) if isinstance(f, (str, os.PathLike)) else f.name
	        pieces = cut_into_pieces(read_pdf_text(path), PIECE_SIZE, PIECE_OVERLAP)
	        new_pieces.extend(pieces)
	        new_files.extend([os.path.basename(path)] * len(pieces))

	    if not new_pieces:
	        return "No text found. Try another PDF."

	    # 2) embeddings + memory
	    E = embed_texts(new_pieces)
	    start_memory(E.shape[1])
	    add_to_memory(E)

	    # 3) publish the pieces now that the index matches them
	    all_pieces, all_files = new_pieces, new_files
	    return f"Indexed {len(all_pieces)} pieces from {len(uploads)} file(s)."

	def ask_robot(question: str) -> str:
	    """Search the best pieces and ask the robot model to answer.

	    Args:
	        question: The user's question. ``None`` or blank text is rejected
	            with a hint instead of being sent to the model.

	    Returns:
	        The model's answer followed by a ``Sources:`` line naming the files
	        the context came from, or a short hint when there is no question
	        or no index yet.
	    """
	    if not question or not question.strip():
	        return "Type a question in the box."
	    index_missing = (faiss_index is None) if USE_FAISS else (all_vectors is None)
	    if index_missing or not all_pieces:
	        return "Upload PDFs and press Build Index first."

	    # 1) find helpful pieces
	    qv = embed_texts([question])
	    _, idxs = search_best_pieces(qv, TOP_K)
	    # Drop negative "no result" slots and anything outside the stored
	    # pieces, so a short or out-of-date index cannot cause an IndexError.
	    ids = [i for i in idxs[0].tolist() if 0 <= i < len(all_pieces)]

	    # 2) build the context we give to the robot
	    context_blocks = []
	    used_files = []
	    for rank, i in enumerate(ids, start=1):
	        snippet = all_pieces[i][:1000]  # cap each snippet at 1000 characters
	        fname = all_files[i]
	        context_blocks.append(f"[{rank}] {fname}\n{snippet}")
	        used_files.append(fname)

	    context_text = "\n\n---\n".join(context_blocks)

	    # 3) talk to the robot on Hugging Face
	    messages = [
	        {"role": "system", "content": ROBOT_RULES},
	        {"role": "user", "content": f"QUESTION: {question}\n\nCONTEXT:\n{context_text}"},
	    ]

	    # Plan A: chat-completions (most models)
	    try:
	        resp = client.chat.completions.create(
	            model=ROBOT_MODEL,
	            messages=messages,
	            max_tokens=512,
	            temperature=0.2,
	            top_p=0.95,
	        )
	        out = resp.choices[0].message.content
	    except Exception:
	        # Plan B: plain text generation (some endpoints).
	        # The broad catch is a deliberate best-effort fallback; an error
	        # raised by Plan B itself still propagates to the caller.
	        prompt = f"[INST] {ROBOT_RULES}\n\nQUESTION: {question}\n\nCONTEXT:\n{context_text}\n[/INST]"
	        out = client.text_generation(
	            prompt,
	            max_new_tokens=512,
	            temperature=0.2,
	            top_p=0.95,
	            repetition_penalty=1.05,
	            do_sample=True,
	            return_full_text=False,
	        )

	    # 4) add sources (the book files we used)
	    # The reply may come back as None; treat that as an empty answer
	    # rather than crashing on .strip().
	    answer_text = (out or "").strip()
	    unique_sources = ", ".join(sorted(set(used_files))) if used_files else "N/A"
	    return f"{answer_text}\n\nSources: {unique_sources}"

	# ====== THE SIMPLE WEB PAGE ======
	# Layout: a heading, then two columns. The left column has the upload
	# box, the Build/Reset buttons and a status line; the right column has
	# the question box, example questions, the Ask button and the answer.
	with gr.Blocks(title="📚 Book Buddy — Ask your PDFs") as demo:
	    gr.Markdown(
	        "## 📚 Book Buddy\n"
	        "1) Upload your PDF book. 2) Press Build Index (Book Buddy learns!). "
	        "3) Ask your question. 4) Look at Sources to see which file was used.\n"
	        "_Tip: start with one small PDF so it’s fast._"
	    )

	    with gr.Row():
	        with gr.Column(scale=1):
	            pdfs = gr.File(
	                file_count="multiple",
	                file_types=[".pdf"],
	                label="Upload PDF books",
	            )
	            build_btn = gr.Button("🧠 Build Index", variant="primary")
	            reset_btn = gr.Button("🔁 Reset")
	            status = gr.Markdown()
	        with gr.Column(scale=2):
	            q = gr.Textbox(
	                label="Ask a question",
	                placeholder="Example: Give me 3 key points from this book.",
	            )
	            examples = gr.Examples(
	                examples=[
	                    ["Summarize the main idea in 2 sentences."],
	                    ["List 3 important facts from this book."],
	                ],
	                inputs=q,
	            )
	            ask_btn = gr.Button("➡️ Ask")
	            answer = gr.Markdown()

	    # Wire the buttons: building and resetting both report into the status line.
	    build_btn.click(fn=build_memory_from_pdfs, inputs=[pdfs], outputs=[status])
	    reset_btn.click(fn=reset_everything, inputs=None, outputs=[status])
	    # Clicking Ask and pressing Enter in the question box do the same thing.
	    for ask_event in (ask_btn.click, q.submit):
	        ask_event(ask_robot, inputs=[q], outputs=[answer])

	# Start the web server only when this file is run directly.
	if __name__ == "__main__":
	    demo.launch()