Spaces:

ShanenThomas
/

PDF_RAG

Sleeping

App Files Files Community

ShanenThomas committed on Aug 16

Commit

4f729f0

verified ·

1 Parent(s): caf69ac

Create app.py

Browse files

Files changed (1) hide show

app.py +173 -0

app.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import os
+import faiss
+import numpy as np
+import gradio as gr
+from typing import List, Tuple
+from pypdf import PdfReader
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import InferenceClient
+# ==============================
+# Config
+# ==============================
+# Text-generation model requested through the hosted inference client.
+GEN_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
+# API token read from the environment; None when the variable is unset.
+HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")  # set in Space Secrets
+# Sentence-transformers model used to embed both document chunks and queries.
+EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+# Chunk window length in characters (the chunker slices the raw string).
+CHUNK_SIZE = 900
+# Number of characters shared by two consecutive chunks.
+CHUNK_OVERLAP = 150
+# How many nearest chunks are retrieved for each question.
+TOP_K = 4
+# ==============================
+# Globals (lifetime of the Space)
+# ==============================
+# Embedding model, instantiated once when the module is imported.
+emb = SentenceTransformer(EMB_MODEL_NAME)
+index = None                      # FAISS index (inner product / cosine); None until an index has been built
+doc_chunks: List[str] = []        # text chunks, in the same order their embeddings are added to the index
+doc_meta: List[dict] = []         # {"file": "..."} per chunk, parallel to doc_chunks
+# Client used for text generation with GEN_MODEL.
+client = InferenceClient(model=GEN_MODEL, token=HF_TOKEN)
+# ==============================
+# Helpers
+# ==============================
+def _chunk_text(text: str, size: int, overlap: int) -> List[str]:
+    chunks = []
+    start = 0
+    n = len(text)
+    step = size - overlap
+    while start < n:
+        end = min(start + size, n)
+        chunks.append(text[start:end])
+        start += step
+    return [c.strip() for c in chunks if c.strip()]
+def _embed(texts: List[str]) -> np.ndarray:
+    # 384-d for MiniLM; normalize for cosine/IP search
+    X = emb.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
+    return np.asarray(X, dtype=np.float32)
+def _ensure_index(dim: int) -> None:
+    """Bind the module-level ``index`` to a new ``faiss.IndexFlatIP(dim)``.
+
+    Despite the name, an existing index is never reused: every call
+    replaces whatever ``index`` referred to before.
+    """
+    global index
+    index = faiss.IndexFlatIP(dim)  # cosine via normalized vectors
+def _extract_text_from_pdf(path: str) -> str:
+    reader = PdfReader(path)
+    pages = []
+    for p in reader.pages:
+        t = p.extract_text() or ""
+        pages.append(t)
+    return "\n".join(pages)
+# ==============================
+# Build index
+# ==============================
+def build_from_pdfs(files) -> str:
+    global index, doc_chunks, doc_meta
+    doc_chunks, doc_meta = [], []
+    # 1) read PDFs → 2) chunk → collect
+    for f in files:
+        try:
+            text = _extract_text_from_pdf(f.name)
+        except Exception as e:
+            return f"Failed to read {os.path.basename(f.name)}: {e}"
+        chunks = _chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
+        for c in chunks:
+            doc_chunks.append(c)
+            doc_meta.append({"file": os.path.basename(f.name)})
+    if not doc_chunks:
+        return "No text extracted. Check your PDFs."
+    # 3) embeddings → FAISS
+    E = _embed(doc_chunks)
+    _ensure_index(E.shape[1])
+    index.add(E)
+    return f"Indexed {len(doc_chunks)} chunks from {len(files)} file(s)."
+# ==============================
+# Retrieval + Generation
+# ==============================
+def _retrieve(query: str, k: int = TOP_K) -> Tuple[List[int], List[str]]:
+    qv = _embed([query])  # shape (1, d)
+    sims, idxs = index.search(qv, k)  # inner product similarity
+    ids = idxs[0].tolist()
+    # Filter out -1 (in case FAISS returns for empty)
+    ids = [i for i in ids if i >= 0]
+    return ids, [doc_chunks[i] for i in ids]
+# Instruction text placed at the start of every prompt sent to the model:
+# answer from the supplied context only, admit when the answer is missing,
+# and finish with the source filenames.
+SYSTEM_PROMPT = (
+    "You are a helpful assistant. Use the given CONTEXT to answer the QUESTION.\n"
+    "If the answer is not in the context, say you don't know.\n"
+    "Provide a concise answer and list source filenames as [source: file.pdf] at the end."
+)
+def _mistral_prompt(question: str, context: str) -> str:
+    # Simple Mistral-instruct prompt format
+    return (
+        f"[INST] {SYSTEM_PROMPT}\n\n"
+        f"QUESTION: {question}\n\n"
+        f"CONTEXT:\n{context}\n"
+        f"[/INST]"
+    )
+def answer(question: str) -> str:
+    if not question.strip():
+        return "Ask a question."
+    if index is None or not doc_chunks:
+        return "Upload PDFs and click **Build Index** first."
+    ids, ctx_chunks = _retrieve(question, TOP_K)
+    # keep contexts reasonably short per chunk
+    previews = []
+    contexts = []
+    files = []
+    for rank, i in enumerate(ids, start=1):
+        chunk = doc_chunks[i][:1000]
+        fname = doc_meta[i]["file"]
+        contexts.append(f"[{rank}] {fname}\n{chunk}")
+        previews.append(f"[{rank}] {fname}")
+        files.append(fname)
+    context_str = "\n\n---\n".join(contexts)
+    prompt = _mistral_prompt(question, context_str)
+    try:
+        # Use hosted Inference API; returns a single string
+        out = client.text_generation(
+            prompt,
+            max_new_tokens=512,
+            temperature=0.2,
+            top_p=0.95,
+            repetition_penalty=1.05,
+            do_sample=True,
+            return_full_text=False,
+        )
+        # Ensure sources are visible at the end
+        unique_files = ", ".join(sorted(set(files)))
+        return f"{out.strip()}\n\nSources: {unique_files}"
+    except Exception as e:
+        return f"Generation error: {e}\n(Verify your HUGGINGFACEHUB_API_TOKEN and model name.)"
+# ==============================
+# UI
+# ==============================
+# Two-column layout: upload/build controls on the left, Q&A on the right.
+with gr.Blocks(title="Mistral 7B PDF-RAG") as demo:
+    gr.Markdown("# 📚 PDF-RAG (Mistral-7B-Instruct)\nUpload PDFs → Build Index → Ask questions. Answers cite sources.")
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Multiple-PDF upload, the button that triggers indexing, and a
+            # Markdown area that shows the status string from build_from_pdfs.
+            files = gr.File(file_count="multiple", file_types=[".pdf"], label="Upload PDF books")
+            build_btn = gr.Button("Build Index", variant="primary")
+            status = gr.Markdown()
+        with gr.Column(scale=2):
+            # Question box, ask button, and a Markdown area for the answer.
+            q = gr.Textbox(label="Ask a question", placeholder="What does the book say about ...?")
+            ask_btn = gr.Button("Ask ➜")
+            a = gr.Markdown()
+    # Event wiring: build the index from the uploads; answer on click or Enter.
+    build_btn.click(build_from_pdfs, inputs=[files], outputs=[status])
+    ask_btn.click(answer, inputs=[q], outputs=[a])
+    q.submit(answer, inputs=[q], outputs=[a])  # hit Enter to ask
+# Start the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()