Update app.py
app.py CHANGED
@@ -1,216 +1,3 @@
-# import os
-# import numpy as np
-# import gradio as gr
-# from typing import List, Tuple
-# from pypdf import PdfReader
-# from sentence_transformers import SentenceTransformer
-# from huggingface_hub import InferenceClient
-
-# # -------------------------------------------------
-# # Config
-# # -------------------------------------------------
-# # You can swap to another chat model if needed, e.g.:
-# # "mistralai/Mistral-Nemo-Instruct-2407" or "meta-llama/Llama-3.1-8B-Instruct"
-# GEN_MODEL = os.getenv("GEN_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")
-# HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")  # set in Space Secrets
-# EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-# CHUNK_SIZE = 900
-# CHUNK_OVERLAP = 150
-# TOP_K = 4
-
-# # -------------------------------------------------
-# # Try FAISS; fallback to pure NumPy search
-# # -------------------------------------------------
-# USE_FAISS = True
-# try:
-#     import faiss  # type: ignore
-# except Exception:
-#     USE_FAISS = False
-
-# # -------------------------------------------------
-# # Globals
-# # -------------------------------------------------
-# emb = SentenceTransformer(EMB_MODEL_NAME)
-# index = None   # FAISS index (if available)
-# matrix = None  # fallback: stacked embeddings
-# doc_chunks: List[str] = []
-# doc_meta: List[dict] = []
-# client = InferenceClient(model=GEN_MODEL, token=HF_TOKEN)
-
-# SYSTEM_PROMPT = (
-#     "You are a helpful assistant. Use the given CONTEXT to answer the QUESTION.\n"
-#     "If the answer is not in the context, say you don't know.\n"
-#     "Be concise and list source filenames as [source: file.pdf] at the end."
-# )
-
-# # -------------------------------------------------
-# # Helpers
-# # -------------------------------------------------
-# def _extract_text_from_pdf(path: str) -> str:
-#     r = PdfReader(path)
-#     pages = [(p.extract_text() or "") for p in r.pages]
-#     return "\n".join(pages)
-
-# def _chunk_text(text: str, size: int, overlap: int) -> List[str]:
-#     chunks, step = [], size - overlap
-#     i, n = 0, len(text)
-#     while i < n:
-#         chunk = text[i:i+size].strip()
-#         if chunk: chunks.append(chunk)
-#         i += step
-#     return chunks
-
-# def _embed(texts: List[str]) -> np.ndarray:
-#     X = emb.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
-#     return np.asarray(X, dtype=np.float32)
-
-# def _ensure_index(dim: int):
-#     global index, matrix
-#     if USE_FAISS:
-#         index = faiss.IndexFlatIP(dim)
-#     else:
-#         index = None
-#         matrix = None
-
-# def _add_embeddings(E: np.ndarray):
-#     global matrix
-#     if USE_FAISS:
-#         index.add(E)
-#     else:
-#         matrix = E if matrix is None else np.vstack([matrix, E])
-
-# def _search(qv: np.ndarray, k: int):
-#     if USE_FAISS:
-#         return index.search(qv, k)  # returns (D, I)
-#     sims = matrix @ qv[0]  # IP because vectors are normalized
-#     I = np.argsort(-sims)[:k]
-#     D = sims[I]
-#     return D[None, :], I[None, :]
-
-# # -------------------------------------------------
-# # Build index
-# # -------------------------------------------------
-# def build_from_pdfs(files) -> str:
-#     global doc_chunks, doc_meta
-#     doc_chunks, doc_meta = [], []
-
-#     for f in files:
-#         text = _extract_text_from_pdf(f.name)
-#         chunks = _chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
-#         for c in chunks:
-#             doc_chunks.append(c)
-#             doc_meta.append({"file": os.path.basename(f.name)})
-
-#     if not doc_chunks:
-#         return "No text extracted. Check your PDFs."
-
-#     E = _embed(doc_chunks)
-#     _ensure_index(E.shape[1])
-#     _add_embeddings(E)
-#     return f"Indexed {len(doc_chunks)} chunks from {len(files)} file(s)."
-
-# # -------------------------------------------------
-# # Retrieval + Generation
-# # -------------------------------------------------
-# def _retrieve(query: str, k: int = TOP_K) -> Tuple[List[int], List[str]]:
-#     qv = _embed([query])
-#     _, idxs = _search(qv, k)
-#     ids = [i for i in idxs[0].tolist() if i >= 0]
-#     return ids, [doc_chunks[i] for i in ids]
-
-# def _call_chat(messages):
-#     """
-#     Try several Hugging Face client paths for max compatibility.
-#     Returns generated string or raises last exception.
-#     """
-#     # 1) Newer helper
-#     try:
-#         resp = client.chat_completion(messages=messages, max_tokens=512, temperature=0.2, top_p=0.95)
-#         # resp.choices[0].message.content (object or dict)
-#         choice = resp.choices[0]
-#         msg = getattr(choice, "message", None) or choice["message"]
-#         return getattr(msg, "content", None) or msg["content"]
-#     except Exception as e1:
-#         last = e1
-#     # 2) OpenAI-style
-#     try:
-#         resp = client.chat.completions.create(model=GEN_MODEL, messages=messages, max_tokens=512, temperature=0.2, top_p=0.95)
-#         choice = resp.choices[0]
-#         msg = getattr(choice, "message", None) or choice["message"]
-#         return getattr(msg, "content", None) or msg["content"]
-#     except Exception as e2:
-#         last = e2
-#     # 3) Text generation with a single composed prompt
-#     try:
-#         prompt = f"[INST] {SYSTEM_PROMPT}\n\n{messages[-1]['content']} [/INST]"
-#         return client.text_generation(prompt, max_new_tokens=512, temperature=0.2, top_p=0.95,
-#                                       repetition_penalty=1.05, do_sample=True, return_full_text=False).strip()
-#     except Exception as e3:
-#         last = e3
-#     # 4) Old conversational task
-#     try:
-#         conv = client.conversational(
-#             past_user_inputs=[],
-#             generated_responses=[],
-#             text=messages[-1]["content"],
-#             parameters={"temperature": 0.2, "max_new_tokens": 512},
-#         )
-#         return conv["generated_text"] if isinstance(conv, dict) else conv.generated_text
-#     except Exception as e4:
-#         last = e4
-#     raise last
-
-# def answer(question: str) -> str:
-#     if not question.strip():
-#         return "Ask a question."
-#     if (USE_FAISS and index is None) or (not USE_FAISS and matrix is None) or not doc_chunks:
-#         return "Upload PDFs and click **Build Index** first."
-
-#     ids, ctx_chunks = _retrieve(question, TOP_K)
-#     contexts, files = [], []
-#     for rank, i in enumerate(ids, start=1):
-#         chunk = doc_chunks[i][:1000]
-#         fname = doc_meta[i]["file"]
-#         contexts.append(f"[{rank}] {fname}\n{chunk}")
-#         files.append(fname)
-
-#     context_str = "\n\n---\n".join(contexts)
-#     messages = [
-#         {"role": "system", "content": SYSTEM_PROMPT},
-#         {"role": "user", "content": f"QUESTION: {question}\n\nCONTEXT:\n{context_str}"},
-#     ]
-
-#     try:
-#         out = _call_chat(messages)
-#         unique_files = ", ".join(sorted(set(files))) if files else "N/A"
-#         return f"{out.strip()}\n\nSources: {unique_files}"
-#     except Exception as e:
-#         return f"Generation error: {e}\n(Verify your HUGGINGFACEHUB_API_TOKEN and model availability.)"
-
-# # -------------------------------------------------
-# # UI
-# # -------------------------------------------------
-# with gr.Blocks(title="Mistral 7B PDF-RAG") as demo:
-#     gr.Markdown("# 📚 PDF-RAG (Mistral-7B-Instruct)\nUpload PDFs → Build Index → Ask questions. Answers cite sources.")
-
-#     with gr.Row():
-#         with gr.Column(scale=1):
-#             files = gr.File(file_count="multiple", file_types=[".pdf"], label="Upload PDF books")
-#             build_btn = gr.Button("Build Index", variant="primary")
-#             status = gr.Markdown()
-#         with gr.Column(scale=2):
-#             q = gr.Textbox(label="Ask a question", placeholder="What does the book say about ...?")
-#             ask_btn = gr.Button("Ask ➜")
-#             a = gr.Markdown()
-
-#     build_btn.click(build_from_pdfs, inputs=[files], outputs=[status])
-#     ask_btn.click(answer, inputs=[q], outputs=[a])
-#     q.submit(answer, inputs=[q], outputs=[a])
-
-# if __name__ == "__main__":
-#     demo.launch()
-
-
 
 """
 BOOK BUDDY — Ask questions about your PDFs
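
Note on the deleted retrieval code above: the NumPy fallback in _search works because _embed L2-normalizes every vector (normalize_embeddings=True), so inner product equals cosine similarity and faiss.IndexFlatIP can be replaced by a plain matrix multiply. A minimal self-contained sketch of that fallback, with random vectors standing in for real embeddings (shapes and counts are illustrative, not from the app):

import numpy as np

rng = np.random.default_rng(0)

# Corpus embeddings, L2-normalized row-wise (what normalize_embeddings=True produces)
matrix = rng.normal(size=(100, 384)).astype(np.float32)
matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)

# Query embedding, normalized the same way
qv = rng.normal(size=(1, 384)).astype(np.float32)
qv /= np.linalg.norm(qv, axis=1, keepdims=True)

k = 4
sims = matrix @ qv[0]          # inner product == cosine similarity here
I = np.argsort(-sims)[:k]      # top-k indices, best first
D = sims[I]                    # their similarity scores
print(D[None, :], I[None, :])  # same (D, I) shape as faiss index.search returns
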
@@ -260,7 +47,7 @@ client = InferenceClient(model=ROBOT_MODEL, token=HF_TOKEN)
 # A friendly rule for the robot
 ROBOT_RULES = (
     "You are a helpful assistant. Use the given CONTEXT to answer the QUESTION.\n"
-    "If the answer is not in the context, say you don't know.\n"
+    "!!IMPORTANT!! - If the answer is not in the context, Strictly say 'I don't know.', Do not respond any other answers!!!\n"
     "Be short and add source filenames at the end like [source: file.pdf]."
 )
 
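
The tightened ROBOT_RULES only has an effect if it reaches the model as the system message. A minimal sketch of how the rule is applied at query time, mirroring the message shape used by answer() in the deleted code above (build_messages is an illustrative helper, not part of the app):

ROBOT_RULES = (
    "You are a helpful assistant. Use the given CONTEXT to answer the QUESTION.\n"
    "!!IMPORTANT!! - If the answer is not in the context, Strictly say 'I don't know.', "
    "Do not respond any other answers!!!\n"
    "Be short and add source filenames at the end like [source: file.pdf]."
)

def build_messages(question: str, context_str: str) -> list:
    # The rule rides in the system message; retrieved chunks arrive as CONTEXT.
    return [
        {"role": "system", "content": ROBOT_RULES},
        {"role": "user", "content": f"QUESTION: {question}\n\nCONTEXT:\n{context_str}"},
    ]
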
@@ -420,10 +207,8 @@ with gr.Blocks(title="📚 Book Buddy — Ask your PDFs") as demo:
     q = gr.Textbox(label="Ask a question", placeholder="Example: Give me 3 key points from this book.")
     examples = gr.Examples(
         examples=[
-            ["What is a molecule?"],
             ["Summarize the main idea in 2 sentences."],
             ["List 3 important facts from this book."],
-            ["What is the name of the book?"],
         ],
         inputs=q,
     )
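
The two removed examples look tied to one particular book, while the remaining ones work for any uploaded PDF. A standalone sketch of the trimmed Examples wiring (not the full app): clicking an example fills the question textbox via inputs=q.

import gradio as gr

with gr.Blocks() as demo:
    q = gr.Textbox(label="Ask a question")
    gr.Examples(
        examples=[
            ["Summarize the main idea in 2 sentences."],
            ["List 3 important facts from this book."],
        ],
        inputs=q,
    )

if __name__ == "__main__":
    demo.launch()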