swisscondor committed (verified)
Commit cf68bef · 1 Parent(s): cf505b9

Update app.py

Files changed (1):
  app.py +45 -10
app.py CHANGED

@@ -1,10 +1,30 @@
 import streamlit as st
 import chromadb
 import torch
-from transformers import pipeline
+from transformers import pipeline, AutoModel, AutoTokenizer
+import numpy as np
 from PyPDF2 import PdfReader
 import os
 
+# Load sentence transformer model for embeddings
+def load_embedding_model():
+    model = AutoModel.from_pretrained("cross-encoder/qnli-electra-base")
+    tokenizer = AutoTokenizer.from_pretrained("cross-encoder/qnli-electra-base")
+    return model, tokenizer
+
+# Generate embeddings for text
+def generate_embedding(model, tokenizer, text):
+    # Tokenize the text
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+
+    # Generate embeddings
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Use the last hidden state as embedding
+    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+    return embeddings
+
 # Initialize Hugging Face pipeline for question answering
 def load_qa_pipeline():
     return pipeline("question-answering", model="deepset/roberta-base-squad2")
@@ -24,8 +44,8 @@ def split_text_into_chunks(text, chunk_size=500, overlap=100):
         chunks.append(text[i:i+chunk_size])
     return chunks
 
-# Create ChromaDB collection
-def create_chroma_collection(chunks):
+# Create ChromaDB collection with embeddings
+def create_chroma_collection(chunks, model, tokenizer):
     # Use persistent client to avoid memory issues
     client = chromadb.PersistentClient(path="./chroma_db")
 
@@ -35,19 +55,27 @@ def create_chroma_collection(chunks):
     # Create collection
     collection = client.create_collection(name=collection_name)
 
-    # Add chunks to collection
+    # Add chunks to collection with embeddings
     for i, chunk in enumerate(chunks):
+        # Generate embedding for the chunk
+        embedding = generate_embedding(model, tokenizer, chunk)
+
         collection.add(
             ids=[f"chunk_{i}"],
-            documents=[chunk]
+            documents=[chunk],
+            embeddings=[embedding.tolist()]
         )
 
     return client, collection, collection_name
 
 # Retrieve most relevant context
-def retrieve_context(collection, question, top_k=3):
+def retrieve_context(collection, question, model, tokenizer, top_k=3):
+    # Generate embedding for the question
+    question_embedding = generate_embedding(model, tokenizer, question)
+
+    # Query the collection
     results = collection.query(
-        query_texts=[question],
+        query_embeddings=[question_embedding.tolist()],
         n_results=top_k
     )
     return results['documents'][0]
@@ -56,6 +84,9 @@ def retrieve_context(collection, question, top_k=3):
 def main():
     st.title("PDF Question Answering App")
 
+    # Load embedding model
+    embedding_model, tokenizer = load_embedding_model()
+
     # File uploader
     uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
 
@@ -75,11 +106,15 @@ def main():
     # Split text into chunks
     text_chunks = split_text_into_chunks(pdf_text)
 
-    # Create ChromaDB collection
-    client, collection, collection_name = create_chroma_collection(text_chunks)
+    # Create ChromaDB collection with embeddings
+    client, collection, collection_name = create_chroma_collection(
+        text_chunks, embedding_model, tokenizer
+    )
 
     # Retrieve context
-    contexts = retrieve_context(collection, question)
+    contexts = retrieve_context(
+        collection, question, embedding_model, tokenizer
+    )
 
     # Prepare answers
    answers = []
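
A minimal sketch of how the new embedding-based retrieval path could be exercised outside Streamlit, assuming app.py guards its main() call behind an if __name__ == "__main__": block so its functions are importable; the sanity_check.py name, the toy chunks, and the expected 768-dimension output (the hidden size of an ELECTRA-base model) are illustrative assumptions, not part of this commit:

# sanity_check.py -- illustrative only, not part of this commit
from app import (
    load_embedding_model,
    generate_embedding,
    create_chroma_collection,
    retrieve_context,
)

# Load the ELECTRA model and tokenizer the app now uses for embeddings
model, tokenizer = load_embedding_model()

# Mean-pooling the last hidden state of an ELECTRA-base model
# should yield one 768-dimensional vector per input text
vec = generate_embedding(model, tokenizer, "hello world")
print(vec.shape)  # expected: (768,)

# Index two toy chunks, then retrieve against a question
chunks = [
    "The Eiffel Tower is located in Paris.",
    "Streamlit turns Python scripts into web apps.",
]
client, collection, name = create_chroma_collection(chunks, model, tokenizer)
contexts = retrieve_context(collection, "Where is the Eiffel Tower?",
                            model, tokenizer, top_k=1)
print(contexts)  # the Paris chunk should rank first

# Remove the test collection from the persistent ./chroma_db store
client.delete_collection(name)

Because the commit switches collection.query from query_texts to query_embeddings, Chroma's built-in embedding function is bypassed on both the indexing and query side, so the same generate_embedding function must be applied to documents and questions alike for the distances to be comparable.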