# QuickQuery / app.py
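"""
Minimal retrieval-augmented QA over a single PDF: the file is split into
overlapping character chunks, each chunk is embedded and stored in a ChromaDB
collection, the chunks closest to the question are retrieved, and an
extractive QA model picks the best answer span from them.
"""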
import uuid

import chromadb
import streamlit as st
import torch
from PyPDF2 import PdfReader
from transformers import AutoModel, AutoTokenizer, pipeline
# Load a sentence-transformer model for embeddings. The original checkpoint,
# cross-encoder/qnli-electra-base, is a cross-encoder trained for sentence-pair
# scoring rather than standalone embeddings; a bi-encoder such as
# sentence-transformers/all-MiniLM-L6-v2 is the usual choice.
@st.cache_resource
def load_embedding_model():
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    return model, tokenizer
# Generate embeddings for text
def generate_embedding(model, tokenizer, text):
# Tokenize the text
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
# Generate embeddings
with torch.no_grad():
outputs = model(**inputs)
    # Mean-pool the last hidden state across the sequence to get one vector
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
return embeddings
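# Note: because each text is tokenized on its own (no padding), mean pooling
# over every position is safe here; with batched, padded inputs you would mask
# out padding via the attention mask before averaging.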
# Initialize the Hugging Face extractive question-answering pipeline,
# cached so the model is only downloaded and loaded once per session
@st.cache_resource
def load_qa_pipeline():
    return pipeline("question-answering", model="deepset/roberta-base-squad2")
# Extract text from PDF
def extract_pdf_text(pdf_file):
reader = PdfReader(pdf_file)
text = ""
    for page in reader.pages:
        # extract_text() can return None for pages without a text layer
        text += (page.extract_text() or "") + "\n"
return text
# Split text into chunks
def split_text_into_chunks(text, chunk_size=500, overlap=100):
chunks = []
for i in range(0, len(text), chunk_size - overlap):
chunks.append(text[i:i+chunk_size])
return chunks
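# With the defaults, the step is 400 characters, so consecutive chunks share
# 100 characters of overlap: chunk 0 covers [0, 500), chunk 1 covers
# [400, 900), chunk 2 covers [800, 1300), and so on.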
# Create ChromaDB collection with embeddings
def create_chroma_collection(chunks, model, tokenizer):
# Use persistent client to avoid memory issues
client = chromadb.PersistentClient(path="./chroma_db")
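    # PersistentClient stores its data on disk under ./chroma_db; main()
    # deletes the collection again after a successful run, so this directory
    # only holds transient per-question data.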
    # Create a unique collection name (a uuid avoids collisions between runs)
    collection_name = f"pdf_qa_collection_{uuid.uuid4().hex}"
# Create collection
collection = client.create_collection(name=collection_name)
    # Embed every chunk, then add them to the collection in one batched call
    chunk_embeddings = [
        generate_embedding(model, tokenizer, chunk).tolist() for chunk in chunks
    ]
    collection.add(
        ids=[f"chunk_{i}" for i in range(len(chunks))],
        documents=chunks,
        embeddings=chunk_embeddings,
    )
return client, collection, collection_name
# Retrieve most relevant context
def retrieve_context(collection, question, model, tokenizer, top_k=3):
# Generate embedding for the question
question_embedding = generate_embedding(model, tokenizer, question)
    # Query the collection, clamping n_results so it never exceeds the number
    # of stored chunks
    results = collection.query(
        query_embeddings=[question_embedding.tolist()],
        n_results=min(top_k, collection.count()),
    )
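    # query() returns parallel lists per query embedding, so
    # results['documents'][0] is the list of top_k chunk texts for our single
    # question, ordered from most to least similar.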
return results['documents'][0]
# Main Streamlit app
def main():
st.title("PDF Question Answering App")
# Load embedding model
embedding_model, tokenizer = load_embedding_model()
# File uploader
uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
# Question input
question = st.text_input("Enter your question")
# Run button
if st.button("Get Answer"):
if uploaded_file and question:
try:
# Load QA pipeline
qa_pipeline = load_qa_pipeline()
# Extract PDF text
pdf_text = extract_pdf_text(uploaded_file)
# Split text into chunks
text_chunks = split_text_into_chunks(pdf_text)
# Create ChromaDB collection with embeddings
client, collection, collection_name = create_chroma_collection(
text_chunks, embedding_model, tokenizer
)
# Retrieve context
contexts = retrieve_context(
collection, question, embedding_model, tokenizer
)
# Prepare answers
answers = []
for context in contexts:
result = qa_pipeline(question=question, context=context)
answers.append(result)
# Display best answer
best_answer = max(answers, key=lambda x: x['score'])
st.write("Answer:", best_answer['answer'])
st.write("Confidence Score:", best_answer['score'])
# Clean up ChromaDB collection
client.delete_collection(name=collection_name)
except Exception as e:
st.error(f"An error occurred: {e}")
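        else:
            # Give feedback when inputs are missing instead of failing silently
            st.warning("Please upload a PDF and enter a question.")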
if __name__ == "__main__":
main()