import uuid

import chromadb
import streamlit as st
import torch
from PyPDF2 import PdfReader
from transformers import AutoModel, AutoTokenizer, pipeline


# Load a sentence-embedding model for retrieval. The original loaded
# "cross-encoder/qnli-electra-base", but cross-encoders score text *pairs* and
# are not trained to produce standalone embeddings; a bi-encoder such as
# all-MiniLM-L6-v2 is the usual choice for similarity search. Cached so
# Streamlit reruns don't reload the weights.
@st.cache_resource
def load_embedding_model():
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    return model, tokenizer


# Generate an embedding for a piece of text
def generate_embedding(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the last hidden state into a single fixed-size vector
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings


# Initialize the Hugging Face extractive question-answering pipeline (cached)
@st.cache_resource
def load_qa_pipeline():
    return pipeline("question-answering", model="deepset/roberta-base-squad2")


# Extract text from every page of the uploaded PDF
def extract_pdf_text(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += (page.extract_text() or "") + "\n"
    return text


# Split text into overlapping chunks so answers that span a chunk
# boundary are less likely to be lost
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i:i + chunk_size])
    return chunks


# Create a ChromaDB collection and index the chunks with their embeddings
def create_chroma_collection(chunks, model, tokenizer):
    # Use a persistent client so the index lives on disk rather than in memory
    client = chromadb.PersistentClient(path="./chroma_db")

    # Use a UUID for a unique collection name; the original sampled
    # torch.rand(1) * 10000, which can collide across runs
    collection_name = f"pdf_qa_collection_{uuid.uuid4().hex[:8]}"
    collection = client.create_collection(name=collection_name)

    # Add each chunk together with its embedding
    for i, chunk in enumerate(chunks):
        embedding = generate_embedding(model, tokenizer, chunk)
        collection.add(
            ids=[f"chunk_{i}"],
            documents=[chunk],
            embeddings=[embedding.tolist()],
        )
    return client, collection, collection_name


# Retrieve the top-k most relevant chunks for the question
def retrieve_context(collection, question, model, tokenizer, top_k=3):
    question_embedding = generate_embedding(model, tokenizer, question)
    results = collection.query(
        query_embeddings=[question_embedding.tolist()],
        n_results=top_k,
    )
    return results["documents"][0]


# Main Streamlit app
def main():
    st.title("PDF Question Answering App")

    # Load the embedding model (cached across reruns)
    embedding_model, tokenizer = load_embedding_model()

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    question = st.text_input("Enter your question")

    if st.button("Get Answer"):
        if uploaded_file and question:
            try:
                qa_pipeline = load_qa_pipeline()

                # Extract and chunk the PDF text
                pdf_text = extract_pdf_text(uploaded_file)
                text_chunks = split_text_into_chunks(pdf_text)

                # Index the chunks in ChromaDB
                client, collection, collection_name = create_chroma_collection(
                    text_chunks, embedding_model, tokenizer
                )

                try:
                    # Retrieve the most relevant chunks for the question
                    contexts = retrieve_context(
                        collection, question, embedding_model, tokenizer
                    )

                    # Run extractive QA over each retrieved chunk
                    answers = [
                        qa_pipeline(question=question, context=context)
                        for context in contexts
                    ]

                    # Display the highest-confidence answer
                    best_answer = max(answers, key=lambda x: x["score"])
                    st.write("Answer:", best_answer["answer"])
                    st.write("Confidence Score:", best_answer["score"])
                finally:
                    # Clean up the ChromaDB collection even if QA fails,
                    # so abandoned collections don't accumulate on disk
                    client.delete_collection(name=collection_name)
            except Exception as e:
                st.error(f"An error occurred: {e}")
        else:
            st.warning("Please upload a PDF and enter a question.")


if __name__ == "__main__":
    main()
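
# ---------------------------------------------------------------------------
# Usage sketch, assuming this file is saved as app.py (the filename and exact
# package pins are assumptions, not part of the original script):
#
#   pip install streamlit chromadb torch transformers PyPDF2
#   streamlit run app.py
#
# Streamlit serves the app at http://localhost:8501 by default. The first run
# downloads the embedding and QA model weights from the Hugging Face Hub.
# ---------------------------------------------------------------------------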