import streamlit as st
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
from pdfminer.high_level import extract_text


def get_pdf_text(files):
    """Extract and concatenate plain text from uploaded PDF files.

    Args:
        files: Iterable of file-like objects (e.g. Streamlit ``UploadedFile``).

    Returns:
        One string containing each PDF's text with newlines collapsed to
        spaces, in the order the files were supplied.
    """
    # Collect per-file text and join once at the end: avoids quadratic
    # ``+`` concatenation and — unlike the original ``text + full_text``,
    # which PREPENDED each file — preserves the upload order.
    parts = []
    for file in files:
        parts.append(extract_text(file).replace("\n", " "))
    return "".join(parts)
|
|
# --- Streamlit UI: upload PDFs, embed them, answer a question via RAG ---
st.title("Embedding Creation for Langchain")
st.header("File Upload")
files = st.file_uploader("Upload your files", accept_multiple_files=True, type="pdf")

if files:
    question = st.text_input("Ask a question")
    if st.button("Search"):
        with st.spinner("Fetching 3 most similar matches..."):
            # Build the vector index from the uploaded PDFs.
            full_text = get_pdf_text(files)
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
            chunks = text_splitter.split_text(full_text)
            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
            db = FAISS.from_texts(chunks, embeddings)

            # RetrievalQA needs a LangChain LLM wrapper, not a raw
            # ``transformers`` model: the original passed
            # ``AutoModelForCausalLM.from_pretrained(...)`` directly, which
            # fails LangChain's LLM validation (and re-downloads a 7B model
            # on every click). ``HuggingFaceHub`` — already imported above —
            # runs the same repo through the hosted inference API.
            llm = HuggingFaceHub(
                repo_id="red1xe/Llama-2-7B-codeGPT",
                model_kwargs={"temperature": 0.5, "max_length": 512},
            )

            # NOTE(review): the original wired in a
            # ConversationBufferMemory(memory_key='chat_history'); RetrievalQA's
            # default prompt has no 'chat_history' variable, so that raises a
            # missing-input error. Memory is dropped here; switch to
            # ConversationalRetrievalChain if chat history is actually wanted.
            chain = RetrievalQA.from_llm(
                llm=llm,
                retriever=db.as_retriever(search_kwargs={"k": 3}),
            )

            # RetrievalQA has no ``.answer()`` method — invoke it with .run().
            answer = chain.run(question)
            st.write(answer)