Spaces:
Build error
Build error
| # Import necessary libraries | |
| from langchain.document_loaders import DirectoryLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings import SentenceTransformerEmbeddings | |
| from langchain.vectorstores import Chroma | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
| from langchain_community.llms import HuggingFacePipeline | |
| from langchain.chains.question_answering import load_qa_chain | |
# Load and process documents
# NOTE(review): this module-level name shadows the builtin dir(); it is kept
# because load_docs is called with it below.
dir = "data"


def load_docs(dir):
    """Load documents from ``dir`` using ``DirectoryLoader``.

    Args:
        dir: Path handed straight to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader(dir).load()``.
    """
    return DirectoryLoader(dir).load()


docs = load_docs(dir)
def split_docs(docs, chunk_size=512, chunk_overlap=20):
    """Split ``docs`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split; passed to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents(docs)`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return splitter.split_documents(docs)


# Rebind ``docs`` to the chunked form; everything below works on chunks.
docs = split_docs(docs)
# Initialize embeddings and vector store
persist_directory = "chroma_db"
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the store from the chunked documents and write it to disk.
# NOTE(review): if "chroma_db" already exists from a previous run,
# from_documents may append the same chunks again -- confirm whether reruns
# duplicate entries.
# NOTE(review): persist() is reportedly deprecated in newer Chroma releases --
# confirm against the pinned version.
vectordb = Chroma.from_documents(
    docs,
    embeddings,
    persist_directory=persist_directory,
)
vectordb.persist()

# Re-open the persisted store; get_similar_docs queries this handle.
new_db = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)
def get_similar_docs(query, k=2, score=False):
    """Run a similarity search for ``query`` against ``new_db``.

    Args:
        query: Search input forwarded to the vector store.
        k: Forwarded to the search call as ``k``.
        score: When truthy, use ``similarity_search_with_score``;
            otherwise use ``similarity_search``.

    Returns:
        Whatever the selected ``new_db`` search method returns.
    """
    if score:
        search = new_db.similarity_search_with_score
    else:
        search = new_db.similarity_search
    return search(query, k=k)
# Load LLM model from Hugging Face
# Alternatives that were previously left commented out in this section:
#   "HuggingFaceH4/zephyr-7b-beta", "gpt2" (model + tokenizer), and
#   "bigscience/bloom-1b7" (direct text-generation pipeline).
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama_v1.1")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama_v1.1")

text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=400,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    # NOTE(review): presumably keeps the prompt in the generated text, which
    # the "Helpful Answer:" marker search in get_helpful_answer relies on --
    # confirm against the installed transformers/langchain versions.
    return_full_text=True,
)

# Wrap the pipeline for LangChain and build the "stuff" question-answering chain.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
chain = load_qa_chain(llm, chain_type="stuff")
def get_helpful_answer(text: str) -> str:
    """Return what follows the first "Helpful Answer:" marker in ``text``.

    Args:
        text: String to search for the marker.

    Returns:
        The text after the first occurrence of the marker with surrounding
        whitespace stripped, or an empty string when the marker is absent.
    """
    # partition() splits on the first occurrence; the separator slot is empty
    # when the marker does not appear at all.
    _, marker, remainder = text.partition("Helpful Answer:")
    if not marker:
        return ""
    return remainder.strip()
def get_answer(query: str) -> str:
    """Answer ``query`` from the documents retrieved for it.

    Retrieves documents with ``get_similar_docs`` (default arguments), runs
    the question-answering ``chain`` over them, and keeps only the text after
    the "Helpful Answer:" marker.

    Args:
        query: The question to answer.

    Returns:
        The output of ``get_helpful_answer`` applied to the chain result.
    """
    context_docs = get_similar_docs(query)
    raw_output = chain.run(question=query, input_documents=context_docs)
    return get_helpful_answer(raw_output)