import pickle

import faiss
import gradio as gr
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
|
| MODEL_REPO = "dev6696/edu-llm-llama3" |
|
|
| |
| index_path = hf_hub_download(MODEL_REPO, "faiss_index.bin") |
| chunks_path = hf_hub_download(MODEL_REPO, "chunks_meta.pkl") |
|
|
| index = faiss.read_index(index_path) |
| with open(chunks_path, "rb") as f: |
| store = pickle.load(f) |
| all_chunks = store["chunks"] |
| metadata = store["metadata"] |
|
|
| embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") |
|
|
| def retrieve(query, top_k=3): |
| q_emb = embedder.encode([query], normalize_embeddings=True).astype("float32") |
| scores, indices = index.search(q_emb, top_k) |
| results = [] |
| for score, idx in zip(scores[0], indices[0]): |
| if idx >= 0 and score > 0.3: |
| results.append(f"[{metadata[idx]['source']}]\n{all_chunks[idx]}") |
| return "\n\n---\n\n".join(results) if results else "" |
|
|
| |
| bnb_config = BitsAndBytesConfig( |
| load_in_4bit=True, |
| bnb_4bit_quant_type="nf4", |
| bnb_4bit_compute_dtype=torch.bfloat16, |
| bnb_4bit_use_double_quant=True, |
| ) |
|
|
| tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO) |
| tokenizer.pad_token = tokenizer.eos_token |
|
|
| model = AutoModelForCausalLM.from_pretrained( |
| MODEL_REPO, |
| quantization_config=bnb_config, |
| device_map="auto", |
| torch_dtype=torch.bfloat16, |
| low_cpu_mem_usage=True, |
| ) |
| model.eval() |
|
|
| |
| def answer(query, history): |
| context = retrieve(query) |
| system_msg = "You are an expert educational assistant." |
| if context: |
| system_msg += f"\n\nContext:\n{context}" |
|
|
| prompt = ( |
| f"<|begin_of_text|>" |
| f"<|start_header_id|>system<|end_header_id|>\n{system_msg}\n<|eot_id|>" |
| f"<|start_header_id|>user<|end_header_id|>\n{query}\n<|eot_id|>" |
| f"<|start_header_id|>assistant<|end_header_id|>\n" |
| ) |
| inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to("cuda") |
| with torch.no_grad(): |
| out = model.generate( |
| **inputs, |
| max_new_tokens=512, |
| temperature=0.7, |
| top_p=0.9, |
| do_sample=True, |
| repetition_penalty=1.1, |
| pad_token_id=tokenizer.eos_token_id, |
| ) |
| decoded = tokenizer.decode(out[0], skip_special_tokens=True) |
| return decoded.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip() |
|
|
| |
| with gr.Blocks(theme=gr.themes.Soft(), title="EduLLM") as demo: |
| gr.Markdown("# π EduLLM β AI Educational Assistant") |
| gr.Markdown("Powered by Llama-3.1-1B + QLoRA + RAG") |
| chatbot = gr.ChatInterface( |
| fn=answer, |
| examples=["Explain Newton's second law", "What is photosynthesis?"], |
| cache_examples=False, |
| ) |
|
|
| demo.launch() |