# final_llm / app.py
# Hugging Face Space app — uploaded by dev6696 ("Create app.py", commit f788fb6, verified).
import gradio as gr
import torch, faiss, pickle, numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
MODEL_REPO = "dev6696/edu-llm-llama3"  # Hub repo holding both the fine-tuned model and the RAG assets
# ── Load RAG ───────────────────────────────────────────
# Download the prebuilt FAISS index and the pickled chunk store from the
# model repo (huggingface_hub caches these locally between restarts).
index_path = hf_hub_download(MODEL_REPO, "faiss_index.bin")
chunks_path = hf_hub_download(MODEL_REPO, "chunks_meta.pkl")
index = faiss.read_index(index_path)
# NOTE(review): pickle.load on downloaded data executes arbitrary code if the
# repo is tampered with — acceptable only because the repo is author-owned.
with open(chunks_path, "rb") as f:
    store = pickle.load(f)
all_chunks = store["chunks"]    # text chunks, positionally aligned with the FAISS index
metadata = store["metadata"]    # per-chunk info; retrieve() reads metadata[idx]['source']
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # query encoder
def retrieve(query, top_k=3, min_score=0.3):
    """Return up to *top_k* relevant chunks for *query*, joined by separators.

    Args:
        query: User question to embed and search against the FAISS index.
        top_k: Maximum number of chunks to retrieve.
        min_score: Minimum similarity score for a hit to be kept.  Embeddings
            are L2-normalized, so with an inner-product index this is cosine
            similarity — TODO confirm the index metric matches.

    Returns:
        Matching chunks, each prefixed with its source tag, separated by
        "---" dividers; "" when nothing clears the threshold.
    """
    q_emb = embedder.encode([query], normalize_embeddings=True).astype("float32")
    scores, indices = index.search(q_emb, top_k)
    # FAISS pads short result sets with index -1; drop those and weak matches.
    results = [
        f"[{metadata[idx]['source']}]\n{all_chunks[idx]}"
        for score, idx in zip(scores[0], indices[0])
        if idx >= 0 and score > min_score
    ]
    return "\n\n---\n\n".join(results) if results else ""
# ── Load Model ─────────────────────────────────────────
# 4-bit NF4 quantization (QLoRA-style) so the model fits in limited VRAM;
# compute runs in bfloat16 on top of the quantized weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # second quantization pass saves a bit more memory
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
# Llama tokenizers ship without a pad token; reuse EOS so padding is defined.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    quantization_config=bnb_config,
    device_map="auto",          # let accelerate place layers on available devices
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
model.eval()  # inference only — disables dropout etc.
# ── Inference ──────────────────────────────────────────
def answer(query, history):
    """Chat callback for gr.ChatInterface: answer *query* with RAG context.

    Args:
        query: The user's latest message.
        history: Prior chat turns supplied by Gradio (unused here — each
            turn is answered independently).

    Returns:
        The model's generated answer as plain text.
    """
    context = retrieve(query)
    system_msg = "You are an expert educational assistant."
    if context:
        system_msg += f"\n\nContext:\n{context}"
    # Llama-3 chat template, assembled manually.
    prompt = (
        f"<|begin_of_text|>"
        f"<|start_header_id|>system<|end_header_id|>\n{system_msg}\n<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n{query}\n<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n"
    )
    # Follow the model's actual placement (device_map="auto") instead of
    # hard-coding "cuda", so CPU-only Spaces still work.
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=2048
    ).to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )
    # BUG FIX: the original decoded with skip_special_tokens=True and then
    # split on "<|start_header_id|>assistant<|end_header_id|>" — but those
    # header markers are special tokens and were stripped by the decode, so
    # the split never matched and the whole prompt (system message, retrieved
    # context, user text) leaked into the reply.  Decode only the tokens
    # generated after the prompt instead.
    prompt_len = inputs["input_ids"].shape[-1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
# ── Gradio UI ──────────────────────────────────────────
# Build the chat UI; ChatInterface wires the answer() callback to a chatbot
# widget with example prompts.  (Runtime strings kept byte-identical.)
with gr.Blocks(theme=gr.themes.Soft(), title="EduLLM") as demo:
    gr.Markdown("# πŸ“š EduLLM β€” AI Educational Assistant")
    gr.Markdown("Powered by Llama-3.1-1B + QLoRA + RAG")
    chatbot = gr.ChatInterface(
        fn=answer,
        examples=["Explain Newton's second law", "What is photosynthesis?"],
        cache_examples=False,  # examples hit the model; don't pre-run them at startup
    )
# Start the Gradio server (Spaces picks up host/port automatically).
demo.launch()