# app.py (LoRA-only loading)
import gradio as gr
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, pipeline
import torch
import os
import re
import json
import time
from datetime import datetime
from huggingface_hub import model_info

# ===== Settings =====
device = 0 if torch.cuda.is_available() else -1
lora_repo = "rahul7star/Qwen2.5-0.5B-Gita"  # ONLY LoRA fine-tuned repo

log_lines = []

def log(msg):
    line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}"
    print(line)
    log_lines.append(line)

log(f"Loading LoRA-only model from {lora_repo}")
log(f"Device: {'GPU' if device == 0 else 'CPU'}")
# ====== Tokenizer ======
try:
    tokenizer = AutoTokenizer.from_pretrained(lora_repo, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    log(f"✅ Tokenizer loaded: vocab size {tokenizer.vocab_size}")
except Exception as e:
    log(f"❌ Tokenizer load failed: {e}")
    tokenizer = None
# ====== LoRA-only model ======
model = None
pipe = None

try:
    model = AutoModelForCausalLM.from_pretrained(
        lora_repo,
        trust_remote_code=True,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    model.eval()
    log("✅ LoRA-only model loaded successfully")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # When device_map="auto" already placed the model, don't pin the pipeline
        # to a device again; only set it explicitly on CPU-only setups.
        device=device if not torch.cuda.is_available() else None,
    )
    log("✅ Pipeline ready for inference")
except Exception as e:
    log(f"❌ LoRA model load failed: {e}")
# ====== Chat Function ======
def chat_with_model(message, history):
    # Reset the per-request log so the UI only shows entries for this message.
    log_lines.clear()
    log(f"User message: {message}")

    if pipe is None:
        return "", history, "⚠️ Model pipeline not loaded."

    # Rebuild the running conversation as a plain-text prompt.
    context = "The following is a conversation between a user and an AI assistant trained on Bhagavad Gita excerpts.\n"
    for user, bot in history:
        context += f"User: {user}\nAssistant: {bot}\n"
    context += f"User: {message}\nAssistant:"
    log("Built conversation context")
    log(context)

    start_time = time.time()
    try:
        output = pipe(
            context,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )[0]["generated_text"]
        log(f"⏱️ Inference took {time.time() - start_time:.2f}s")
    except Exception as e:
        log(f"❌ Generation failed: {e}")
        return "", history, "\n".join(log_lines)

    # Clean reply: drop the prompt echo, stray markup, and any hallucinated turns.
    reply = output[len(context):].strip()
    reply = re.sub(r"(ContentLoaded|<\/?[^>]+>|[\r\n]{2,})", " ", reply)
    reply = re.sub(r"\s{2,}", " ", reply).strip()
    reply = reply.split("User:")[0].split("Assistant:")[0].strip()
    log(f"Model reply: {reply}")

    history.append((message, reply))
    return "", history, "\n".join(log_lines)
# ===== Gradio =====
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("## 💬 Qwen LoRA-only – Bhagavad Gita Assistant")
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=500)
            msg = gr.Textbox(placeholder="Ask about the Gita...", label="Your Message")
            clear = gr.Button("Clear")
        with gr.Column(scale=1):
            log_box = gr.Textbox(label="Detailed Model Log", lines=25, interactive=False)

    msg.submit(chat_with_model, [msg, chatbot], [msg, chatbot, log_box])
    # Return one value per output component (chatbot, log_box).
    clear.click(lambda: ([], ""), None, [chatbot, log_box], queue=False)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
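# Local usage note (assumes gradio, transformers, torch, and accelerate are
# installed, e.g. from the Space's requirements.txt):
#   python app.py
# then open http://localhost:7860. On Hugging Face Spaces this same file is the
# entry point and is launched automatically.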