Spaces:
Runtime error
# app.py - Corrected for Hugging Face ZeroGPU Spaces
# ---------------------------------------------------------------
# This version is adapted for the ZeroGPU environment by using
# the @spaces.GPU decorator.
# ---------------------------------------------------------------
import os

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces  # 1. Import the spaces library

IS_CUDA = torch.cuda.is_available()
IS_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))
if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True

# ── Configuration ──────────────────────────────────────────────────────────────
MODEL_ID = "Reubencf/gemma3-konkani"
HF_TOKEN = os.getenv("HF_TOKEN", None)
TITLE = "Konkani LLM Fine Tuned on Gemma 3"
DESCRIPTION = (
    "Version 1 of the Konkani LLM.\n"
    "This release may contain inconsistencies, but improvements will follow in future updates."
)

# ── Loading ─────────────────────────────────────────────────────────────────────
print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")

def load_model():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
            device_map="auto",
            token=HF_TOKEN,
            **kwargs,
        )
        print("[Init] Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        # If model loading fails, we can't proceed.
        print(f"[Fatal] Could not load model: {e}")
        raise Exception(f"❌ Model failed to load: {e}")

model, tokenizer = load_model()

DEF_TOKENS = 256
DEF_TEMPERATURE = 0.7
DEF_TOPK = 50
DEF_TOPP = 0.95
DEF_DURATION = 10

def get_duration(message, history=[], system_message="", max_tokens=DEF_TOKENS,
                 temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP,
                 duration=DEF_DURATION):
    return int(duration if duration is not None else DEF_DURATION)

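# get_duration takes the same arguments as generate_response below, so it can be
# passed to spaces.GPU(duration=...): the requested ZeroGPU allocation window then
# follows the "Duration" slider instead of a fixed value.
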
# ── Generation Function ────────────────────────────────────────────────────────
# 2. Decorate the function that needs the GPU
@spaces.GPU(duration=get_duration)
def generate_response(message, history=[], system_message="", max_tokens=DEF_TOKENS,
                      temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP,
                      duration=DEF_DURATION):
| """ | |
| This function is called for each user message. | |
| The @spaces.GPU decorator ensures a GPU is allocated when this runs. | |
| """ | |
| try: | |
| # Format the conversation history | |
| conversation = [] | |
| if system_message: conversation.append({"role": "system", "content": system_message}) | |
| for msg in history: # https://www.gradio.app/docs/gradio/chatbot | |
| if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()): continue | |
| conversation.append({"role": msg["role"], "content": msg["content"]}) | |
| # Add the current user's message | |
| conversation.append({"role": "user", "content": message}) | |
| # Apply the chat template | |
| inputs = tokenizer.apply_chat_template( | |
| conversation, | |
| tokenize=True, | |
| add_generation_prompt=True, | |
| return_tensors="pt", | |
| return_dict=True, | |
| ).to(model.device) | |
| # Generate the response | |
| gen_kwargs = dict( | |
| input_ids=inputs["input_ids"], | |
| attention_mask=inputs["attention_mask"], | |
| max_new_tokens=max_tokens, | |
| do_sample=True, | |
| temperature=temperature, | |
| top_k=top_k, | |
| top_p=top_p, | |
| #eos_token_id=tokenizer.eos_token_id, | |
| #num_beams=1, | |
| output_scores=False, | |
| cache_implementation="static", # https://github.com/huggingface/transformers/issues/38501 | |
| ) | |
| outputs = model.generate(**gen_kwargs) | |
| # Extract only the newly generated text | |
| gen_ids = outputs[0][inputs["input_ids"].shape[-1]:] | |
| new_response = tokenizer.decode(gen_ids, skip_special_tokens=True) | |
| return new_response | |
| except Exception as e: | |
| print(f"Error: {e}") | |
| gr.Warning(f"Error: {e}") | |
| return "" | |
# ── UI ──────────────────────────────────────────────────────────────────────────
examples = [
    ["Translate From English to Devnagri Konkani: what is color?"],
| ["ΰ€ΰ€°ΰ€Ύΰ€ΰ€€ ΰ€΅ΰ€Ώΰ€ΰ₯ΰ€ΰ₯ ΰ€΅ΰ€Ύΰ€ͺΰ€° ΰ€ΰ€£ΰ₯ ΰ€ΰ€°ΰ€ͺΰ€Ύΰ€ΰ₯ ΰ€―ΰ₯ΰ€΅ΰ€ΰ€£ ΰ€€ΰ€―ΰ€Ύΰ€° ΰ€ΰ€°ΰ€ͺ."], | |
]

demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    cache_examples=True,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration"),
    ],
)

# ── Launch ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("🚀 Starting Gradio app for ZeroGPU...")
    demo.queue().launch()
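
For reference, a minimal requirements.txt to pair with this app.py might contain the following (the exact package set is an assumption, not taken from the original Space; gradio itself is supplied by the Space's sdk_version and the spaces package is typically pre-installed on ZeroGPU Spaces):

torch
transformers
accelerate

accelerate is needed because load_model passes device_map="auto"; without it, from_pretrained will complain that Accelerate is required.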