import gradio as gr
import torch
from nemo.collections.speechlm2.models import SALM

# Download and load the model (~1.3 GB checkpoint, so it fits in 16 GB of RAM)
MODEL_NAME = "nvidia/canary-qwen-2.5b"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SALM.from_pretrained(MODEL_NAME).to(device).eval()


def transcribe(audio):
    # Gradio passes the path of the uploaded or recorded audio clip.
    # SALM is prompted with its audio locator tag plus the file path,
    # following the canary-qwen-2.5b usage shown on its model card.
    answer_ids = model.generate(
        prompts=[[{
            "role": "user",
            "content": f"Transcribe the following: {model.audio_locator_tag}",
            "audio": [audio],
        }]],
        max_new_tokens=128,  # cap on the length of the generated transcript
    )
    return model.tokenizer.ids_to_text(answer_ids[0].cpu())


demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload or record audio"),
    outputs=gr.Textbox(label="Transcription"),
)
demo.launch(server_name="0.0.0.0", server_port=7860)
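
# --- Usage sketch (an assumption, not part of the original script) ---
# With the app running, the endpoint can also be queried programmatically
# through gradio_client. Kept as comments because demo.launch() above blocks;
# "sample.wav" and the default "/predict" endpoint name are assumptions here.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://localhost:7860")
#   text = client.predict(handle_file("sample.wav"), api_name="/predict")
#   print(text)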