Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python3 | |
| """DramaBox — Gradio demo (warm server). | |
| Loads the warm TTSServer once, then handles requests at ~2.5 s each. All | |
| generated audio is invisibly watermarked with Resemble Perth before being | |
| returned to the user. | |
| """ | |
| import logging | |
| import os | |
| import sys | |
| import tempfile | |
| import time | |
| import gradio as gr | |
| from fastapi.responses import HTMLResponse | |
| from fastapi.staticfiles import StaticFiles | |
| from gradio import Server | |
| from gradio.data_classes import FileData | |
| import spaces | |
| # Local src import. | |
| sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) | |
| from inference_server import TTSServer # noqa: E402 | |
| from model_downloader import get_all_paths # noqa: E402 | |
# Configure root logging first so the checkpoint download and model load
# messages below are emitted with timestamps and levels.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

logging.info("Fetching DramaBox checkpoints from HuggingFace (cached after first run)...")
PATHS = get_all_paths()

# Warm load at import time (same approach as IndexTTS-2-Demo on ZeroGPU).
# Per the original author's notes: the `spaces` package patches torch so a
# .to("cuda") during import pins the weights into ZeroGPU shared memory, and
# each @spaces.GPU call then maps them onto the real GPU immediately, making
# the first user request ~2.5 s instead of ~30 s.
logging.info("Loading DramaBox warm server (Gemma + DiT + VAE + Decoder)...")
tts = TTSServer(
    gemma_root=PATHS["gemma_root"],
    full_checkpoint=PATHS["audio_components"],
    checkpoint=PATHS["transformer"],
    dtype=os.getenv("LTX_DTYPE", "bf16"),  # overridable through the environment
    device="cuda",
    bnb_4bit=True,  # unsloth Gemma is pre-quantized
    compile_model=False,  # torch.compile breaks under ZeroGPU's brief GPU windows
)
logging.info("TTSServer ready.")
# ── Example prompts shipped with a matching voice reference ──────────────────
# Files live under assets/voices/ so users can click a row and generate
# without uploading anything.
_VOICES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "voices")

# Each entry is (display label, path to the voice-reference audio, prompt text).
EXAMPLES: list[tuple[str, str, str]] = [
    (
        "Villain monologue",
        os.path.join(_VOICES_DIR, "male_harvey_keitel.mp3"),
        'A shadowy villain speaks with cold menace, "You have entered my domain, mortal." '
        'He chuckles darkly, "Such arrogance will be your undoing." '
        'His voice rises with fury, "Kneel, or be destroyed where you stand!"',
    ),
    (
        "Talk-show host wheeze-laugh",
        os.path.join(_VOICES_DIR, "male_conan.mp3"),
        'A talk show host gasps with shock, "No! You did NOT just say that!" '
        'He bursts into uncontrollable laughter, "Hahaha! Oh my god, oh my god!" '
        'He wheezes, "I cannot, I literally cannot breathe right now!"',
    ),
    (
        "Tender goodnight whisper",
        os.path.join(_VOICES_DIR, "female_shadowheart.wav"),
        'A woman speaks tenderly, "It has been a long day, my love." '
        'She whispers, "Close your eyes. I am right here." '
        'She hums quietly, "Mmmm-mmm. Sleep now."',
    ),
    (
        "Old-school radio anchor",
        os.path.join(_VOICES_DIR, "male_old_movie.wav"),
        'A radio host clears his throat, "Excuse me, pardon that." '
        'He settles into a warm, professional tone, "Good evening everyone, '
        'and welcome back to the show. We have got a wonderful lineup tonight."',
    ),
    (
        "Catgirl uncontrollable giggling",
        os.path.join(_VOICES_DIR, "female_american.wav"),
        'A playful girl already mid-giggle, "Hehehe, oh my gosh you should see your face!" '
        'She gasps for air between giggles, "Oh my, hehe, oh my, I cannot stop!" '
        'She tries to compose herself, "Ahhhhh okay okay okay, I will stop, I promise."',
    ),
    (
        "Hero stammering courage",
        os.path.join(_VOICES_DIR, "male_arnie.mp3"),
        'A young warrior speaks with a trembling voice, "I... I do not know if I can do this." '
        'He takes a shaky breath, "But someone has to try." '
        'His voice steadies with growing fire, "No more running. I WILL fight!"',
    ),
    (
        "Exhausted dad, fraying patience",
        os.path.join(_VOICES_DIR, "male_petergriffin.wav"),
        'An exhausted father speaks with fraying patience, "Sweetie, daddy is asking very nicely." '
        'He sighs deeply, "Ohhhh my goodness." '
        'He puts on an overly cheerful voice, "Hey buddy! Look at the shiny thing!" '
        'Then he laughs helplessly, "Hahaha, I am losing my mind."',
    ),
    (
        "Smug-confident announcer",
        os.path.join(_VOICES_DIR, "male_samuel_j.mp3"),
        'A confident announcer speaks proudly, "And now, the moment you have all been waiting for." '
        'He chuckles knowingly, "Heheh, trust me, this one is going to blow you away."',
    ),
    # ── Long-form examples (~30 s each) ───────────────────────────────────────
    # These pair a richer multi-beat scene with gen_duration = 30 s in the
    # Examples row below so the model is asked for a full half-minute clip.
    (
        "30s • Villain soliloquy",
        os.path.join(_VOICES_DIR, "male_harvey_keitel.mp3"),
        'A shadowy villain stands at the edge of his throne room, gazing into the dark. '
        'He speaks with slow, measured menace, "So, the little hero has come to finish me, has he?" '
        'He chuckles low and humourless, "Hehe, oh how delightfully predictable you mortals are." '
        'His voice hardens into ice, "I have lived ten thousand years. I have seen empires rise and fall like the tide." '
        'He scoffs, "And you think you, with your borrowed sword and your trembling hands, will be the one to end me?" '
        'A long pause. He whispers, almost tenderly, "I will give you a single chance to turn around and walk away." '
        'Then his voice rises with crushing finality, "Choose, child. The door behind you, or the grave at your feet."',
    ),
    (
        "30s • Late-night radio monologue",
        os.path.join(_VOICES_DIR, "male_old_movie.wav"),
        'A radio host clears his throat softly into the microphone in the late hours of the night. '
        'He settles into a warm, smoky tone, "Good evening, dear listeners, and welcome back to the After Hours Hour." '
        'He sighs contentedly, "Mmm, what a night it has been. The rain is tapping at my window like an old friend." '
        'He chuckles softly, "Heheh, you know the kind of friend, the one that always shows up unannounced." '
        'His voice drops, intimate, "I want you to lean back, wherever you are. Pour yourself something warm." '
        'He pauses, breath audible, "Tonight we are going to talk about love, and loss, and the songs that hold us together." '
        'A smile in his voice, "And I have got the perfect record cued up to start us off, so stay right where you are."',
    ),
    (
        "30s • Stand-up wheeze-laugh",
        os.path.join(_VOICES_DIR, "male_conan.mp3"),
        'A talk show host walks out and the crowd is already roaring. He gasps in mock outrage, "No! No no no!" '
        'He bursts into uncontrollable laughter, "Hahahaha, oh my god, oh my god, you cannot do that to me already!" '
        'He wheezes, gasping for air, "I have not even, hahaha, I have not even said hello yet!" '
        'He tries to compose himself, "Okay, okay, just give me a second here, give me a second." '
        'He clears his throat dramatically, "Ahem. Good evening, ladies and gentlemen." '
        'Then he loses it again, "Hahaha! No, sorry, sorry, I just remembered what happened in the green room." '
        'He pants, "Oh man, oh man, this is going to be one of those nights, I can already tell."',
    ),
    (
        "30s • Bedtime story",
        os.path.join(_VOICES_DIR, "female_shadowheart.wav"),
        'A mother sits at the edge of her child\'s bed in the dim glow of a single lamp. '
        'She speaks softly, "Once upon a time, in a kingdom by the sea, there lived a small dragon named Pip." '
        'She lowers her voice playfully, "Now Pip was not like the other dragons. Pip was afraid of fire." '
        'She smiles warmly, "Mmm, can you imagine? A dragon who was afraid of his own breath?" '
        'A gentle pause, "But Pip had something the other dragons did not have. Pip had courage in his heart." '
        'She hums softly, "Mmmmm. And one cold winter night, when the village below ran out of warmth..." '
        'Her voice drops to a whisper, "Pip closed his eyes, took a deep, deep breath, and remembered who he was."',
    ),
    (
        "30s • Sports commentary",
        os.path.join(_VOICES_DIR, "male_samuel_j.mp3"),
        'A sports commentator leans into the microphone with the crowd roaring around him. '
        'He shouts with rising energy, "Oh, this is it! This is the moment we have been waiting for all season!" '
        'He pants between phrases, "She has the ball at midfield, she is dribbling past one, past two!" '
        'A sudden gasp, "Oh my, what a move! Did you see that footwork, ladies and gentlemen?" '
        'His voice climbs, "She is twenty yards out, fifteen yards out, she winds back, and she SHOOTS!" '
        'A massive pause, then, "GOAAAAAAL! What a strike! What an absolute thunderbolt of a goal!" '
        'He laughs in disbelief, "Hahaha! Unbelievable! Forty thousand fans on their feet, and so am I!"',
    ),
]
app = Server()

# Serve static voice files and images. The directory is anchored to this
# file's location (as _VOICES_DIR and homepage() already are) rather than the
# process working directory, so the mount still resolves when the app is
# started from a different directory.
app.mount(
    "/assets",
    StaticFiles(directory=os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")),
    name="assets",
)
# NOTE(review): no route decorator is visible on this handler in this view,
# although HTMLResponse is imported — confirm it is registered on `app`.
async def homepage():
    """Return the text of the ``index.html`` file that sits next to this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(module_dir, "index.html"), encoding="utf-8") as page:
        markup = page.read()
    return markup
# NOTE(review): no decorator is visible on this function in this view, although
# the module imports `spaces` and its comments refer to @spaces.GPU calls —
# confirm the GPU / API registration is still present in the real file.
def generate_audio(
    prompt: str,
    audio_ref: FileData | None,
    cfg: float,
    stg: float,
    dur_mult: float,
    gen_dur: float,
    ref_dur: float,
    seed: int,
) -> FileData:
    """Generate speech for *prompt* with the warm ``tts`` server and return the file.

    Args:
        prompt: Text to synthesize; must contain non-whitespace characters.
        audio_ref: Optional voice reference. Either a dict with a ``"path"``
            key or an object with a ``path`` attribute is accepted. A reference
            whose file does not exist is ignored.
        cfg: Forwarded to ``tts.generate_to_file`` as ``cfg_scale``.
        stg: Forwarded as ``stg_scale``.
        dur_mult: Forwarded as ``duration_multiplier``.
        gen_dur: Forwarded as ``gen_duration`` after conversion to ``float``.
        ref_dur: Forwarded as ``ref_duration`` after conversion to ``float``.
        seed: Forwarded as ``seed`` after conversion to ``int``.

    Returns:
        A ``FileData`` whose ``path`` is the generated ``.wav`` temp file.

    Raises:
        gr.Error: If *prompt* is empty or whitespace-only.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Prompt is empty.")

    started = time.perf_counter()

    # Accept both the dict form and the object form of the uploaded reference.
    ref_path = None
    if audio_ref:
        if isinstance(audio_ref, dict):
            ref_path = audio_ref.get("path")
        else:
            ref_path = getattr(audio_ref, "path", None)
    if ref_path and not os.path.exists(ref_path):
        # Best-effort: fall back to generating without a reference, but say so
        # instead of dropping it silently.
        logging.warning("Voice reference %s not found; generating without it.", ref_path)
        ref_path = None

    # mkstemp creates the file atomically; the deprecated tempfile.mktemp only
    # returns a name and is open to a race with other processes. Only the path
    # is needed here, so the descriptor is closed immediately.
    fd, output = tempfile.mkstemp(suffix=".wav", prefix="dramabox_")
    os.close(fd)

    try:
        tts.generate_to_file(
            prompt=prompt,
            output=output,
            voice_ref=ref_path,
            cfg_scale=cfg,
            stg_scale=stg,
            duration_multiplier=dur_mult,
            seed=int(seed),
            gen_duration=float(gen_dur),
            ref_duration=float(ref_dur),
        )
    except Exception:
        # Do not leave an orphaned temp file behind when generation fails.
        try:
            os.remove(output)
        except OSError:
            pass
        raise

    elapsed = time.perf_counter() - started
    logging.info("Generated in %.2fs -> %s", elapsed, output)
    return FileData(path=output)
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from GRADIO_SERVER_PORT and
    # falls back to 7860 when the variable is unset.
    port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
    app.launch(
        show_error=True,
        server_port=port,
        server_name="0.0.0.0",
    )