| |
| import gradio as gr |
| from playdiffusion import PlayDiffusion, RVCInput |
| import os |
| import wget |
| import torch |
|
|
| |
print("--- Checking and Downloading Model Assets ---")

# Local filename -> download URL for each model asset. Files are stored in the
# current working directory under the key's name.
MODEL_FILES = {
    "kmeans_10k.npy": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/kmeans_10k.npy",
    "last_250k_fixed.pkl": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/last_250k_fixed.pkl",
    "tokenizer-multi_bpe16384_merged_extended_58M.json": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/tokenizer-multi_bpe16384_merged_extended_58M.json",
    "v090_g_01105000": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/v090_g_01105000",
    "voice_encoder_1992000.pt": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/voice_encoder_1992000.pt",
    "xlsr2_1b_v2_custom.pt": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/xlsr2_1b_v2_custom.pt",
}

# Walk the table in declaration order and fetch only the files that are not
# already present, so restarts do not re-download existing assets.
for filename, url in MODEL_FILES.items():
    if os.path.exists(filename):
        print(f"{filename} already exists. Skipping download.")
        continue
    print(f"Downloading {filename}...")
    wget.download(url, filename)
|
|
|
|
| |
| |
| |
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to
# the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"--- Device selected: {device.upper()} ---")
print("Initializing PlayDiffusion... This will load the models into memory.")

# Single module-level PlayDiffusion instance, created once at startup and
# used by speech_rvc for every conversion request.
inpainter = PlayDiffusion(device=device)

print("PlayDiffusion initialized successfully.")
| |
|
|
|
|
def speech_rvc(rvc_source_speech, rvc_target_voice):
    """Run PlayDiffusion voice conversion for one source/target audio pair.

    Args:
        rvc_source_speech: Value of the "Source Speech" audio input (the UI
            component is configured with ``type="filepath"``), or ``None``
            when nothing was provided.
        rvc_target_voice: Value of the "Target Voice" audio input, or
            ``None`` when nothing was provided.

    Returns:
        The result of ``inpainter.rvc`` for the request, passed through
        unchanged.

    Raises:
        gr.Error: If either input is ``None``.
    """
    both_provided = rvc_source_speech is not None and rvc_target_voice is not None
    if not both_provided:
        raise gr.Error("Please provide both a source speech audio and a target voice audio.")

    print("Starting voice conversion...")
    request = RVCInput(source_speech=rvc_source_speech, target_voice=rvc_target_voice)
    result = inpainter.rvc(request)
    print("Voice conversion finished.")
    return result
|
|
|
|
# Build the page: a header, two audio inputs side by side, a run button and
# an audio output wired to speech_rvc.
with gr.Blocks(theme=gr.themes.Soft(), title="PlayDiffusion Voice Conversion") as demo:
    gr.Markdown("# 🗣️ PlayDiffusion Voice Conversion")
    gr.Markdown(f"### Running on: **{device.upper()}**")
    gr.Markdown("Upload a **Source Speech** audio and a **Target Voice** audio to convert the speech.")
    if device == 'cpu':
        # gr.Warning only produces a browser alert when raised from inside an
        # event handler; called during layout construction it never reaches
        # the page. Render the notice as page content so CPU users see it.
        gr.Markdown(
            "> **Warning:** Running on CPU. The voice conversion process "
            "will be extremely slow and may time out."
        )

    with gr.Row():
        # type="filepath" makes Gradio pass each audio to the handler as a
        # path to a file on disk (or None when the input is empty).
        rvc_source_speech = gr.Audio(label="Source Speech", sources=["upload", "microphone"], type="filepath")
        rvc_target_voice = gr.Audio(label="Target Voice", sources=["upload", "microphone"], type="filepath")

    rvc_submit = gr.Button("Run Voice Conversion", variant="primary")
    gr.Markdown("### Converted Speech Output")
    rvc_output = gr.Audio(label="Result", interactive=False)

    # Event listeners must be registered inside the Blocks context.
    rvc_submit.click(fn=speech_rvc, inputs=[rvc_source_speech, rvc_target_voice], outputs=[rvc_output])


# Start the Gradio server with default settings.
demo.launch()