thecollabagepatch committed on
Commit 1aa3245 · 1 Parent(s): 8f10b7a

melodyflow experiment

Files changed (1)
  1. app.py +183 -140
app.py CHANGED
@@ -5,20 +5,57 @@ from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
 import tempfile
 import os
-import logging
 import torch
-from pydub import AudioSegment
-import io
+from gradio_client import Client, handle_file
 import random
+import time
 
 # Check if CUDA is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+# MelodyFlow variation mapping - map your semantic variations to text prompts
+VARIATION_PROMPTS = {
+    'accordion_folk': 'folk accordion melody with traditional folk instruments',
+    'banjo_bluegrass': 'bluegrass banjo with country folk instruments',
+    'piano_classical': 'classical piano with orchestral arrangement',
+    'celtic': 'celtic harp and flute with traditional irish instruments',
+    'strings_quartet': 'string quartet with violin, viola, cello arrangement',
+    'synth_retro': 'retro 80s synthesizer with vintage electronic sounds',
+    'synth_modern': 'modern synthesizer with contemporary electronic production',
+    'synth_edm': 'edm synthesizer with dance electronic beats',
+    'lofi_chill': 'lo-fi chill with relaxed jazz hip-hop elements',
+    'synth_bass': 'heavy bass synthesizer with sub-bass frequencies',
+    'rock_band': 'rock band with electric guitar, bass, and drums',
+    'cinematic_epic': 'cinematic epic orchestral with dramatic strings and brass',
+    'retro_rpg': 'retro rpg chiptune with 8-bit game music elements',
+    'chiptune': '8-bit chiptune with retro video game sounds',
+    'steel_drums': 'steel drums with caribbean tropical percussion',
+    'gamelan_fusion': 'gamelan fusion with indonesian percussion instruments',
+    'music_box': 'music box with delicate mechanical melody',
+    'trap_808': 'trap beats with heavy 808 drums and hi-hats',
+    'lo_fi_drums': 'lo-fi drums with vinyl crackle and jazz samples',
+    'boom_bap': 'boom bap hip-hop with classic drum breaks',
+    'percussion_ensemble': 'percussion ensemble with varied drum instruments',
+    'future_bass': 'future bass with melodic drops and vocal chops',
+    'synthwave_retro': 'synthwave retro with neon 80s aesthetic',
+    'melodic_techno': 'melodic techno with driving beats and emotional melodies',
+    'dubstep_wobble': 'dubstep with heavy wobble bass and electronic drops',
+    'glitch_hop': 'glitch hop with broken beats and digital artifacts',
+    'digital_disruption': 'digital disruption with glitchy electronic effects',
+    'circuit_bent': 'circuit bent with broken electronic hardware sounds',
+    'orchestral_glitch': 'orchestral glitch with classical instruments and digital errors',
+    'vapor_drums': 'vaporwave drums with slowed down nostalgic beats',
+    'industrial_textures': 'industrial textures with harsh mechanical sounds',
+    'jungle_breaks': 'jungle breaks with fast drum and bass rhythms'
+}
+
 def preprocess_audio(waveform):
     waveform_np = waveform.cpu().squeeze().numpy()
     return torch.from_numpy(waveform_np).unsqueeze(0).to(device)
 
-@spaces.GPU(duration=10)
+# ========== MUSICGEN FUNCTIONS (Local ZeroGPU) ==========
+
+@spaces.GPU
 def generate_drum_sample():
     model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
     model.set_generation_params(duration=10)
@@ -28,10 +65,9 @@ def generate_drum_sample():
     filename_with_extension = f'{filename_without_extension}.wav'
 
     audio_write(filename_without_extension, wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
-
     return filename_with_extension
 
-@spaces.GPU(duration=10)
+@spaces.GPU
 def continue_drum_sample(existing_audio_path):
     if existing_audio_path is None:
         return None
@@ -57,7 +93,6 @@ def continue_drum_sample(existing_audio_path):
 
     if output.dim() == 3:
         output = output.squeeze(0)
-
     if output.dim() == 1:
         output = output.unsqueeze(0)
 
@@ -66,10 +101,9 @@ def continue_drum_sample(existing_audio_path):
 
     combined_file_path = f'./continued_jungle_{random.randint(1000, 9999)}.wav'
     torchaudio.save(combined_file_path, combined_audio, sr)
-
    return combined_file_path
 
-@spaces.GPU(duration=120)
+@spaces.GPU
 def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
     if wav_filename is None:
         return None
@@ -104,136 +138,131 @@ def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
 
     return filename_with_extension
 
-@spaces.GPU(duration=120)
-def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
-    if input_audio_path is None:
-        return None
-
-    song, sr = torchaudio.load(input_audio_path)
-    song = song.to(device)
-
-    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
-    model_continue.set_generation_params(
-        use_sampling=True,
-        top_k=250,
-        top_p=0.0,
-        temperature=1.0,
-        duration=output_duration,
-        cfg_coef=3
-    )
-
-    original_audio = AudioSegment.from_mp3(input_audio_path)
-    current_audio = original_audio
-
-    file_paths_for_cleanup = []
-
-    for i in range(1):
-        num_samples = int(prompt_duration * sr)
-        if current_audio.duration_seconds * 1000 < prompt_duration * 1000:
-            raise ValueError("The prompt_duration is longer than the current audio length.")
-
-        start_time = current_audio.duration_seconds * 1000 - prompt_duration * 1000
-        prompt_audio = current_audio[start_time:]
-
-        prompt_bytes = prompt_audio.export(format="wav").read()
-        prompt_waveform, _ = torchaudio.load(io.BytesIO(prompt_bytes))
-        prompt_waveform = prompt_waveform.to(device)
-
-        prompt_waveform = preprocess_audio(prompt_waveform)
-
-        output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
-        output = output.cpu()
-
-        if len(output.size()) > 2:
-            output = output.squeeze()
-
-        filename_without_extension = f'continue_{i}'
-        filename_with_extension = f'{filename_without_extension}.wav'
-        correct_filename_extension = f'{filename_without_extension}.wav.wav'
-
-        audio_write(filename_with_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
-        generated_audio_segment = AudioSegment.from_wav(correct_filename_extension)
-
-        current_audio = current_audio[:start_time] + generated_audio_segment
-
-        file_paths_for_cleanup.append(correct_filename_extension)
-
-    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
-    current_audio.export(combined_audio_filename, format="mp3")
-
-    for file_path in file_paths_for_cleanup:
-        os.remove(file_path)
-
-    return combined_audio_filename
-
-# Define the expandable sections (keeping your existing content)
-musicgen_micro_blurb = """
-## musicgen_micro
-musicgen micro is an experimental series of models by aaron abebe. they are incredibly fast, and extra insane. this one does goated jungle drums. we're very excited about these.
-[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> aaron's github](https://github.com/aaronabebe/)
-[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musicgen-micro on huggingface](https://huggingface.co/pharoAIsanders420/micro-musicgen-jungle)
-"""
-
-musicgen_blurb = """
-## musicgen
-musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
-[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
-visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
-see also https://youtube.com/@thecollabagepatch
-"""
-
-finetunes_blurb = """
-## fine-tuned models
-the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
-[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
-[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
-"""
-
-fine_tunes_info = """
-## thepatch/vanya_ai_dnb_0.1
-thepatch/vanya_ai_dnb_0.1 was trained by vanya. [vanya's Twitter](https://twitter.com/@veryVANYA) 🔗 - it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)
-
-## thepatch/bleeps-medium
-thepatch/bleeps-medium was trained by kevin and lyra [lyra's Twitter](https://twitter.com/@_lyraaaa_) 🔗 - it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.
-
-## thepatch/budots_remix
-thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.
-
-## thepatch/hoenn_lofi
-thepatch/hoenn_lofi is a large fine-tune by hoenn. [hoenn's Twitter](https://twitter.com/@eschatolocation) 🔗 - this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.
-
-## thepatch/PhonkV2
-thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.
-
-## foureyednymph/musicgen-sza-sos-small
-foureyednymph/musicgen-sza-sos-small was just trained by foureyednymph. We're all about to find out if it does continuations well.
-"""
-
-# Create the Gradio interface with explicit types
+# ========== MELODYFLOW FUNCTIONS (Via Facebook Space) ==========
+
+def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solver="euler", steps=128):
+    """Transform audio using Facebook/MelodyFlow space API"""
+    if audio_path is None:
+        return None, "❌ No audio file provided"
+
+    try:
+        # Initialize client for Facebook MelodyFlow space
+        client = Client("facebook/MelodyFlow")
+
+        # Determine the prompt to use
+        if custom_prompt.strip():
+            prompt_text = custom_prompt.strip()
+            status_msg = f"✅ Transformed with custom prompt: '{prompt_text}'"
+        else:
+            prompt_text = VARIATION_PROMPTS.get(variation, f"transform this audio to {variation} style")
+            status_msg = f"✅ Transformed with {variation} style"
+
+        # Call the MelodyFlow API
+        result = client.predict(
+            model="facebook/melodyflow-t24-30secs",
+            text=prompt_text,
+            solver=solver,
+            steps=steps,
+            target_flowstep=0,  # Default flowstep
+            regularize=False,
+            regularization_strength=0.2,
+            duration=30,  # Max duration
+            melody=handle_file(audio_path),
+            api_name="/predict"
+        )
+
+        # Result is a tuple of 3 audio files (variations)
+        # We'll use the first variation
+        if result and len(result) > 0 and result[0]:
+            # Save the result locally
+            output_filename = f"melodyflow_{variation}_{random.randint(1000, 9999)}.wav"
+
+            # Copy the result file to our local filename
+            import shutil
+            shutil.copy2(result[0], output_filename)
+
+            return output_filename, status_msg
+        else:
+            return None, "❌ MelodyFlow API returned no results"
+
+    except Exception as e:
+        return None, f"❌ MelodyFlow API error: {str(e)}"
+
+# ========== GRADIO INTERFACE ==========
+
+# Create the interface
 with gr.Blocks() as iface:
-    gr.Markdown("# the-micro-slot-machine")
-    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
-    gr.Markdown("this is an even weirder slot machine than the other one. on the left, you get to generate some state of the art lo-fi jungle drums at incredible speed thanks to aaron's new class of model, and if you want you can have it continue its own output. Then, you can either press the generate_music button to use the first 5 seconds as a prompt, or you can re-upload the audio into the continue_music section to have a fine-tune continue from the end of the jungle drum output, however long and insane it is. think of this as a very weird relay race and you're winning.")
+    gr.Markdown("# 🎰 The Mega Slot Machine")
+    gr.Markdown("**Hybrid Multi-Model Pipeline**: MicroMusicGen → MelodyFlow (via API) → MusicGen Fine-tunes")
+    gr.Markdown("*Demonstrating the workflow from our Ableton device in a web interface!*")
 
-    with gr.Accordion("more info", open=False):
-        gr.Markdown(musicgen_micro_blurb)
-        gr.Markdown(musicgen_blurb)
-        gr.Markdown(finetunes_blurb)
-
-    with gr.Accordion("fine-tunes info", open=False):
-        gr.Markdown(fine_tunes_info)
+    with gr.Accordion("How This Works", open=False):
+        gr.Markdown("""
+This demo shows how multiple AI models can work together:
+
+1. **Generate** initial audio with MicroMusicGen (super fast jungle drums)
+2. **Transform** it using MelodyFlow (via Facebook's space API)
+3. **Continue** with MusicGen fine-tunes (trained on specific genres)
+4. **Repeat** the cycle to create infinite musical journeys!
+
+The models run with different PyTorch versions, so we use the Facebook MelodyFlow space via API.
+""")
 
+    # ========== STEP 1: GENERATE ==========
+    gr.Markdown("## 🎵 Step 1: Generate Initial Audio")
+
     with gr.Row():
         with gr.Column():
-            generate_button = gr.Button("Generate Drum Sample")
-            drum_audio = gr.Audio(
-                label="Generated Drum Sample",
-                type="filepath",
-                interactive=True,
-                show_download_button=True
+            generate_button = gr.Button("Generate Jungle Drums", variant="primary", size="lg")
+            continue_drum_button = gr.Button("Continue Drums", size="sm")
+
+            main_audio = gr.Audio(
+                label="🎵 Current Audio (flows through pipeline)",
+                type="filepath",
+                interactive=True,
+                show_download_button=True
+            )
+
+    # ========== STEP 2: TRANSFORM ==========
+    gr.Markdown("## 🎛️ Step 2: Transform with MelodyFlow")
+
+    with gr.Row():
+        with gr.Column(scale=2):
+            transform_variation = gr.Dropdown(
+                label="Transform Style",
+                choices=list(VARIATION_PROMPTS.keys()),
+                value="synth_modern",
+                interactive=True
             )
-            continue_drum_sample_button = gr.Button("Continue Drum Sample")
+
+        with gr.Column(scale=3):
+            transform_prompt = gr.Textbox(
+                label="Custom Prompt (optional)",
+                placeholder="Leave empty to use style above, or enter custom transformation prompt",
+                lines=2
+            )
+
+    with gr.Row():
+        transform_solver = gr.Dropdown(
+            label="Solver",
+            choices=["euler", "midpoint"],
+            value="euler"
+        )
+        transform_steps = gr.Slider(
+            label="Steps",
+            minimum=64,
+            maximum=256,
+            step=32,
+            value=128
+        )
+    transform_button = gr.Button("🎛️ Transform Audio", variant="secondary", size="lg")
+
+    transform_status = gr.Textbox(label="Transform Status", value="Ready to transform", interactive=False)
 
+    # ========== STEP 3: CONTINUE ==========
+    gr.Markdown("## 🎼 Step 3: Continue with MusicGen")
+
+    with gr.Row():
         with gr.Column():
             prompt_duration = gr.Dropdown(
                 label="Prompt Duration (seconds)",
@@ -247,6 +276,8 @@ with gr.Blocks() as iface:
                 step=1,
                 value=20
             )
+
+        with gr.Column():
             musicgen_model = gr.Dropdown(
                 label="MusicGen Model",
                 choices=[
@@ -259,16 +290,28 @@ with gr.Blocks() as iface:
                 ],
                 value="thepatch/vanya_ai_dnb_0.1 (small)"
             )
-            generate_music_button = gr.Button("Generate Music")
-            output_audio = gr.Audio(label="Generated Music", type="filepath")
-            continue_button = gr.Button("Continue Generating Music")
-            continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")
+
+            generate_music_button = gr.Button("🎼 Continue with MusicGen", variant="primary", size="lg")
 
-    # Connecting the components
-    generate_button.click(generate_drum_sample, outputs=[drum_audio])
-    continue_drum_sample_button.click(continue_drum_sample, inputs=[drum_audio], outputs=[drum_audio])
-    generate_music_button.click(generate_music, inputs=[drum_audio, prompt_duration, musicgen_model, output_duration], outputs=[output_audio])
-    continue_button.click(continue_music, inputs=[output_audio, prompt_duration, musicgen_model, output_duration], outputs=continue_output_audio)
+    # ========== EVENT HANDLERS ==========
+
+    # Step 1: Generate
+    generate_button.click(generate_drum_sample, outputs=[main_audio])
+    continue_drum_button.click(continue_drum_sample, inputs=[main_audio], outputs=[main_audio])
+
+    # Step 2: Transform (using Facebook MelodyFlow API)
+    transform_button.click(
+        transform_with_melodyflow_api,
+        inputs=[main_audio, transform_variation, transform_prompt, transform_solver, transform_steps],
+        outputs=[main_audio, transform_status]
+    )
+
+    # Step 3: Continue
+    generate_music_button.click(
+        generate_music,
+        inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
+        outputs=[main_audio]
+    )
 
 if __name__ == "__main__":
     iface.launch()
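The transform step leans entirely on the hosted facebook/MelodyFlow space, so the call can be smoke-tested on its own before it is wired into the UI. A minimal sketch follows, reusing the /predict parameters exactly as they appear in the commit; the input file drums.wav is a hypothetical stand-in, and the keyword names are taken from the diff rather than verified against the space independently.

# Standalone smoke test for the call inside transform_with_melodyflow_api.
from gradio_client import Client, handle_file

client = Client("facebook/MelodyFlow")
result = client.predict(
    model="facebook/melodyflow-t24-30secs",
    text="jungle breaks with fast drum and bass rhythms",  # VARIATION_PROMPTS['jungle_breaks']
    solver="euler",
    steps=128,
    target_flowstep=0,
    regularize=False,
    regularization_strength=0.2,
    duration=30,
    melody=handle_file("drums.wav"),  # hypothetical local input file
    api_name="/predict"
)
print(result[0])  # per the commit's comment, the first of three returned variations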
 
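Both "continue" paths (the jungle-drum continuation and generate_music) build on audiocraft's generate_continuation, which conditions generation on a prompt waveform and extends it to the requested duration. A minimal sketch of that primitive, with an illustrative fine-tune and file name; this is a demonstration under stated assumptions, not code from the commit.

# Clip the last few seconds of a file and let a MusicGen fine-tune extend it.
import torchaudio
from audiocraft.models import MusicGen

model = MusicGen.get_pretrained('thepatch/vanya_ai_dnb_0.1')  # one of the dropdown fine-tunes
model.set_generation_params(duration=20)  # total output length in seconds

wav, sr = torchaudio.load('input.wav')    # hypothetical input file
wav = wav.mean(dim=0, keepdim=True)       # mix down to mono for the model
prompt_seconds = 5
prompt = wav[:, -prompt_seconds * sr:]    # use the tail as the continuation prompt

output = model.generate_continuation(prompt, prompt_sample_rate=sr, progress=True)
torchaudio.save('continued.wav', output.squeeze(0).cpu(), model.sample_rate)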
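Because every handler reads from and writes back to the single main_audio component, the loop described in the "How This Works" accordion can also be driven headlessly by chaining the three functions. A sketch assuming this file is importable as app and that the ZeroGPU-decorated functions are callable in the current environment:

# Headless pass through the generate → transform → continue pipeline.
from app import generate_drum_sample, transform_with_melodyflow_api, generate_music

drums = generate_drum_sample()  # step 1: ten seconds of jungle drums
transformed, status = transform_with_melodyflow_api(drums, 'synth_modern')  # step 2
print(status)
final = generate_music(
    transformed,
    prompt_duration=5,
    musicgen_model="thepatch/vanya_ai_dnb_0.1 (small)",  # dropdown-style value; the model id is split off before " ("
    output_duration=20,
)  # step 3: a fine-tune continues the transformed audio
print(final)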