thecollabagepatch committed on
Commit 1aa3245 · 1 Parent(s): 8f10b7a

melodyflow experiment

Files changed (1)
  1. app.py +183 -140
app.py CHANGED
@@ -5,20 +5,57 @@ from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
 import tempfile
 import os
-import logging
 import torch
-from pydub import AudioSegment
-import io
+from gradio_client import Client, handle_file
 import random
+import time
 
 # Check if CUDA is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+# MelodyFlow variation mapping - map your semantic variations to text prompts
+VARIATION_PROMPTS = {
+    'accordion_folk': 'folk accordion melody with traditional folk instruments',
+    'banjo_bluegrass': 'bluegrass banjo with country folk instruments',
+    'piano_classical': 'classical piano with orchestral arrangement',
+    'celtic': 'celtic harp and flute with traditional irish instruments',
+    'strings_quartet': 'string quartet with violin, viola, cello arrangement',
+    'synth_retro': 'retro 80s synthesizer with vintage electronic sounds',
+    'synth_modern': 'modern synthesizer with contemporary electronic production',
+    'synth_edm': 'edm synthesizer with dance electronic beats',
+    'lofi_chill': 'lo-fi chill with relaxed jazz hip-hop elements',
+    'synth_bass': 'heavy bass synthesizer with sub-bass frequencies',
+    'rock_band': 'rock band with electric guitar, bass, and drums',
+    'cinematic_epic': 'cinematic epic orchestral with dramatic strings and brass',
+    'retro_rpg': 'retro rpg chiptune with 8-bit game music elements',
+    'chiptune': '8-bit chiptune with retro video game sounds',
+    'steel_drums': 'steel drums with caribbean tropical percussion',
+    'gamelan_fusion': 'gamelan fusion with indonesian percussion instruments',
+    'music_box': 'music box with delicate mechanical melody',
+    'trap_808': 'trap beats with heavy 808 drums and hi-hats',
+    'lo_fi_drums': 'lo-fi drums with vinyl crackle and jazz samples',
+    'boom_bap': 'boom bap hip-hop with classic drum breaks',
+    'percussion_ensemble': 'percussion ensemble with varied drum instruments',
+    'future_bass': 'future bass with melodic drops and vocal chops',
+    'synthwave_retro': 'synthwave retro with neon 80s aesthetic',
+    'melodic_techno': 'melodic techno with driving beats and emotional melodies',
+    'dubstep_wobble': 'dubstep with heavy wobble bass and electronic drops',
+    'glitch_hop': 'glitch hop with broken beats and digital artifacts',
+    'digital_disruption': 'digital disruption with glitchy electronic effects',
+    'circuit_bent': 'circuit bent with broken electronic hardware sounds',
+    'orchestral_glitch': 'orchestral glitch with classical instruments and digital errors',
+    'vapor_drums': 'vaporwave drums with slowed down nostalgic beats',
+    'industrial_textures': 'industrial textures with harsh mechanical sounds',
+    'jungle_breaks': 'jungle breaks with fast drum and bass rhythms'
+}
+
 def preprocess_audio(waveform):
     waveform_np = waveform.cpu().squeeze().numpy()
     return torch.from_numpy(waveform_np).unsqueeze(0).to(device)
 
-@spaces.GPU(duration=10)
+# ========== MUSICGEN FUNCTIONS (Local ZeroGPU) ==========
+
+@spaces.GPU
 def generate_drum_sample():
     model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
     model.set_generation_params(duration=10)
@@ -28,10 +65,9 @@ def generate_drum_sample():
     filename_with_extension = f'{filename_without_extension}.wav'
 
     audio_write(filename_without_extension, wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
-
     return filename_with_extension
 
-@spaces.GPU(duration=10)
+@spaces.GPU
 def continue_drum_sample(existing_audio_path):
     if existing_audio_path is None:
         return None
@@ -57,7 +93,6 @@ def continue_drum_sample(existing_audio_path):
 
     if output.dim() == 3:
         output = output.squeeze(0)
-
     if output.dim() == 1:
         output = output.unsqueeze(0)
 
@@ -66,10 +101,9 @@ def continue_drum_sample(existing_audio_path):
 
     combined_file_path = f'./continued_jungle_{random.randint(1000, 9999)}.wav'
     torchaudio.save(combined_file_path, combined_audio, sr)
-
    return combined_file_path
 
-@spaces.GPU(duration=120)
+@spaces.GPU
 def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
     if wav_filename is None:
         return None
@@ -104,136 +138,131 @@ def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
 
     return filename_with_extension
 
-@spaces.GPU(duration=120)
-def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
-    if input_audio_path is None:
-        return None
-
-    song, sr = torchaudio.load(input_audio_path)
-    song = song.to(device)
-
-    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
-    model_continue.set_generation_params(
-        use_sampling=True,
-        top_k=250,
-        top_p=0.0,
-        temperature=1.0,
-        duration=output_duration,
-        cfg_coef=3
-    )
-
-    original_audio = AudioSegment.from_mp3(input_audio_path)
-    current_audio = original_audio
-
-    file_paths_for_cleanup = []
-
-    for i in range(1):
-        num_samples = int(prompt_duration * sr)
-        if current_audio.duration_seconds * 1000 < prompt_duration * 1000:
-            raise ValueError("The prompt_duration is longer than the current audio length.")
-
-        start_time = current_audio.duration_seconds * 1000 - prompt_duration * 1000
-        prompt_audio = current_audio[start_time:]
-
-        prompt_bytes = prompt_audio.export(format="wav").read()
-        prompt_waveform, _ = torchaudio.load(io.BytesIO(prompt_bytes))
-        prompt_waveform = prompt_waveform.to(device)
-
-        prompt_waveform = preprocess_audio(prompt_waveform)
-
-        output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
-        output = output.cpu()
-
-        if len(output.size()) > 2:
-            output = output.squeeze()
-
-        filename_without_extension = f'continue_{i}'
-        filename_with_extension = f'{filename_without_extension}.wav'
-        correct_filename_extension = f'{filename_without_extension}.wav.wav'
-
-        audio_write(filename_with_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
-        generated_audio_segment = AudioSegment.from_wav(correct_filename_extension)
-
-        current_audio = current_audio[:start_time] + generated_audio_segment
-
-        file_paths_for_cleanup.append(correct_filename_extension)
-
-    combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
-    current_audio.export(combined_audio_filename, format="mp3")
-
-    for file_path in file_paths_for_cleanup:
-        os.remove(file_path)
-
-    return combined_audio_filename
-
-# Define the expandable sections (keeping your existing content)
-musicgen_micro_blurb = """
-## musicgen_micro
-musicgen micro is an experimental series of models by aaron abebe. they are incredibly fast, and extra insane. this one does goated jungle drums. we're very excited about these.
-[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> aaron's github](https://github.com/aaronabebe/)
-[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musicgen-micro on huggingface](https://huggingface.co/pharoAIsanders420/micro-musicgen-jungle)
-"""
-
-musicgen_blurb = """
-## musicgen
-musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
-[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
-visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
-see also https://youtube.com/@thecollabagepatch
-"""
-
-finetunes_blurb = """
-## fine-tuned models
-the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
-[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
-[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
-"""
-
-fine_tunes_info = """
-## thepatch/vanya_ai_dnb_0.1
-thepatch/vanya_ai_dnb_0.1 was trained by vanya. [vanya's Twitter](https://twitter.com/@veryVANYA) 🔗 - it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)
-
-## thepatch/bleeps-medium
-thepatch/bleeps-medium was trained by kevin and lyra [lyra's Twitter](https://twitter.com/@_lyraaaa_) 🔗 - it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.
-
-## thepatch/budots_remix
-thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.
-
-## thepatch/hoenn_lofi
-thepatch/hoenn_lofi is a large fine-tune by hoenn. [hoenn's Twitter](https://twitter.com/@eschatolocation) 🔗 - this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.
-
-## thepatch/PhonkV2
-thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.
-
-## foureyednymph/musicgen-sza-sos-small
-foureyednymph/musicgen-sza-sos-small was just trained by foureyednymph. We're all about to find out if it does continuations well.
-"""
-
-# Create the Gradio interface with explicit types
+# ========== MELODYFLOW FUNCTIONS (Via Facebook Space) ==========
+
+def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solver="euler", steps=128):
+    """Transform audio using Facebook/MelodyFlow space API"""
+    if audio_path is None:
+        return None, "❌ No audio file provided"
+
+    try:
+        # Initialize client for Facebook MelodyFlow space
+        client = Client("facebook/MelodyFlow")
+
+        # Determine the prompt to use
+        if custom_prompt.strip():
+            prompt_text = custom_prompt.strip()
+            status_msg = f"✅ Transformed with custom prompt: '{prompt_text}'"
+        else:
+            prompt_text = VARIATION_PROMPTS.get(variation, f"transform this audio to {variation} style")
+            status_msg = f"✅ Transformed with {variation} style"
+
+        # Call the MelodyFlow API
+        result = client.predict(
+            model="facebook/melodyflow-t24-30secs",
+            text=prompt_text,
+            solver=solver,
+            steps=steps,
+            target_flowstep=0,  # Default flowstep
+            regularize=False,
+            regularization_strength=0.2,
+            duration=30,  # Max duration
+            melody=handle_file(audio_path),
+            api_name="/predict"
+        )
+
+        # Result is a tuple of 3 audio files (variations)
+        # We'll use the first variation
+        if result and len(result) > 0 and result[0]:
+            # Save the result locally
+            output_filename = f"melodyflow_{variation}_{random.randint(1000, 9999)}.wav"
+
+            # Copy the result file to our local filename
+            import shutil
+            shutil.copy2(result[0], output_filename)
+
+            return output_filename, status_msg
+        else:
+            return None, "❌ MelodyFlow API returned no results"
+
+    except Exception as e:
+        return None, f"❌ MelodyFlow API error: {str(e)}"
+
+# ========== GRADIO INTERFACE ==========
+
+# Create the interface
 with gr.Blocks() as iface:
-    gr.Markdown("# the-micro-slot-machine")
-    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
-    gr.Markdown("this is an even weirder slot machine than the other one. on the left, you get to generate some state of the art lo-fi jungle drums at incredible speed thanks to aaron's new class of model, and if you want you can have it continue its own output. Then, you can either press the generate_music button to use the first 5 seconds as a prompt, or you can re-upload the audio into the continue_music section to have a fine-tune continue from the end of the jungle drum output, however long and insane it is. think of this as a very weird relay race and you're winning.")
+    gr.Markdown("# 🎰 The Mega Slot Machine")
+    gr.Markdown("**Hybrid Multi-Model Pipeline**: MicroMusicGen → MelodyFlow (via API) → MusicGen Fine-tunes")
+    gr.Markdown("*Demonstrating the workflow from our Ableton device in a web interface!*")
 
-    with gr.Accordion("more info", open=False):
-        gr.Markdown(musicgen_micro_blurb)
-        gr.Markdown(musicgen_blurb)
-        gr.Markdown(finetunes_blurb)
-
-    with gr.Accordion("fine-tunes info", open=False):
-        gr.Markdown(fine_tunes_info)
+    with gr.Accordion("How This Works", open=False):
+        gr.Markdown("""
+This demo shows how multiple AI models can work together:
+
+1. **Generate** initial audio with MicroMusicGen (super fast jungle drums)
+2. **Transform** it using MelodyFlow (via Facebook's space API)
+3. **Continue** with MusicGen fine-tunes (trained on specific genres)
+4. **Repeat** the cycle to create infinite musical journeys!
+
+The models run with different PyTorch versions, so we use the Facebook MelodyFlow space via API.
+""")
 
+    # ========== STEP 1: GENERATE ==========
+    gr.Markdown("## 🎵 Step 1: Generate Initial Audio")
+
     with gr.Row():
         with gr.Column():
-            generate_button = gr.Button("Generate Drum Sample")
-            drum_audio = gr.Audio(
-                label="Generated Drum Sample",
-                type="filepath",
-                interactive=True,
-                show_download_button=True
+            generate_button = gr.Button("Generate Jungle Drums", variant="primary", size="lg")
+            continue_drum_button = gr.Button("Continue Drums", size="sm")
+
+            main_audio = gr.Audio(
+                label="🎵 Current Audio (flows through pipeline)",
+                type="filepath",
+                interactive=True,
+                show_download_button=True
+            )
+
+    # ========== STEP 2: TRANSFORM ==========
+    gr.Markdown("## 🎛️ Step 2: Transform with MelodyFlow")
+
+    with gr.Row():
+        with gr.Column(scale=2):
+            transform_variation = gr.Dropdown(
+                label="Transform Style",
+                choices=list(VARIATION_PROMPTS.keys()),
+                value="synth_modern",
+                interactive=True
             )
-            continue_drum_sample_button = gr.Button("Continue Drum Sample")
+
+        with gr.Column(scale=3):
+            transform_prompt = gr.Textbox(
+                label="Custom Prompt (optional)",
+                placeholder="Leave empty to use style above, or enter custom transformation prompt",
+                lines=2
+            )
+
+    with gr.Row():
+        transform_solver = gr.Dropdown(
+            label="Solver",
+            choices=["euler", "midpoint"],
+            value="euler"
+        )
+        transform_steps = gr.Slider(
+            label="Steps",
+            minimum=64,
+            maximum=256,
+            step=32,
+            value=128
+        )
+    transform_button = gr.Button("🎛️ Transform Audio", variant="secondary", size="lg")
+
+    transform_status = gr.Textbox(label="Transform Status", value="Ready to transform", interactive=False)
 
+    # ========== STEP 3: CONTINUE ==========
+    gr.Markdown("## 🎼 Step 3: Continue with MusicGen")
+
+    with gr.Row():
         with gr.Column():
             prompt_duration = gr.Dropdown(
                 label="Prompt Duration (seconds)",
@@ -247,6 +276,8 @@ with gr.Blocks() as iface:
                 step=1,
                 value=20
             )
+
+        with gr.Column():
             musicgen_model = gr.Dropdown(
                 label="MusicGen Model",
                 choices=[
@@ -259,16 +290,28 @@ with gr.Blocks() as iface:
                 ],
                 value="thepatch/vanya_ai_dnb_0.1 (small)"
             )
-            generate_music_button = gr.Button("Generate Music")
-            output_audio = gr.Audio(label="Generated Music", type="filepath")
-            continue_button = gr.Button("Continue Generating Music")
-            continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")
+
+            generate_music_button = gr.Button("🎼 Continue with MusicGen", variant="primary", size="lg")
 
-    # Connecting the components
-    generate_button.click(generate_drum_sample, outputs=[drum_audio])
-    continue_drum_sample_button.click(continue_drum_sample, inputs=[drum_audio], outputs=[drum_audio])
-    generate_music_button.click(generate_music, inputs=[drum_audio, prompt_duration, musicgen_model, output_duration], outputs=[output_audio])
-    continue_button.click(continue_music, inputs=[output_audio, prompt_duration, musicgen_model, output_duration], outputs=continue_output_audio)
+    # ========== EVENT HANDLERS ==========
+
+    # Step 1: Generate
+    generate_button.click(generate_drum_sample, outputs=[main_audio])
+    continue_drum_button.click(continue_drum_sample, inputs=[main_audio], outputs=[main_audio])
+
+    # Step 2: Transform (using Facebook MelodyFlow API)
+    transform_button.click(
+        transform_with_melodyflow_api,
+        inputs=[main_audio, transform_variation, transform_prompt, transform_solver, transform_steps],
+        outputs=[main_audio, transform_status]
+    )
+
+    # Step 3: Continue
+    generate_music_button.click(
+        generate_music,
+        inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
+        outputs=[main_audio]
+    )
 
 if __name__ == "__main__":
     iface.launch()
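The transform step leans entirely on the hosted facebook/MelodyFlow space, so the call can be smoke-tested on its own before it is wired into the UI. A minimal sketch follows, reusing the /predict parameters exactly as they appear in the commit; the input file drums.wav is a hypothetical stand-in, and the keyword names are taken from the diff rather than verified against the space independently.

# Standalone smoke test for the call inside transform_with_melodyflow_api.
from gradio_client import Client, handle_file

client = Client("facebook/MelodyFlow")
result = client.predict(
    model="facebook/melodyflow-t24-30secs",
    text="jungle breaks with fast drum and bass rhythms",  # VARIATION_PROMPTS['jungle_breaks']
    solver="euler",
    steps=128,
    target_flowstep=0,
    regularize=False,
    regularization_strength=0.2,
    duration=30,
    melody=handle_file("drums.wav"),  # hypothetical local input file
    api_name="/predict"
)
print(result[0])  # per the commit's comment, the first of three returned variations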
 
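Both "continue" paths (the jungle-drum continuation and generate_music) build on audiocraft's generate_continuation, which conditions generation on a prompt waveform and extends it to the requested duration. A minimal sketch of that primitive, with an illustrative fine-tune and file name; this is a demonstration under stated assumptions, not code from the commit.

# Clip the last few seconds of a file and let a MusicGen fine-tune extend it.
import torchaudio
from audiocraft.models import MusicGen

model = MusicGen.get_pretrained('thepatch/vanya_ai_dnb_0.1')  # one of the dropdown fine-tunes
model.set_generation_params(duration=20)  # total output length in seconds

wav, sr = torchaudio.load('input.wav')    # hypothetical input file
wav = wav.mean(dim=0, keepdim=True)       # mix down to mono for the model
prompt_seconds = 5
prompt = wav[:, -prompt_seconds * sr:]    # use the tail as the continuation prompt

output = model.generate_continuation(prompt, prompt_sample_rate=sr, progress=True)
torchaudio.save('continued.wav', output.squeeze(0).cpu(), model.sample_rate)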
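Because every handler reads from and writes back to the single main_audio component, the loop described in the "How This Works" accordion can also be driven headlessly by chaining the three functions. A sketch assuming this file is importable as app and that the ZeroGPU-decorated functions are callable in the current environment:

# Headless pass through the generate → transform → continue pipeline.
from app import generate_drum_sample, transform_with_melodyflow_api, generate_music

drums = generate_drum_sample()  # step 1: ten seconds of jungle drums
transformed, status = transform_with_melodyflow_api(drums, 'synth_modern')  # step 2
print(status)
final = generate_music(
    transformed,
    prompt_duration=5,
    musicgen_model="thepatch/vanya_ai_dnb_0.1 (small)",  # dropdown-style value; the model id is split off before " ("
    output_duration=20,
)  # step 3: a fine-tune continues the transformed audio
print(final)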