| | import librosa
|
| | import librosa.display
|
| | import numpy as np
|
| | import matplotlib.pyplot as plt
|
| | import io
|
| | from PIL import Image
|
| |
|
| |
|
# Sample rate (Hz) that all audio is resampled to before feature extraction.
SR = 16000

# STFT window size and hop, in samples, used for the mel spectrogram.
N_FFT = 1024
HOP_LENGTH = 512

# Number of mel bands in each spectrogram.
N_MELS = 128

# Length of every chunk, in seconds and in samples at ``SR``.
TARGET_DURATION = 5.0
# round() rather than int(): int() truncates, so a duration that is not
# exactly representable in binary floating point could end up one sample short.
TARGET_LENGTH = int(round(TARGET_DURATION * SR))
|
| |
|
def _render_mel_image(chunk):
    """Render one fixed-length audio chunk as an RGBA mel-spectrogram image."""
    S = librosa.feature.melspectrogram(
        y=chunk, sr=SR, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS
    )
    # dB scale relative to the chunk's own maximum power.
    S_dB = librosa.power_to_db(S, ref=np.max)

    # Draw on an explicit figure/axes instead of pyplot's implicit "current
    # figure", so rendering cannot be affected by other figures that happen
    # to be open in the process.
    fig, ax = plt.subplots(figsize=(3, 3))
    try:
        librosa.display.specshow(
            S_dB, sr=SR, hop_length=HOP_LENGTH, cmap="magma", ax=ax
        )
        ax.axis("off")

        with io.BytesIO() as buf:
            fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
            buf.seek(0)
            # convert() forces the pixel data to be decoded and returns a new
            # image, so the result stays valid after the buffer is closed.
            return Image.open(buf).convert("RGBA")
    finally:
        # Always release the figure, even if plotting or saving fails;
        # otherwise pyplot keeps it alive and memory grows per chunk.
        plt.close(fig)


def preprocess_audio(file_path):
    """Turn an audio file into a list of mel-spectrogram images.

    The file is loaded as mono at its native sample rate, peak-normalised to
    0.99, resampled to ``SR`` if necessary, and cut into consecutive
    non-overlapping chunks of ``TARGET_LENGTH`` samples. A final chunk that is
    shorter than ``TARGET_LENGTH`` is zero-padded at the end. Each chunk is
    rendered as a mel spectrogram (dB scale, "magma" colormap, axes hidden).

    Args:
        file_path: Path of the audio file, passed straight to ``librosa.load``.

    Returns:
        A list of ``PIL.Image.Image`` objects in RGBA mode, one per chunk, in
        chronological order. The list is empty when the file contains no
        samples.
    """
    y, sr = librosa.load(file_path, sr=None, mono=True)

    # A zero-length signal has no peak (``max`` of an empty array raises
    # ValueError) and would yield no chunks anyway.
    if y.size == 0:
        return []

    # Peak-normalise at the native rate; silent input (peak == 0) is left
    # untouched to avoid dividing by zero.
    peak = np.abs(y).max()
    if peak > 0:
        y = y / peak * 0.99

    if sr != SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=SR)

    chunks = []
    for start in range(0, len(y), TARGET_LENGTH):
        chunk = y[start:start + TARGET_LENGTH]
        if len(chunk) < TARGET_LENGTH:
            # Only the last chunk can be short; pad it with trailing zeros so
            # every spectrogram covers the same duration.
            chunk = np.pad(
                chunk, (0, TARGET_LENGTH - len(chunk)), mode="constant"
            )
        chunks.append(_render_mel_image(chunk))

    return chunks