Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| from typing import Tuple | |
| import gradio as gr | |
| import tempfile | |
| import numpy as np | |
| import soundfile as sf | |
| import librosa | |
| import matplotlib.pyplot as plt | |
| from audio_separator.separator import Separator | |
| from zero import dynGPU | |
| from youtube import youtube | |
| separators = { | |
| "BS-RoFormer": Separator(output_dir=tempfile.gettempdir(), output_format="mp3"), | |
| "Mel-RoFormer": Separator(output_dir=tempfile.gettempdir(), output_format="mp3"), | |
| "HTDemucs-FT": Separator(output_dir=tempfile.gettempdir(), output_format="mp3"), | |
| } | |
| def load(): | |
| separators["BS-RoFormer"].load_model("model_bs_roformer_ep_317_sdr_12.9755.ckpt") | |
| separators["Mel-RoFormer"].load_model( | |
| "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt" | |
| ) | |
| separators["HTDemucs-FT"].load_model("htdemucs_ft.yaml") | |
| # sometimes the network might be down, so we retry a few times | |
| for _ in range(3): | |
| try: | |
| load() | |
| break | |
| except Exception as e: | |
| print(e) | |
| def merge(outs): | |
| print(f"Merging {outs}") | |
| bgm = np.sum(np.array([sf.read(out)[0] for out in outs]), axis=0) | |
| print(f"Merged shape: {bgm.shape}") | |
| tmp_file = os.path.join(tempfile.gettempdir(), f"{outs[0].split('/')[-1]}_merged") | |
| sf.write(tmp_file + ".mp3", bgm, 44100) | |
| return tmp_file + ".mp3" | |
| def measure_duration(audio: str, model: str) -> int: | |
| y, sr = librosa.load(audio, sr=44100) | |
| return int(librosa.get_duration(y=y, sr=sr) / 3.0) | |
| def separate(audio: str, model: str) -> Tuple[str, str]: | |
| separator = separators[model] | |
| outs = separator.separate(audio) | |
| outs = [os.path.join(tempfile.gettempdir(), out) for out in outs] | |
| # roformers | |
| if len(outs) == 2: | |
| return outs[1], outs[0] | |
| # demucs | |
| if len(outs) == 4: | |
| bgm = merge(outs[:3]) | |
| return outs[3], bgm | |
| raise gr.Error("Unknown output format") | |
| def from_youtube(url: str, model: str) -> Tuple[str, str, str]: | |
| audio = youtube(url) | |
| return audio, *separate(audio, model) | |
| def plot_spectrogram(audio: str): | |
| y, sr = librosa.load(audio, sr=44100) | |
| S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128) | |
| S_dB = librosa.power_to_db(S, ref=np.max) | |
| fig = plt.figure(figsize=(15, 5)) | |
| librosa.display.specshow(S_dB, sr=sr, x_axis="time", y_axis="mel") | |
| plt.colorbar(format="%+2.0f dB") | |
| plt.title("Mel-frequency spectrogram") | |
| fig.tight_layout() | |
| return fig | |
| with gr.Blocks() as app: | |
| with open(os.path.join(os.path.dirname(__file__), "README.md"), "r") as f: | |
| README = f.read() | |
| # remove yaml front matter | |
| blocks = README.split("---") | |
| if len(blocks) > 1: | |
| README = "---".join(blocks[2:]) | |
| gr.Markdown(README) | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown("## Upload an audio file") | |
| audio = gr.Audio(label="Upload an audio file", type="filepath") | |
| with gr.Column(): | |
| gr.Markdown( | |
| "## or use a YouTube URL\n\nTry something on [The First Take](https://www.youtube.com/@The_FirstTake)?" | |
| ) | |
| yt = gr.Textbox( | |
| label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..." | |
| ) | |
| yt_btn = gr.Button("Use this YouTube URL") | |
| with gr.Row(): | |
| model = gr.Radio( | |
| label="Select a model", | |
| choices=[s for s in separators.keys()], | |
| value="BS-RoFormer", | |
| ) | |
| btn = gr.Button("Separate", variant="primary") | |
| with gr.Row(): | |
| with gr.Column(): | |
| vocals = gr.Audio( | |
| label="Vocals", format="mp3", type="filepath", interactive=False | |
| ) | |
| with gr.Column(): | |
| bgm = gr.Audio( | |
| label="Background", format="mp3", type="filepath", interactive=False | |
| ) | |
| with gr.Row(): | |
| with gr.Column(): | |
| vocal_spec = gr.Plot(label="Vocal spectrogram") | |
| with gr.Column(): | |
| bgm_spec = gr.Plot(label="Background spectrogram") | |
| gr.Examples( | |
| examples=[ | |
| # I don't have any good examples, please contribute some! | |
| # Suno's generated musix seems to have too many artifacts | |
| ], | |
| inputs=[audio], | |
| ) | |
| gr.Markdown( | |
| """ | |
| - BS-RoFormer: https://arxiv.org/abs/2309.02612 | |
| - Mel-RoFormer: https://arxiv.org/abs/2310.01809 | |
| """ | |
| ) | |
| btn.click( | |
| fn=separate, | |
| inputs=[audio, model], | |
| outputs=[vocals, bgm], | |
| api_name="separate", | |
| ).success( | |
| fn=plot_spectrogram, | |
| inputs=[vocals], | |
| outputs=[vocal_spec], | |
| ).success( | |
| fn=plot_spectrogram, | |
| inputs=[bgm], | |
| outputs=[bgm_spec], | |
| ) | |
| yt_btn.click( | |
| fn=from_youtube, | |
| inputs=[yt, model], | |
| outputs=[audio, vocals, bgm], | |
| ).success( | |
| fn=plot_spectrogram, | |
| inputs=[vocals], | |
| outputs=[vocal_spec], | |
| ).success( | |
| fn=plot_spectrogram, | |
| inputs=[bgm], | |
| outputs=[bgm_spec], | |
| ) | |
| app.launch(show_error=True) | |