Spaces:

frascuchon
/

magenta-realtime

Paused

App Files Files Community

frascuchon HF Staff committed on Jul 28, 2025

Commit

af748a3

1 Parent(s): 6a8e8ce

create basic app

Browse files

Files changed (2) hide show

app.py +177 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import concurrent.futures
+import functools
+import typing as tp
+import gradio as gr
+import numpy as np
+from magenta_rt import system, audio as audio_lib
+class AudioFade:
+    """Handles the cross-fade between audio chunks.
+    Args:
+      chunk_size: Number of audio samples per predicted frame (current
+        SpectroStream models produce 25Hz frames corresponding to 1920 audio
+        samples at 48kHz)
+      num_chunks: Number of audio chunks to fade between.
+      stereo: Whether the predicted audio is stereo or mono.
+    """
+    def __init__(self, chunk_size: int, num_chunks: int, stereo: bool):
+        fade_size = chunk_size * num_chunks
+        self.fade_size = fade_size
+        self.num_chunks = num_chunks
+        self.previous_chunk = np.zeros(fade_size)
+        self.ramp = np.sin(np.linspace(0, np.pi / 2, fade_size)) ** 2
+        if stereo:
+            self.previous_chunk = self.previous_chunk[:, np.newaxis]
+            self.ramp = self.ramp[:, np.newaxis]
+    def reset(self):
+        self.previous_chunk = np.zeros_like(self.previous_chunk)
+    def __call__(self, chunk: np.ndarray) -> np.ndarray:
+        chunk[: self.fade_size] *= self.ramp
+        chunk[: self.fade_size] += self.previous_chunk
+        self.previous_chunk = chunk[-self.fade_size:] * np.flip(self.ramp)
+        return chunk[: -self.fade_size]
+class MagentaRTStreamer:
+    """Audio streamer class for our open weights Magenta RT model.
+    This class holds a pretrained Magenta RT model, a cross-fade state, a
+    generation state and an asynchronous executor to handle the embedding of text
+    prompt without interrupting the audio thread.
+    Args:
+      system: A MagentaRTBase instance.
+    """
+    def __init__(self, system: system.MagentaRTBase):
+        super().__init__()
+        self.system = system
+        self.fade = AudioFade(chunk_size=1920, num_chunks=1, stereo=True)
+        self.state = None
+        self.executor = concurrent.futures.ThreadPoolExecutor()
+    @property
+    def warmup(self):
+        return True
+    @functools.cache
+    def embed_style(self, style: str):
+        return self.executor.submit(self.system.embed_style, style)
+    @functools.cache
+    def embed_audio(self, audio: tuple[float]):
+        audio = audio_lib.Waveform(np.asarray(audio), 16000)
+        return self.executor.submit(self.system.embed_style, audio)
+    def get_style_embedding(self, force_wait: bool = False):
+        prompts = [
+            ("syntethizer", 1),
+            ("flamenco guitar", 0.7),
+        ]  # Parameterize with your prompts
+        weighted_embedding = np.zeros((768,), dtype=np.float32)
+        total_weight = 0.0
+        for text_or_audio, weight in prompts:
+            if not weight:
+                continue
+            if isinstance(text_or_audio, np.ndarray):
+                embedding = self.embed_audio(tuple(text_or_audio))
+            else:
+                if not text_or_audio:
+                    continue
+                embedding = self.embed_style(text_or_audio)
+            if force_wait:
+                embedding.result()
+            if embedding.done():
+                weighted_embedding += embedding.result() * weight
+                total_weight += weight
+        if total_weight > 0:
+            weighted_embedding /= total_weight
+        return weighted_embedding
+    def on_stream_start(self):
+        self.get_style_embedding(force_wait=False)
+        self.get_style_embedding(force_wait=True)
+    def reset(self):
+        self.state = None
+        self.fade.reset()
+        self.embed_style.cache_clear()
+    def generate(self):
+        chunk, self.state = self.system.generate_chunk(
+            state=self.state,
+            style=self.get_style_embedding(),
+            seed=None,
+            # **ui_params,
+        )
+        chunk = self.fade(chunk.samples)
+        return chunk
+    def stop(self):
+        self.executor.shutdown(wait=True)
+is_stopped = False
+MRT = system.MagentaRT(tag="large", device="gpu", lazy=False)
+streamer: tp.Union[MagentaRTStreamer, None] = None
+def play():
+    global streamer
+    if streamer is not None:
+        gr.Info("Audio is already playing.")
+        return
+    streamer = MagentaRTStreamer(MRT)
+    streamer.on_stream_start()
+    while not is_stopped:
+        waveform = streamer.generate()
+        yield waveform
+def stop():
+    global is_stopped, streamer
+    if is_stopped is None:
+        gr.Info("No audio is currently playing.")
+    is_stopped = True
+    if streamer is not None:
+        streamer.stop()
+        del streamer
+    gr.Info("Audio playback stopped.")
+with gr.Blocks() as block:
+    gr.Markdown("# Magenta RT Audio Player")
+    with gr.Group():
+        with gr.Row():
+            audio_out = gr.Audio(label="Magenta RT", streaming=True, autoplay=True, loop=False)
+            # text_out = gr.Textbox(label="Output Text", placeholder="Generated text will appear here", lines=2)
+        with gr.Row():
+            play_button = gr.Button("Play", variant="primary")
+            stop_button = gr.Button("Stop", variant="secondary")
+    play_button.click(play, outputs=audio_out)
+    stop_button.click(stop)
+block.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+magenta-rt[gpu] @ git+https://github.com/magenta/magenta-realtime.git@main#egg=magenta_rt
+tf-nightly==2.20.0.dev20250619
+tensorflow-text-nightly==2.20.0.dev20250316
+tf-hub-nightly