Spaces:

justus-tobias
/

Moshi

Paused

App Files Community

justus-tobias committed on Sep 25, 2024

Commit

a0de5e2

1 Parent(s): 8e9a234

cleaned code

Browse files

Files changed (2) hide show

README.md +1 -1
app.py +5 -118

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Moshi
-emoji: 📈
 colorFrom: indigo
 colorTo: gray
 sdk: gradio

 ---
 title: Moshi
+emoji: 💨
 colorFrom: indigo
 colorTo: gray
 sdk: gradio

app.py CHANGED Viewed

@@ -2,79 +2,10 @@ import gradio as gr
 import torch
 from huggingface_hub import hf_hub_download
 from moshi.models import loaders, LMGen
-import tempfile
-import os
-import soundfile as sf
 import numpy as np
-import time
-def process_wav(wav):
-    mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
-    mimi = loaders.get_mimi(mimi_weight, device='cpu')
-    mimi.set_num_codebooks(8)  # up to 32 for mimi, but limited to 8 for moshi.
-    #wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]
-    with torch.no_grad():
-        codes = mimi.encode(wav)  # [B, K = 8, T]
-        # decoded = mimi.decode(codes)
-        # # Supports streaming too.
-        # frame_size = int(mimi.sample_rate / mimi.frame_rate)
-        # all_codes = []
-        # with mimi.streaming(batch_size=1):
-        #     for offset in range(0, wav.shape[-1], frame_size):
-        #         frame = wav[:, :, offset: offset + frame_size]
-        #         codes = mimi.encode(frame)
-        #         assert codes.shape[-1] == 1, codes.shape
-        #         all_codes.append(codes)
-    all_codes = codes
-    mimi.cuda()
-    moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
-    moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
-    lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
-    out_wav_chunks = []
-    # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
-    with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
-        for idx, code in enumerate(all_codes):
-            tokens_out = lm_gen.step(code.cuda())
-            # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
-            if tokens_out is not None:
-                wav_chunk = mimi.decode(tokens_out[:, 1:])
-                out_wav_chunks.append(wav_chunk)
-            print(idx, end='\r')
-    out_wav = torch.cat(out_wav_chunks, dim=-1)
-    return out_wav
-def select_audio_frame(audio_tensor, frame_size, start_index=0):
-    # Ensure the audio tensor is in the correct shape (1, 1, samples)
-    if audio_tensor.dim() != 3 or audio_tensor.size(0) != 1 or audio_tensor.size(1) != 1:
-        raise ValueError("Audio tensor must have shape (1, 1, samples)")
-    # Get the total number of samples
-    total_samples = audio_tensor.size(2)
-    # If i is not provided, use the total number of samples
-    i = total_samples
-    # Calculate the start and end indices
-    start_index = max(0, i - frame_size)
-    end_index = i
-    # Extract the frame
-    frame = audio_tensor[0, 0, start_index:end_index]
-    # If the frame is smaller than the desired size, pad with zeros at the beginning
-    if frame.size(0) < frame_size:
-        frame = torch.nn.functional.pad(frame, (frame_size - frame.size(0), 0))
-    # Reshape to match the original tensor shape
-    return frame.unsqueeze(0).unsqueeze(0)
 def process_wav_new(in_wav):
     """wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]"""
@@ -193,6 +124,10 @@ Monologue” method significantly improves the linguistic quality of generated s
                 - **Demo:** [demo](https://moshi.chat/) """)
     input_audio = gr.Audio(sources="microphone", label="Input Audio")
     output_audio = gr.Audio(label="Processed Audio", streaming=True, autoplay=True)
@@ -221,52 +156,4 @@ Monologue” method significantly improves the linguistic quality of generated s
                 elem_id="citation-button",
                 show_copy_button=True,
             )
-demo.launch(debug=True)
-##########################################################################################################
-##########################################################################################################
-# import gradio as gr
-# import numpy as np
-# import time
-# def process_stream(audio, instream):
-#     if audio is None:
-#         return gr.update(), instream
-#     if instream is None:
-#         ret = audio
-#     else:
-#         print("STREAM RECIEVED")
-#         stream = (audio[0], np.concatenate((instream[1], audio[1])))
-#         # Assuming instream[1] and audio[1] are valid inputs for convert2wav
-#         wav1 = convert2wav(instream[1])
-#         wav2 = convert2wav(audio[1])
-#         # Concatenate along the last dimension (time axis)
-#         combined_wav = torch.cat((wav1, wav2), dim=2)
-#         print("WAV COMBINED")
-#         yield from process_wav_new(combined_wav, stream)
-# with gr.Blocks() as demo:
-#     gr.Markdown("# Moshi Demo")
-#     gr.Markdown(" ")
-#     gr.Markdown("-----------")
-#     inp = gr.Audio(sources="microphone")
-#     out = gr.Audio(autoplay=True)
-#     stream = gr.State()
-#     clear = gr.Button("Clear")
-#     inp.stream(process_stream, [inp, stream], [out, stream])
-#     clear.click(lambda: [None, None, None], None, [inp, out, stream])
-# demo.launch(debug=True)

 import torch
 from huggingface_hub import hf_hub_download
 from moshi.models import loaders, LMGen
 import numpy as np
 def process_wav_new(in_wav):
     """wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]"""
                 - **Demo:** [demo](https://moshi.chat/) """)
+    gr.Markdown("""
+                🚨
+                The Model will produce a lot of silence, because it is actually meant to stream the input and output.
+                I will try to create a demo which works with the streaming.""")
     input_audio = gr.Audio(sources="microphone", label="Input Audio")
     output_audio = gr.Audio(label="Processed Audio", streaming=True, autoplay=True)
                 elem_id="citation-button",
                 show_copy_button=True,
             )
+demo.launch(debug=True)