Spaces: Running on Zero

Upload folder using huggingface_hub

- .gitattributes +1 -0
- .gitignore +12 -0
- LICENSE +201 -0
- README.md +103 -12
- app.py +251 -0
- banner.gif +3 -0
- dia2/__init__.py +20 -0
- dia2/assets.py +65 -0
- dia2/audio/__init__.py +13 -0
- dia2/audio/codec.py +58 -0
- dia2/audio/grid.py +79 -0
- dia2/cli.py +122 -0
- dia2/config.py +180 -0
- dia2/core/__init__.py +10 -0
- dia2/core/cache.py +106 -0
- dia2/core/depformer.py +264 -0
- dia2/core/layers.py +209 -0
- dia2/core/model.py +72 -0
- dia2/core/precision.py +23 -0
- dia2/core/transformer.py +140 -0
- dia2/engine.py +230 -0
- dia2/generation.py +158 -0
- dia2/runtime/__init__.py +7 -0
- dia2/runtime/audio_io.py +69 -0
- dia2/runtime/context.py +132 -0
- dia2/runtime/generator.py +420 -0
- dia2/runtime/guidance.py +38 -0
- dia2/runtime/logger.py +33 -0
- dia2/runtime/sampler.py +37 -0
- dia2/runtime/script_parser.py +69 -0
- dia2/runtime/state_machine.py +170 -0
- dia2/runtime/voice_clone.py +190 -0
- input.txt +1 -0
- pyproject.toml +45 -0
- uv.lock +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+banner.gif filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,12 @@
.venv
_kyutai
__pycache__
*.npz
*.safetensors
*.model
*.DS_Store
*.parquet
*.wav
*.mp3
weights/
*.egg-info/
LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2025 Nari Labs

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
README.md
CHANGED
@@ -1,13 +1,104 @@


<div align="center">
<a href="https://huggingface.co/nari-labs/Dia2-2B"><img src="https://img.shields.io/badge/HF%20Repo-Dia2--2B-orange?style=for-the-badge"></a>
<a href="https://discord.gg/bJq6vjRRKv"><img src="https://img.shields.io/badge/Discord-Join%20Chat-7289DA?logo=discord&style=for-the-badge"></a>
<a href="https://github.com/nari-labs/dia2/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg?style=for-the-badge"></a>
</div>

**Dia2** is a **streaming dialogue TTS model** created by Nari Labs.

The model does not need the entire text before it starts producing audio; it can begin generating as soon as the first few words are given as input. You can also condition the output on audio, enabling natural conversations in real time.

We provide model checkpoints (1B, 2B) and inference code to accelerate research. The model supports up to 2 minutes of generation, in English only.

⚠️ Quality and voices vary per generation, as the model is not fine-tuned on a specific voice. Use a prefix or fine-tune the model to obtain stable output.

## Upcoming

- Bonsai (JAX) implementation
- Dia2 TTS Server: real streaming support
- Sori: a Dia2-powered speech-to-speech engine written in Rust

## Quickstart

> **Requirement** — install [uv](https://docs.astral.sh/uv/) and use CUDA 12.8+
> drivers. All commands below run through `uv run …` as a rule.

1. **Install dependencies (one-time):**
   ```bash
   uv sync
   ```
2. **Prepare a script:** edit `input.txt` using `[S1]` / `[S2]` speaker tags, as in the example below.
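   For illustration only (the wording is made up; what matters is the `[S1]` / `[S2]` tag format), a short `input.txt` could contain:
   ```text
   [S1] Welcome back to the show. [S2] Thanks, it's great to be here. [S1] Let's get started.
   ```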
+
3. **Generate audio:**
|
| 35 |
+
```bash
|
| 36 |
+
uv run -m dia2.cli \
|
| 37 |
+
--hf nari-labs/Dia2-2B \
|
| 38 |
+
--input input.txt \
|
| 39 |
+
--cfg 6.0 --temperature 0.8 \
|
| 40 |
+
--cuda-graph --verbose \
|
| 41 |
+
output.wav
|
| 42 |
+
```
|
| 43 |
+
The first run downloads weights/tokenizer/Mimi. The CLI auto-selects CUDA when available (otherwise CPU) and defaults to bfloat16 precision—override with `--device` / `--dtype` if needed.
|
| 44 |
+
4. **Conditional Generation (recommended for stable use):**
|
| 45 |
+
```bash
|
| 46 |
+
uv run -m dia2.cli \
|
| 47 |
+
--hf nari-labs/Dia2-2B \
|
| 48 |
+
--input input.txt \
|
| 49 |
+
--prefix-speaker-1 example_prefix1.wav \
|
| 50 |
+
--prefix-speaker-2 example_prefix2.wav \
|
| 51 |
+
--cuda-graph --verbose \
|
| 52 |
+
output_conditioned.wav
|
| 53 |
+
```
|
| 54 |
+
Condition the generation on previous conversational context in order to generate natural output for your speech-to-speech system. For example, place the voice of your assistant as prefix speaker 1, place user's audio input as prefix speaker 2, and generate the response to user's input.
|
| 55 |
|
| 56 |
+
Whisper is used to transcribe each prefix file, which takes additional time. We include example prefix files as `example_prefix1.wav` and `example_prefix2.wav` (both files are output created by the model).
|
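   The same conditioning is available from Python. A minimal sketch (keyword names mirror how `app.py` and `dia2/cli.py` in this upload pass them to `generate`; the wav paths are placeholders):
   ```python
   from dia2 import Dia2, GenerationConfig, SamplingConfig

   dia = Dia2.from_repo("nari-labs/Dia2-2B", device="cuda", dtype="bfloat16")
   config = GenerationConfig(
       cfg_scale=6.0,
       audio=SamplingConfig(temperature=0.8, top_k=50),
       use_cuda_graph=True,
   )
   # prefix_speaker_1/2 and include_prefix are forwarded as keyword arguments,
   # exactly as the Gradio app and CLI in this repository do.
   result = dia.generate(
       "[S1] Here is the assistant's reply to the user's last turn.",
       config=config,
       prefix_speaker_1="assistant_prefix.wav",  # placeholder path
       prefix_speaker_2="user_prefix.wav",       # placeholder path
       include_prefix=False,
       output_wav="response.wav",
   )
   ```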
5. **Gradio app for easy usage:**
   ```bash
   uv run gradio_app.py
   ```

### Programmatic Usage
```python
from dia2 import Dia2, GenerationConfig, SamplingConfig

dia = Dia2.from_repo("nari-labs/Dia2-2B", device="cuda", dtype="bfloat16")
config = GenerationConfig(
    cfg_scale=2.0,
    audio=SamplingConfig(temperature=0.8, top_k=50),
    use_cuda_graph=True,
)
result = dia.generate("[S1] Hello Dia2!", config=config, output_wav="hello.wav", verbose=True)
```
Generation runs until the runtime config's `max_context_steps` (1500 steps, about 2 minutes)
or until EOS is detected. `GenerationResult` includes the audio tokens, the waveform tensor,
and word timestamps relative to Mimi's ~12.5 Hz frame rate.
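As a sanity check on that limit (assuming the ~12.5 Hz frame rate above), 1500 context steps correspond to roughly 1500 / 12.5 = 120 seconds. A small sketch for inspecting the result, using the same attributes `app.py` in this upload reads:
```python
# `result` is the GenerationResult returned by dia.generate(...) above.
for word, seconds in result.timestamps:
    print(f"{seconds:7.3f}s  {word}")
print(f"total audio: {result.waveform.shape[-1] / result.sample_rate:.2f}s")
```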
## Hugging Face

| Variant | Repo |
| --- | --- |
| Dia2-1B | [`nari-labs/Dia2-1B`](https://huggingface.co/nari-labs/Dia2-1B) |
| Dia2-2B | [`nari-labs/Dia2-2B`](https://huggingface.co/nari-labs/Dia2-2B) |

## License & Attribution

Licensed under [Apache 2.0](LICENSE). All third-party assets (Kyutai Mimi codec, etc.) retain their original licenses.

## Disclaimer

This project offers a high-fidelity speech generation model intended for research and educational use. The following uses are **strictly forbidden**:

- **Identity Misuse**: Do not produce audio resembling real individuals without permission.
- **Deceptive Content**: Do not use this model to generate misleading content (e.g., fake news).
- **Illegal or Malicious Use**: Do not use this model for activities that are illegal or intended to cause harm.

By using this model, you agree to uphold the relevant legal standards and ethical responsibilities. We **are not responsible** for any misuse and firmly oppose any unethical usage of this technology.

## Acknowledgements

- We thank the [TPU Research Cloud](https://sites.research.google/trc/about/) program for providing compute for training.
- Our work was heavily inspired by [KyutaiTTS](https://kyutai.org/next/tts) and [Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice).

---
Questions? Join our [Discord](https://discord.gg/bJq6vjRRKv) or open an issue.
app.py
ADDED
@@ -0,0 +1,251 @@
from __future__ import annotations

import contextlib
import io
import os
from pathlib import Path
from typing import List, Tuple

import gradio as gr
import torch

from dia2 import Dia2, GenerationConfig, SamplingConfig

DEFAULT_REPO = os.environ.get("DIA2_DEFAULT_REPO", "nari-labs/Dia2-2B")
MAX_TURNS = 10
INITIAL_TURNS = 2

_dia: Dia2 | None = None


def _get_dia() -> Dia2:
    global _dia
    if _dia is None:
        _dia = Dia2.from_repo(DEFAULT_REPO, device="cuda", dtype="bfloat16")
    return _dia


def _concat_script(turn_count: int, turn_values: List[str]) -> str:
    lines: List[str] = []
    for idx in range(min(turn_count, len(turn_values))):
        text = (turn_values[idx] or "").strip()
        if not text:
            continue
        speaker = "[S1]" if idx % 2 == 0 else "[S2]"
        lines.append(f"{speaker} {text}")
    return "\n".join(lines)


EXAMPLES: dict[str, List[str]] = {
    "Intro": [
        "Hello Dia2 fans! Today we're unveiling the new open TTS model.",
        "Sounds exciting. Can you show a sample right now?",
        "Absolutely. (laughs) Just press generate.",
    ],
    "Customer Support": [
        "Thanks for calling. How can I help you today?",
        "My parcel never arrived and it's been two weeks.",
        "I'm sorry about that. Let me check your tracking number.",
        "Appreciate it. I really need that package soon.",
    ],
}


def _apply_turn_visibility(count: int) -> List[gr.Update]:
    return [gr.update(visible=i < count) for i in range(MAX_TURNS)]


def _add_turn(count: int):
    count = min(count + 1, MAX_TURNS)
    return (count, *_apply_turn_visibility(count))


def _remove_turn(count: int):
    count = max(1, count - 1)
    return (count, *_apply_turn_visibility(count))


def _load_example(name: str, count: int):
    data = EXAMPLES.get(name)
    if not data:
        return (count, *_apply_turn_visibility(count))
    new_count = min(len(data), MAX_TURNS)
    updates: List[gr.Update] = []
    for idx in range(MAX_TURNS):
        if idx < new_count:
            updates.append(gr.update(value=data[idx], visible=True))
        else:
            updates.append(gr.update(value="", visible=idx < INITIAL_TURNS))
    return (new_count, *updates)


def _prepare_prefix(file_path: str | None) -> str | None:
    if not file_path:
        return None
    path = Path(file_path)
    if not path.exists():
        return None
    return str(path)


def generate_audio(
    turn_count: int,
    *inputs,
):
    turn_values = list(inputs[:MAX_TURNS])
    voice_s1 = inputs[MAX_TURNS]
    voice_s2 = inputs[MAX_TURNS + 1]
    cfg_scale = float(inputs[MAX_TURNS + 2])
    text_temperature = float(inputs[MAX_TURNS + 3])
    audio_temperature = float(inputs[MAX_TURNS + 4])
    text_top_k = int(inputs[MAX_TURNS + 5])
    audio_top_k = int(inputs[MAX_TURNS + 6])
    include_prefix = bool(inputs[MAX_TURNS + 7])

    script = _concat_script(turn_count, turn_values)
    if not script.strip():
        raise gr.Error("Please enter at least one non-empty speaker turn.")

    dia = _get_dia()
    config = GenerationConfig(
        cfg_scale=cfg_scale,
        text=SamplingConfig(temperature=text_temperature, top_k=text_top_k),
        audio=SamplingConfig(temperature=audio_temperature, top_k=audio_top_k),
        use_cuda_graph=True,
    )
    kwargs = {
        "prefix_speaker_1": _prepare_prefix(voice_s1),
        "prefix_speaker_2": _prepare_prefix(voice_s2),
        "include_prefix": include_prefix,
    }
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        result = dia.generate(
            script,
            config=config,
            output_wav=None,
            verbose=True,
            **kwargs,
        )
    waveform = result.waveform.detach().cpu().numpy()
    sample_rate = result.sample_rate
    timestamps = result.timestamps
    log_text = buffer.getvalue().strip()
    table = [[w, round(t, 3)] for w, t in timestamps]
    return (sample_rate, waveform), table, log_text or "Generation finished."


def build_interface() -> gr.Blocks:
    with gr.Blocks(
        title="Dia2 TTS", css=".compact-turn textarea {min-height: 60px}"
    ) as demo:
        gr.Markdown(
            """## Dia2 — Open TTS Model
Compose dialogue, attach optional voice prompts, and generate audio (CUDA graphs enabled by default)."""
        )
        turn_state = gr.State(INITIAL_TURNS)
        with gr.Row(equal_height=True):
            example_dropdown = gr.Dropdown(
                choices=["(select example)"] + list(EXAMPLES.keys()),
                label="Examples",
                value="(select example)",
            )
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### Script")
                    controls = []
                    for idx in range(MAX_TURNS):
                        speaker = "[S1]" if idx % 2 == 0 else "[S2]"
                        box = gr.Textbox(
                            label=f"{speaker} turn {idx + 1}",
                            lines=2,
                            elem_classes=["compact-turn"],
                            placeholder=f"Enter dialogue for {speaker}…",
                            visible=idx < INITIAL_TURNS,
                        )
                        controls.append(box)
                    with gr.Row():
                        add_btn = gr.Button("Add Turn")
                        remove_btn = gr.Button("Remove Turn")
                with gr.Group():
                    gr.Markdown("### Voice Prompts")
                    with gr.Row():
                        voice_s1 = gr.File(
                            label="[S1] voice (wav/mp3)", type="filepath"
                        )
                        voice_s2 = gr.File(
                            label="[S2] voice (wav/mp3)", type="filepath"
                        )
                with gr.Group():
                    gr.Markdown("### Sampling")
                    cfg_scale = gr.Slider(
                        1.0, 8.0, value=6.0, step=0.1, label="CFG Scale"
                    )
                    with gr.Group():
                        gr.Markdown("#### Text Sampling")
                        text_temperature = gr.Slider(
                            0.1, 1.5, value=0.6, step=0.05, label="Text Temperature"
                        )
                        text_top_k = gr.Slider(
                            1, 200, value=50, step=1, label="Text Top-K"
                        )
                    with gr.Group():
                        gr.Markdown("#### Audio Sampling")
                        audio_temperature = gr.Slider(
                            0.1, 1.5, value=0.8, step=0.05, label="Audio Temperature"
                        )
                        audio_top_k = gr.Slider(
                            1, 200, value=50, step=1, label="Audio Top-K"
                        )
                    include_prefix = gr.Checkbox(
                        label="Keep prefix audio in output", value=False
                    )
                generate_btn = gr.Button("Generate", variant="primary")
            with gr.Column(scale=1):
                gr.Markdown("### Output")
                audio_out = gr.Audio(label="Waveform", interactive=False)
                timestamps = gr.Dataframe(
                    headers=["word", "seconds"], label="Timestamps"
                )
                log_box = gr.Textbox(label="Logs", lines=8)

        add_btn.click(
            lambda c: _add_turn(c),
            inputs=turn_state,
            outputs=[turn_state, *controls],
        )
        remove_btn.click(
            lambda c: _remove_turn(c),
            inputs=turn_state,
            outputs=[turn_state, *controls],
        )
        example_dropdown.change(
            lambda name, c: _load_example(name, c),
            inputs=[example_dropdown, turn_state],
            outputs=[turn_state, *controls],
        )

        generate_btn.click(
            generate_audio,
            inputs=[
                turn_state,
                *controls,
                voice_s1,
                voice_s2,
                cfg_scale,
                text_temperature,
                audio_temperature,
                text_top_k,
                audio_top_k,
                include_prefix,
            ],
            outputs=[audio_out, timestamps, log_box],
        )
    return demo


if __name__ == "__main__":
    app = build_interface()
    app.queue(default_concurrency_limit=1)
    app.launch(share=True)
banner.gif
ADDED
Git LFS Details
dia2/__init__.py
ADDED
@@ -0,0 +1,20 @@
from .config import DiaConfig, load_config
from .core.model import Dia2Model
from .engine import Dia2
from .generation import (
    GenerationConfig,
    GenerationResult,
    PrefixConfig,
    SamplingConfig,
)

__all__ = [
    "DiaConfig",
    "Dia2Model",
    "load_config",
    "GenerationConfig",
    "GenerationResult",
    "PrefixConfig",
    "SamplingConfig",
    "Dia2",
]
dia2/assets.py
ADDED
@@ -0,0 +1,65 @@
from __future__ import annotations

import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

from huggingface_hub import hf_hub_download

ASSET_MANIFEST = os.environ.get("DIA2_ASSET_MANIFEST", "dia2_assets.json")


@dataclass(frozen=True)
class AssetBundle:
    config_path: str
    weights_path: str
    tokenizer_id: Optional[str]
    mimi_id: Optional[str]
    repo_id: Optional[str]


def resolve_assets(
    *,
    repo: Optional[str],
    config_path: Optional[str | Path],
    weights_path: Optional[str | Path],
    manifest_name: Optional[str] = None,
) -> AssetBundle:
    repo_id = repo
    manifest_name = manifest_name or ASSET_MANIFEST
    if repo_id and (config_path or weights_path):
        raise ValueError("Provide either repo or config+weights, not both")
    if config_path is None or weights_path is None:
        if repo_id is None:
            raise ValueError("Must specify repo or config+weights")
        manifest = load_manifest(repo_id, manifest_name)
        config_name = manifest.get("config", "config.json")
        weights_name = manifest.get("weights", "model.safetensors")
        config_local = hf_hub_download(repo_id, config_name)
        weights_local = hf_hub_download(repo_id, weights_name)
        return AssetBundle(
            config_path=config_local,
            weights_path=weights_local,
            tokenizer_id=manifest.get("tokenizer") or repo_id,
            mimi_id=manifest.get("mimi"),
            repo_id=repo_id,
        )
    return AssetBundle(str(config_path), str(weights_path), None, None, repo_id)


def load_manifest(repo_id: str, manifest_name: str) -> dict:
    if not manifest_name:
        return {}
    try:
        path = hf_hub_download(repo_id, manifest_name)
    except Exception:
        return {}
    try:
        return json.loads(Path(path).read_text())
    except json.JSONDecodeError:
        return {}


__all__ = ["AssetBundle", "ASSET_MANIFEST", "resolve_assets", "load_manifest"]
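For orientation, a `dia2_assets.json` manifest that `resolve_assets`/`load_manifest` above would accept might look like the sketch below; the file names and repo ids are illustrative, and only the `config`, `weights`, `tokenizer`, and `mimi` keys are actually read by the code.
```json
{
  "config": "config.json",
  "weights": "model.safetensors",
  "tokenizer": "nari-labs/Dia2-2B",
  "mimi": "kyutai/mimi"
}
```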
dia2/audio/__init__.py
ADDED
@@ -0,0 +1,13 @@
from .codec import MimiCodec, DEFAULT_MIMI_MODEL_ID, MimiConfig
from .grid import delay_frames, undelay_frames, mask_audio_logits, fill_audio_channels, write_wav

__all__ = [
    "MimiCodec",
    "DEFAULT_MIMI_MODEL_ID",
    "MimiConfig",
    "delay_frames",
    "undelay_frames",
    "mask_audio_logits",
    "fill_audio_channels",
    "write_wav",
]
dia2/audio/codec.py
ADDED
@@ -0,0 +1,58 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Optional

import torch
from torch import nn
from transformers import MimiModel


DEFAULT_MIMI_MODEL_ID = "kyutai/mimi"


@dataclass(frozen=True)
class MimiConfig:
    model_id: str = DEFAULT_MIMI_MODEL_ID
    dtype: Optional[torch.dtype] = None


class MimiCodec(nn.Module):
    """Thin wrapper around transformers' MimiModel for decoding audio tokens."""

    def __init__(self, model: MimiModel, device: torch.device) -> None:
        super().__init__()
        self.model = model
        self.device = device
        cfg = getattr(model, "config", None)
        self.sample_rate = getattr(cfg, "sampling_rate", 24000)
        self.frame_rate = getattr(cfg, "frame_rate", 12.5)
        self.samples_per_frame = int(round(self.sample_rate / self.frame_rate)) if self.frame_rate else 0

    @classmethod
    def from_pretrained(
        cls,
        model_id: str = DEFAULT_MIMI_MODEL_ID,
        *,
        device: torch.device,
        dtype: Optional[torch.dtype] = None,
    ) -> "MimiCodec":
        model = MimiModel.from_pretrained(
            model_id,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
        model = model.to(device)
        model.eval()
        return cls(model, device)

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        codes = codes.to(self.device)
        with torch.inference_mode():
            audio, _ = self.model.decode(codes, return_dict=False)
        return torch.clamp(audio, -1.0, 1.0)

    def encode(self, audio: torch.Tensor, *, return_dict: bool = False):
        audio = audio.to(self.device)
        with torch.inference_mode():
            return self.model.encode(audio, return_dict=return_dict)
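A minimal usage sketch for the wrapper above. The codebook count and token values are placeholders, and the (batch, codebooks, frames) token layout is an assumption about how Mimi tokens are arranged; the call pattern itself simply follows the `from_pretrained`/`decode` methods defined in this file.
```python
import torch

from dia2.audio import MimiCodec

codec = MimiCodec.from_pretrained(device=torch.device("cpu"))
# Dummy Mimi tokens: assumed shape (batch, codebooks, frames). Real tokens
# come from codec.encode(waveform) or from Dia2 generation.
codes = torch.zeros(1, 8, 25, dtype=torch.long)
waveform = codec.decode(codes)  # decoded audio, clamped to [-1, 1]
print(waveform.shape, codec.sample_rate, codec.frame_rate)
```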
dia2/audio/grid.py
ADDED
@@ -0,0 +1,79 @@
from __future__ import annotations

from pathlib import Path
from typing import Sequence

import numpy as np
import torch


def delay_frames(aligned: torch.Tensor, delays: Sequence[int], pad_id: int) -> torch.Tensor:
    channels, total = aligned.shape
    max_delay = max(delays) if delays else 0
    out = aligned.new_full((channels, total + max_delay), pad_id)
    for idx, delay in enumerate(delays):
        out[idx, delay : delay + total] = aligned[idx]
    return out


def undelay_frames(delayed: torch.Tensor, delays: Sequence[int], pad_id: int) -> torch.Tensor:
    channels, total = delayed.shape
    max_delay = max(delays) if delays else 0
    target = max(0, total - max_delay)
    out = delayed.new_full((channels, target), pad_id)
    for idx, delay in enumerate(delays):
        out[idx] = delayed[idx, delay : delay + target]
    return out


def mask_audio_logits(logits: torch.Tensor, pad_idx: int, bos_idx: int) -> torch.Tensor:
    if logits.shape[-1] == 0:
        return logits
    max_idx = logits.shape[-1] - 1
    targets = [idx for idx in (pad_idx, bos_idx) if 0 <= idx <= max_idx]
    if not targets:
        return logits
    masked = logits.clone()
    neg_inf = torch.finfo(masked.dtype).min
    for idx in targets:
        masked[..., idx] = neg_inf
    return masked


def fill_audio_channels(
    delays: Sequence[int],
    constants,
    step: int,
    step_tokens: torch.Tensor,
    audio_buf: torch.Tensor,
) -> None:
    for cb, delay in enumerate(delays):
        idx = step - delay
        in_bounds = idx >= 0 and step < audio_buf.shape[-1]
        if in_bounds:
            step_tokens[:, 2 + cb, 0] = audio_buf[:, cb, step]
        else:
            step_tokens[:, 2 + cb, 0] = constants.audio_bos


def write_wav(path: str | Path, audio: np.ndarray, sample_rate: int) -> None:
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    audio = np.clip(audio, -1.0, 1.0)
    pcm16 = (audio * 32767.0).astype(np.int16)
    import wave

    with wave.open(str(path), "wb") as handle:
        handle.setnchannels(1)
        handle.setsampwidth(2)
        handle.setframerate(sample_rate)
        handle.writeframes(pcm16.tobytes())


__all__ = [
    "delay_frames",
    "undelay_frames",
    "mask_audio_logits",
    "fill_audio_channels",
    "write_wav",
]
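To make the delay pattern concrete, here is a small round-trip sketch built only on the `delay_frames`/`undelay_frames` helpers above (the token values and delay list are arbitrary):
```python
import torch

from dia2.audio import delay_frames, undelay_frames

PAD = -1
aligned = torch.arange(12).reshape(3, 4)         # 3 codebooks x 4 frames
delays = [0, 1, 2]                                # per-codebook delay pattern
delayed = delay_frames(aligned, delays, PAD)      # shape (3, 4 + max(delays)); PAD fills the shifted slots
restored = undelay_frames(delayed, delays, PAD)   # strips the delays again
assert torch.equal(restored, aligned)
```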
dia2/cli.py
ADDED
@@ -0,0 +1,122 @@
from __future__ import annotations

import argparse

import torch

from .engine import Dia2
from .generation import (
    build_generation_config,
    load_script_text,
    validate_generation_params,
)


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate audio with Dia2")
    parser.add_argument("--config", help="Path to config.json (overrides repo lookup)")
    parser.add_argument(
        "--weights", help="Path to model.safetensors (overrides repo lookup)"
    )
    parser.add_argument(
        "--hf",
        required=False,
        help="Hugging Face repo id to download config/weights from (e.g. nari-labs/Dia2-2B)",
    )
    parser.add_argument(
        "--input", default="input.txt", help="Script text file (default: input.txt)"
    )
    parser.add_argument("output", help="Output WAV path")
    parser.add_argument(
        "--device",
        default=None,
        help="Computation device (defaults to cuda if available, else cpu)",
    )
    parser.add_argument(
        "--dtype",
        choices=["auto", "float32", "bfloat16"],
        default="bfloat16",
        help="Computation dtype (default: bfloat16)",
    )
    parser.add_argument("--topk", type=int, default=50)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--cfg", type=float, default=1.0)
    parser.add_argument("--tokenizer", help="Tokenizer repo or local path override")
    parser.add_argument(
        "--mimi", help="Mimi repo id override (defaults to config/assets)"
    )
    parser.add_argument("--prefix-speaker-1", help="Prefix audio file for speaker 1")
    parser.add_argument("--prefix-speaker-2", help="Prefix audio file for speaker 2")
    parser.add_argument(
        "--include-prefix",
        action="store_true",
        help="Keep prefix audio in the final waveform (default: trimmed)",
    )
    parser.add_argument(
        "--verbose", action="store_true", help="Print generation progress logs"
    )
    parser.add_argument(
        "--cuda-graph",
        action="store_true",
        help="Run generation with CUDA graph capture",
    )
    args = parser.parse_args()

    device = args.device
    if device is None or device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = args.dtype or "bfloat16"

    repo = args.hf
    if repo:
        dia = Dia2(
            repo=repo,
            device=device,
            dtype=dtype,
            tokenizer_id=args.tokenizer,
            mimi_id=args.mimi,
        )
    elif args.config and args.weights:
        dia = Dia2.from_local(
            config_path=args.config,
            weights_path=args.weights,
            device=device,
            dtype=dtype,
            tokenizer_id=args.tokenizer,
            mimi_id=args.mimi,
        )
    else:
        raise ValueError("Provide --hf/--variant or both --config and --weights")

    script = load_script_text(args.input)
    temperature, top_k, cfg_scale = validate_generation_params(
        temperature=args.temperature,
        top_k=args.topk,
        cfg_scale=args.cfg,
    )
    config = build_generation_config(
        temperature=temperature,
        top_k=top_k,
        cfg_scale=cfg_scale,
    )
    overrides = {}
    if args.cuda_graph:
        overrides["use_cuda_graph"] = True
    if args.prefix_speaker_1:
        overrides["prefix_speaker_1"] = args.prefix_speaker_1
    if args.prefix_speaker_2:
        overrides["prefix_speaker_2"] = args.prefix_speaker_2
    if args.include_prefix:
        overrides["include_prefix"] = True

    dia.generate(
        script,
        config=config,
        output_wav=args.output,
        verbose=args.verbose,
        **overrides,
    )


if __name__ == "__main__":
    main()
dia2/config.py
ADDED
@@ -0,0 +1,180 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional


@dataclass(frozen=True)
class DataConfig:
    channels: int
    text_vocab_size: int
    audio_vocab_size: int
    action_vocab_size: int
    text_pad_token_id: int
    text_new_word_token_id: int
    text_zero_token_id: int
    audio_pad_token_id: int
    audio_bos_token_id: int
    action_pad_token_id: int
    action_new_word_token_id: int
    delay_pattern: List[int]
    first_word_min_start: int
    max_pad: int
    second_stream_ahead: int
    tokenizer_path: Optional[str] = None


@dataclass(frozen=True)
class DecoderConfig:
    n_layer: int
    n_embd: int
    n_hidden: int
    gqa_query_heads: int
    kv_heads: int
    gqa_head_dim: int
    dropout: float
    low_rank_dim: int | None = None


@dataclass(frozen=True)
class DepformerConfig:
    n_layer: int
    n_embd: int
    n_hidden: int
    gqa_query_heads: int
    kv_heads: int
    gqa_head_dim: int
    apply_rope: bool
    text_embedding: bool
    mlp_activations: List[str]


@dataclass(frozen=True)
class LinearHeadConfig:
    mlp_activations: List[str]


@dataclass(frozen=True)
class ModelConfig:
    decoder: DecoderConfig
    depformer: DepformerConfig
    linear: LinearHeadConfig
    dropout: float
    rope_min_timescale: int
    rope_max_timescale: int
    normalization_layer_epsilon: float


@dataclass(frozen=True)
class RuntimeConfig:
    weights_schedule: List[int]
    max_context_steps: int


@dataclass(frozen=True)
class AssetsConfig:
    tokenizer: Optional[str]
    mimi: Optional[str]


@dataclass(frozen=True)
class DiaConfig:
    data: DataConfig
    model: ModelConfig
    runtime: RuntimeConfig
    assets: AssetsConfig


def _resolve_runtime(block: dict | None, data_cfg: DataConfig) -> RuntimeConfig:
    block = block or {}
    weights_schedule = block.get("weights_schedule")
    if weights_schedule is None:
        audio_channels = max(0, data_cfg.channels - 2)
        weights_schedule = list(range(max(audio_channels - 1, 0)))
    max_context = block.get("max_context_steps", 1500)
    return RuntimeConfig(
        weights_schedule=list(weights_schedule),
        max_context_steps=int(max_context),
    )


def load_config(path: str | Path) -> DiaConfig:
    cfg = json.loads(Path(path).read_text())
    data = cfg["data"]
    model = cfg["model"]
    runtime_cfg_raw = cfg.get("runtime")
    if runtime_cfg_raw is None:
        raise ValueError(f"Config '{path}' is missing a runtime block")

    decoder_cfg = DecoderConfig(
        n_layer=model["decoder"]["n_layer"],
        n_embd=model["decoder"]["n_embd"],
        n_hidden=model["decoder"]["n_hidden"],
        gqa_query_heads=model["decoder"]["gqa_query_heads"],
        kv_heads=model["decoder"]["kv_heads"],
        gqa_head_dim=model["decoder"]["gqa_head_dim"],
        dropout=model.get("dropout", 0.0),
        low_rank_dim=model["decoder"].get("low_rank_dim"),
    )

    depformer_cfg = DepformerConfig(
        n_layer=model["depformer"]["n_layer"],
        n_embd=model["depformer"]["n_embd"],
        n_hidden=model["depformer"]["n_hidden"],
        gqa_query_heads=model["depformer"]["gqa_query_heads"],
        kv_heads=model["depformer"]["kv_heads"],
        gqa_head_dim=model["depformer"]["gqa_head_dim"],
        apply_rope=model["depformer"].get("apply_rope", True),
        text_embedding=model["depformer"].get("text_embedding", True),
        mlp_activations=model["depformer"].get("mlp_activations", ["silu", "linear"]),
    )

    data_cfg = DataConfig(
        channels=data["channels"],
        text_vocab_size=data["text_vocab_size"],
        audio_vocab_size=data["audio_vocab_size"],
        action_vocab_size=data["action_vocab_size"],
        text_pad_token_id=data["text_pad_token_id"],
        text_new_word_token_id=data["text_new_word_token_id"],
        text_zero_token_id=data.get("text_zero_token_id", 7),
        audio_pad_token_id=data.get("audio_pad_token_id", data["audio_vocab_size"] - 1),
        audio_bos_token_id=data.get("audio_bos_token_id", data["audio_vocab_size"] - 2),
        action_pad_token_id=data["action_pad_token_id"],
        action_new_word_token_id=data["action_new_word_token_id"],
        delay_pattern=list(data.get("delay_pattern", [])),
        first_word_min_start=data.get("first_word_min_start", 0),
        max_pad=data.get("max_pad", 0),
        second_stream_ahead=data.get("second_stream_ahead", 0),
        tokenizer_path=data.get("tokenizer_path"),
    )

    runtime_cfg = _resolve_runtime(runtime_cfg_raw, data_cfg)

    linear_cfg = LinearHeadConfig(
        mlp_activations=model.get("linear", {}).get("mlp_activations", ["silu", "linear"]),
    )

    model_cfg = ModelConfig(
        decoder=decoder_cfg,
        depformer=depformer_cfg,
        linear=linear_cfg,
        dropout=model.get("dropout", 0.0),
        rope_min_timescale=model.get("rope_min_timescale", 1),
        rope_max_timescale=model.get("rope_max_timescale", 10000),
        normalization_layer_epsilon=model.get("normalization_layer_epsilon", 1e-5),
    )

    assets_raw = cfg.get("assets") or {}
    assets_cfg = AssetsConfig(
        tokenizer=assets_raw.get("tokenizer") or data_cfg.tokenizer_path,
        mimi=assets_raw.get("mimi"),
    )

    return DiaConfig(
        data=data_cfg,
        model=model_cfg,
        runtime=runtime_cfg,
        assets=assets_cfg,
    )
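For orientation, a skeleton of the JSON shape `load_config` above expects is sketched below. Every number is a placeholder rather than the shipped Dia2 configuration; keys not shown fall back to the defaults visible in the loader.
```json
{
  "data": {
    "channels": 10,
    "text_vocab_size": 256,
    "audio_vocab_size": 2050,
    "action_vocab_size": 4,
    "text_pad_token_id": 0,
    "text_new_word_token_id": 1,
    "action_pad_token_id": 0,
    "action_new_word_token_id": 1,
    "delay_pattern": [0, 1, 2, 3, 4, 5, 6, 7]
  },
  "model": {
    "decoder": {"n_layer": 2, "n_embd": 256, "n_hidden": 1024, "gqa_query_heads": 8, "kv_heads": 2, "gqa_head_dim": 32},
    "depformer": {"n_layer": 2, "n_embd": 256, "n_hidden": 1024, "gqa_query_heads": 8, "kv_heads": 2, "gqa_head_dim": 32}
  },
  "runtime": {"max_context_steps": 1500},
  "assets": {"tokenizer": "nari-labs/Dia2-2B", "mimi": "kyutai/mimi"}
}
```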
dia2/core/__init__.py
ADDED
@@ -0,0 +1,10 @@
from .model import Dia2Model, DecodeState
from .transformer import TransformerDecoder
from .depformer import Depformer

__all__ = [
    "Dia2Model",
    "DecodeState",
    "TransformerDecoder",
    "Depformer",
]
dia2/core/cache.py
ADDED
@@ -0,0 +1,106 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import List

import torch


@dataclass
class CacheSlot:
    keys: torch.Tensor
    values: torch.Tensor

    def __post_init__(self) -> None:
        self.max_steps = self.keys.shape[2]
        self.head_dim = self.keys.shape[3]
        self.flat_heads = self.keys.shape[0] * self.keys.shape[1]
        device = self.keys.device
        self.length = torch.zeros((), dtype=torch.long, device=device)
        self.positions = torch.arange(self.max_steps, dtype=torch.long, device=device)

    @classmethod
    def allocate(
        cls,
        *,
        batch_size: int,
        heads: int,
        max_steps: int,
        head_dim: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> "CacheSlot":
        keys = torch.zeros(batch_size, heads, max_steps, head_dim, device=device, dtype=dtype)
        values = torch.zeros_like(keys)
        return cls(keys, values)

    def reset(self) -> None:
        self.length.zero_()

    def write_and_view(
        self,
        key_chunk: torch.Tensor,
        value_chunk: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        step = key_chunk.shape[2]
        start = self.length
        indices = self.positions[:step] + start
        expanded = indices.unsqueeze(0).expand(self.flat_heads, -1)

        flat_keys = self.keys.view(self.flat_heads, self.max_steps, self.head_dim)
        flat_values = self.values.view(self.flat_heads, self.max_steps, self.head_dim)
        flat_key_chunk = key_chunk.reshape(self.flat_heads, step, self.head_dim)
        flat_value_chunk = value_chunk.reshape(self.flat_heads, step, self.head_dim)
        scatter_index = expanded.unsqueeze(-1).expand_as(flat_key_chunk)
        flat_keys.scatter_(1, scatter_index, flat_key_chunk)
        flat_values.scatter_(1, scatter_index, flat_value_chunk)

        self.length.add_(step)
        bool_mask = (self.positions >= self.length).view(1, 1, 1, self.max_steps)
        mask_dtype = self.keys.dtype
        mask_value = torch.finfo(mask_dtype).min
        attn_mask = torch.zeros_like(bool_mask, dtype=mask_dtype)
        attn_mask = attn_mask.masked_fill(bool_mask, mask_value)
        return self.keys, self.values, attn_mask


class KVCache:
    def __init__(self, slots: List[CacheSlot]) -> None:
        self.slots = slots

    @classmethod
    def allocate(
        cls,
        *,
        num_layers: int,
        batch_size: int,
        heads: int,
        max_steps: int,
        head_dim: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> "KVCache":
        slots = [
            CacheSlot.allocate(
                batch_size=batch_size,
                heads=heads,
                max_steps=max_steps,
                head_dim=head_dim,
                device=device,
                dtype=dtype,
            )
            for _ in range(num_layers)
        ]
        return cls(slots)

    def get_slot(self, index: int) -> CacheSlot:
        return self.slots[index]

    def reset(self) -> None:
        for slot in self.slots:
            slot.reset()

    clear = reset


__all__ = ["CacheSlot", "KVCache"]
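A minimal sketch (not part of the upload) of how these pre-allocated slots are driven during step-by-step decoding; the batch size, head count, and dimensions below are illustrative assumptions:

# Illustrative only: allocate a cache for 2 layers and write one decode step.
cache = KVCache.allocate(num_layers=2, batch_size=1, heads=8, max_steps=16,
                         head_dim=64, device=torch.device("cpu"), dtype=torch.float32)
k = torch.randn(1, 8, 1, 64)  # [batch, kv_heads, step=1, head_dim]
v = torch.randn(1, 8, 1, 64)
keys, values, mask = cache.get_slot(0).write_and_view(k, v)  # mask hides unwritten steps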
dia2/core/depformer.py
ADDED
@@ -0,0 +1,264 @@
from __future__ import annotations

from typing import Optional, Tuple

import torch
from torch import nn
import torch.nn.functional as F

from ..config import DiaConfig
from .cache import KVCache
from .layers import MultiStreamEmbedding, Mlp, RotaryEmbedding
from .precision import Precision


class ScheduleAttention(nn.Module):
    """Depformer attention that mirrors dia_v2 ScheduleAttention."""

    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype) -> None:
        super().__init__()
        dep_cfg = config.model.depformer
        runtime = config.runtime
        self.schedule = runtime.weights_schedule
        self.num_query_heads = dep_cfg.gqa_query_heads
        self.num_kv_heads = dep_cfg.kv_heads
        self.head_dim = dep_cfg.gqa_head_dim
        self.num_gqa_groups = self.num_query_heads // max(self.num_kv_heads, 1)
        self.apply_rope = dep_cfg.apply_rope
        self.used_ids = sorted(set(self.schedule))
        self.compute_dtype = compute_dtype

        self.in_proj = nn.ModuleDict(
            {
                str(i): nn.Linear(
                    dep_cfg.n_embd,
                    3 * self.num_query_heads * self.head_dim,
                    bias=False,
                )
                for i in self.used_ids
            }
        )
        self.out_proj = nn.ModuleDict(
            {
                str(i): nn.Linear(
                    self.num_query_heads * self.head_dim,
                    dep_cfg.n_embd,
                    bias=False,
                )
                for i in self.used_ids
            }
        )
        eps = config.model.normalization_layer_epsilon
        self.q_norm = nn.RMSNorm(self.head_dim, eps=eps, dtype=torch.float32)
        self.k_norm = nn.RMSNorm(self.head_dim, eps=eps, dtype=torch.float32)

        if self.apply_rope:
            self.rotary = RotaryEmbedding(
                self.head_dim,
                config.model.rope_min_timescale,
                config.model.rope_max_timescale,
            )
            stage_count = max(len(self.schedule), 1)
            self.register_buffer(
                "stage_positions",
                torch.arange(stage_count, dtype=torch.long).view(stage_count, 1),
                persistent=False,
            )
        else:
            self.rotary = None
            self.register_buffer(
                "stage_positions",
                torch.zeros(0, 1, dtype=torch.long),
                persistent=False,
            )

    def forward_incremental(
        self,
        x_t: torch.Tensor,
        stage_index: int,
        cache_slot,
    ) -> Tuple[torch.Tensor, object]:
        bsz, seq, _ = x_t.shape
        if seq != 1:
            raise ValueError("ScheduleAttention expects seq len 1 during decoding")
        orig_dtype = x_t.dtype
        module_index = self.schedule[stage_index]
        proj = self.in_proj[str(module_index)](x_t.to(torch.float32))
        proj = proj.view(bsz, seq, 3, self.num_query_heads, self.head_dim).to(self.compute_dtype)

        q_proj = self.q_norm(proj[:, :, 0])
        k_proj = self.k_norm(proj[:, :, 1])
        v_proj = proj[:, :, 2]

        if self.apply_rope:
            pos_ids = self.stage_positions[stage_index : stage_index + 1]
            if pos_ids.device != x_t.device:
                pos_ids = pos_ids.to(x_t.device)
            q_proj = self.rotary(q_proj, pos_ids)
            k_proj = self.rotary(k_proj, pos_ids)

        q = q_proj.transpose(1, 2)
        k = k_proj.transpose(1, 2)
        v = v_proj.transpose(1, 2)

        if cache_slot is not None:
            k, v, attn_mask = cache_slot.write_and_view(k, v)
        else:
            attn_mask = None

        attn = F.scaled_dot_product_attention(
            q,
            k,
            v,
            scale=1.0,
            attn_mask=attn_mask,
            enable_gqa=self.num_gqa_groups > 1,
        )
        attn = attn.transpose(1, 2).contiguous()
        flat = attn.reshape(bsz, seq, self.num_query_heads * self.head_dim)
        out = self.out_proj[str(module_index)](flat.to(torch.float32))
        return out.to(orig_dtype), cache_slot


class DepformerLayer(nn.Module):
    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
        super().__init__()
        dep_cfg = config.model.depformer
        eps = config.model.normalization_layer_epsilon
        self.pre_norm = nn.RMSNorm(dep_cfg.n_embd, eps=eps, dtype=torch.float32)
        self.post_norm = nn.RMSNorm(dep_cfg.n_embd, eps=eps, dtype=torch.float32)
        self.self_attention = ScheduleAttention(config, compute_dtype)
        self.mlp = Mlp(
            dep_cfg.n_embd,
            dep_cfg.n_hidden,
            compute_dtype,
            tuple(config.model.depformer.mlp_activations),
        )

    def decode_step(
        self,
        x_t: torch.Tensor,
        stage_index: int,
        cache_slot,
    ) -> Tuple[torch.Tensor, object]:
        residual = x_t
        x_norm = self.pre_norm(x_t)
        sa_out, _ = self.self_attention.forward_incremental(x_norm, stage_index, cache_slot)
        x = residual + sa_out
        residual2 = x
        x_norm2 = self.post_norm(x)
        mlp_out = self.mlp(x_norm2)
        return residual2 + mlp_out, cache_slot


class Depformer(nn.Module):
    def __init__(self, config: DiaConfig, precision: Precision):
        super().__init__()
        self.config = config
        self.precision = precision
        dep_cfg = config.model.depformer
        data_cfg = config.data
        runtime = config.runtime

        self.num_audio_channels = max(0, data_cfg.channels - 2)
        self.num_depth = max(self.num_audio_channels - 1, 0)
        self.weights_schedule = runtime.weights_schedule

        self.audio_embeds = nn.ModuleList(
            [nn.Embedding(data_cfg.audio_vocab_size, dep_cfg.n_embd) for _ in range(self.num_depth)]
        )
        if dep_cfg.text_embedding:
            self.text_embed = MultiStreamEmbedding(
                data_cfg.text_vocab_size,
                dep_cfg.n_embd,
                pad_id=data_cfg.text_pad_token_id,
                output_dtype=precision.compute,
            )
        else:
            self.text_embed = None

        used_ids = sorted(set(self.weights_schedule))
        self.depformer_in = nn.ModuleDict(
            {
                str(i): nn.Linear(
                    config.model.decoder.n_embd,
                    dep_cfg.n_embd,
                    bias=False,
                )
                for i in used_ids
            }
        )

        self.layers = nn.ModuleList([DepformerLayer(config, precision.compute) for _ in range(dep_cfg.n_layer)])
        self.norm = nn.RMSNorm(dep_cfg.n_embd, eps=config.model.normalization_layer_epsilon)
        self.logits_dtype = precision.logits
        self.logits = nn.ModuleList(
            [
                nn.Linear(dep_cfg.n_embd, data_cfg.audio_vocab_size, bias=False)
                for _ in range(self.num_depth)
            ]
        )
        self.audio_vocab_limit = min(data_cfg.audio_pad_token_id, data_cfg.audio_bos_token_id)

    def init_cache(self, batch_size: int, device: torch.device, max_steps: int) -> KVCache:
        heads = self.layers[0].self_attention.num_kv_heads
        head_dim = self.layers[0].self_attention.head_dim
        return KVCache.allocate(
            num_layers=len(self.layers),
            batch_size=batch_size,
            heads=heads,
            max_steps=max_steps,
            head_dim=head_dim,
            device=device,
            dtype=self.precision.compute,
        )

    def forward_step(
        self,
        prev_audio: torch.Tensor,
        transformer_out: torch.Tensor,
        stage_index: int,
        cache: KVCache,
        main_text: Optional[torch.Tensor],
        second_text: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, KVCache]:
        self._validate_inputs(stage_index, cache)
        return self._forward_stage(stage_index, prev_audio, transformer_out, cache, main_text, second_text)

    def _forward_stage(
        self,
        stage_index: int,
        prev_audio: torch.Tensor,
        transformer_out: torch.Tensor,
        cache: KVCache,
        main_text: Optional[torch.Tensor],
        second_text: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, KVCache]:
        prev_audio = prev_audio.long()
        weight_idx = self.weights_schedule[stage_index]
        token_emb = self.audio_embeds[stage_index](prev_audio[:, None]).to(self.precision.compute)
        if stage_index == 0 and self.text_embed is not None:
            if main_text is None or second_text is None:
                raise ValueError("stage 0 requires text tokens")
            token_emb = token_emb + self.text_embed(main_text[:, None], second_text[:, None])

        dep_in = self.depformer_in[str(weight_idx)](transformer_out.to(torch.float32))
        dep_in = dep_in.to(self.precision.compute)
        dep_in = dep_in + token_emb.to(dep_in.dtype)
        x = dep_in
        for idx, layer in enumerate(self.layers):
            slot = cache.get_slot(idx)
            x, _ = layer.decode_step(x, stage_index, slot)

        hidden = self.norm(x)
        logits = self.logits[stage_index](hidden.to(torch.float32))
        logits = logits.to(self.logits_dtype)
        logits = logits.unsqueeze(1)
        logits = logits[..., : self.audio_vocab_limit]
        return logits, cache

    def _validate_inputs(self, stage_index: int, cache: KVCache | None) -> None:
        if stage_index < 0 or stage_index >= self.num_depth:
            raise ValueError(f"stage_index {stage_index} out of range (depth={self.num_depth})")
        if cache is None:
            raise ValueError("depformer cache must be initialized")
dia2/core/layers.py
ADDED
@@ -0,0 +1,209 @@
from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union, List

import torch
from torch import nn
import torch.nn.functional as F


class RotaryEmbedding(nn.Module):
    def __init__(self, head_dim: int, min_timescale: int, max_timescale: int):
        super().__init__()
        if head_dim % 2 != 0:
            raise ValueError("RoPE dimension must be even")
        half_dim = head_dim // 2
        fraction = (2.0 * torch.arange(0, half_dim)) / head_dim
        timescale = min_timescale * (max_timescale / min_timescale) ** fraction
        inv_freq = 1.0 / timescale
        self.register_buffer("inv_freq", inv_freq.to(torch.float32), persistent=False)

    def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor:
        pos = position_ids.to(self.inv_freq.dtype)
        freqs = torch.einsum("...i,j->...ij", pos, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        while emb.dim() < x.dim():
            emb = emb.unsqueeze(-2)
        cos = emb.cos().to(x.dtype)
        sin = emb.sin().to(x.dtype)
        x1, x2 = torch.chunk(x, 2, dim=-1)
        rotated = torch.cat((-x2, x1), dim=-1)
        return (x * cos) + (rotated * sin)


def _rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).reshape_as(x)


def _get_activation(name: str) -> nn.Module:
    name = name.lower()
    if name in ("silu", "swish", "swiglu"):
        return nn.SiLU()
    if name in ("gelu", "geglu"):
        return nn.GELU()
    if name == "relu":
        return nn.ReLU()
    if name == "linear":
        return nn.Identity()
    raise ValueError(f"Unsupported activation {name}")


@dataclass
class AttentionShape:
    dim: int
    heads: int
    kv_heads: int
    head_dim: int
    rope_min: int
    rope_max: int
    apply_rope: bool


class Attention(nn.Module):
    """Byte-for-byte port of dia_v2 Attention.forward_incremental."""

    def __init__(self, config: DiaConfig, dim: int, compute_dtype: torch.dtype) -> None:
        super().__init__()
        dec = config.model.decoder
        self.num_query_heads = dec.gqa_query_heads
        self.num_kv_heads = dec.kv_heads
        self.head_dim = dec.gqa_head_dim
        self.num_gqa_groups = self.num_query_heads // max(self.num_kv_heads, 1)
        self.compute_dtype = compute_dtype
        self.q_proj = nn.Linear(dim, self.num_query_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(dim, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(dim, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_query_heads * self.head_dim, dim, bias=False)
        eps = config.model.normalization_layer_epsilon
        self.q_norm = nn.RMSNorm(self.head_dim, eps=eps, dtype=torch.float32)
        self.k_norm = nn.RMSNorm(self.head_dim, eps=eps, dtype=torch.float32)
        self.rotary = RotaryEmbedding(
            self.head_dim,
            config.model.rope_min_timescale,
            config.model.rope_max_timescale,
        )

    def forward_incremental(
        self,
        x: torch.Tensor,
        pos: Optional[torch.Tensor],
        cache_slot,
    ) -> Tuple[torch.Tensor, object]:
        B, T, _ = x.shape
        if T != 1:
            raise ValueError("Attention expects sequence length 1 during decoding")
        orig_dtype = x.dtype
        q_proj = self._project_heads(self.q_proj, x, self.num_query_heads)
        k_proj = self._project_heads(self.k_proj, x, self.num_kv_heads)
        v_proj = self._project_heads(self.v_proj, x, self.num_kv_heads)
        q_proj = self.q_norm(q_proj)
        k_proj = self.k_norm(k_proj)
        if pos is not None:
            q_proj = self.rotary(q_proj, pos)
            k_proj = self.rotary(k_proj, pos)
        q = q_proj.transpose(1, 2)
        k = k_proj.transpose(1, 2)
        v = v_proj.transpose(1, 2)
        if cache_slot is not None:
            k_cache, v_cache, attn_mask = cache_slot.write_and_view(k, v)
        else:
            k_cache, v_cache = k, v
            attn_mask = None
        attn = F.scaled_dot_product_attention(
            q,
            k_cache,
            v_cache,
            scale=1.0,
            attn_mask=attn_mask,
            enable_gqa=self.num_gqa_groups > 1,
        )
        attn = attn.transpose(1, 2).contiguous()
        flat = attn.reshape(B, T, self.num_query_heads * self.head_dim)
        out = self.o_proj(flat.to(torch.float32))
        return out.to(orig_dtype), cache_slot

    def _project_heads(self, layer: nn.Linear, x: torch.Tensor, heads: int) -> torch.Tensor:
        proj = layer(x.to(torch.float32))
        B, T, _ = proj.shape
        proj = proj.view(B, T, heads, self.head_dim)
        return proj.to(self.compute_dtype)

    def forward(
        self,
        x: torch.Tensor,
        positions: Optional[torch.Tensor],
        cache=None,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        return self.forward_incremental(x, positions, cache)


class MultiStreamEmbedding(nn.Module):
    """Port of dia_v2 MultiStreamEmbed."""

    def __init__(
        self,
        vocab_size: int,
        dim: int,
        pad_id: int,
        *,
        output_dtype: torch.dtype,
        low_rank_dim: Optional[int] = None,
    ) -> None:
        super().__init__()
        self.pad_id = pad_id
        self.dtype = output_dtype
        base_dim = low_rank_dim if low_rank_dim is not None else dim
        self.embedding = nn.Embedding(vocab_size, base_dim)
        self.main_proj = nn.Linear(base_dim, dim, bias=False)
        self.second_proj = nn.Linear(base_dim, dim, bias=False)

    def forward(self, main_inputs: torch.Tensor, second_inputs: torch.Tensor) -> torch.Tensor:
        main_inputs = main_inputs.long()
        second_inputs = second_inputs.long()
        if self.pad_id is not None:
            second_is_pad = second_inputs == self.pad_id
        else:
            second_is_pad = torch.zeros_like(second_inputs, dtype=torch.bool)
        use_second = ~second_is_pad
        emb_main = self.embedding(main_inputs)
        emb_second = self.embedding(second_inputs)
        out_main = self.main_proj(emb_main.to(torch.float32))
        out_second = self.second_proj(emb_second.to(torch.float32))
        zeros = torch.zeros_like(out_second)
        y = out_main + torch.where(use_second.unsqueeze(-1), out_second, zeros)
        target_dtype = self.dtype if self.dtype is not None else y.dtype
        return y.to(target_dtype)


class Mlp(nn.Module):
    """Port of dia_v2 MlpBlock (two-activation gated MLP)."""

    def __init__(
        self,
        dim: int,
        hidden: int,
        compute_dtype: torch.dtype,
        activations: Sequence[str],
    ) -> None:
        super().__init__()
        if len(activations) != 2:
            raise ValueError("Mlp expects two activation functions.")
        self.dtype = compute_dtype
        self.hidden = hidden
        self.branch_count = len(activations)
        self.wi = nn.Linear(dim, self.branch_count * hidden, bias=False)
        self.wo = nn.Linear(hidden, dim, bias=False)
        self.activation_fns = [_get_activation(activations[0]), _get_activation(activations[1])]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        proj = self.wi(x.to(torch.float32))
        proj = proj.view(*x.shape[:-1], self.branch_count, self.hidden).to(self.dtype)
        gate, up = proj.unbind(dim=-2)
        hidden = self.activation_fns[0](gate) * self.activation_fns[1](up)
        out = self.wo(hidden.to(torch.float32))
        return out.to(self.dtype)
dia2/core/model.py
ADDED
@@ -0,0 +1,72 @@
from __future__ import annotations

from dataclasses import dataclass

import torch
from torch import nn

from ..config import DiaConfig
from .cache import KVCache
from .depformer import Depformer
from .precision import Precision
from .transformer import TransformerDecoder


@dataclass
class DecodeState:
    transformer: KVCache
    depformer: KVCache


class Dia2Model(nn.Module):
    def __init__(self, config: DiaConfig, precision: Precision):
        super().__init__()
        self.config = config
        self.precision = precision
        self.transformer = TransformerDecoder(config, precision)
        self.depformer = Depformer(config, precision)
        self._cast_norms_to_compute()

    def init_state(self, batch_size: int, device: torch.device, max_steps: int) -> DecodeState:
        transformer_cache = self.transformer.init_cache(batch_size, device, max_steps)
        depformer_cache = self.depformer.init_cache(batch_size, device, self.depformer.num_depth)
        return DecodeState(transformer_cache, depformer_cache)

    def step_text(
        self,
        tokens: torch.Tensor,
        positions: torch.Tensor,
        state: DecodeState,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        hidden, action, cb0, cache = self.transformer.forward_step(tokens, positions, state.transformer)
        state.transformer = cache
        return hidden, action, cb0

    def step_audio_stage(
        self,
        stage_index: int,
        prev_audio: torch.Tensor,
        transformer_hidden: torch.Tensor,
        state: DecodeState,
        main_text: Optional[torch.Tensor],
        second_text: Optional[torch.Tensor],
    ) -> torch.Tensor:
        cache = state.depformer
        logits, new_cache = self.depformer.forward_step(
            prev_audio,
            transformer_hidden,
            stage_index,
            cache,
            main_text,
            second_text,
        )
        state.depformer = new_cache
        return logits

    def _cast_norms_to_compute(self) -> None:
        """Cast RMSNorm weights/biases to the compute dtype to avoid bf16 warnings."""
        def _convert(module: nn.Module) -> None:
            if isinstance(module, nn.RMSNorm):
                module.to(self.precision.compute)

        self.apply(_convert)
dia2/core/precision.py
ADDED
@@ -0,0 +1,23 @@
from __future__ import annotations

from dataclasses import dataclass

import torch


@dataclass(frozen=True)
class Precision:
    compute: torch.dtype
    logits: torch.dtype


def resolve_precision(kind: str | None, device: torch.device) -> Precision:
    normalized = (kind or "auto").lower()
    if normalized == "auto":
        normalized = "bfloat16" if device.type == "cuda" else "float32"
    if normalized == "bfloat16":
        compute = torch.bfloat16 if device.type == "cuda" else torch.float32
        return Precision(compute=compute, logits=torch.float32)
    if normalized == "float32":
        return Precision(compute=torch.float32, logits=torch.float32)
    raise ValueError(f"Unsupported dtype '{kind}'")
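For reference, a small sketch (not part of the upload) of the dtype choices this helper produces:

# "auto" resolves to bfloat16 compute with float32 logits on CUDA,
# and falls back to float32 everywhere on CPU.
resolve_precision("auto", torch.device("cuda"))  # Precision(compute=torch.bfloat16, logits=torch.float32)
resolve_precision("auto", torch.device("cpu"))   # Precision(compute=torch.float32, logits=torch.float32)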
dia2/core/transformer.py
ADDED
@@ -0,0 +1,140 @@
from __future__ import annotations

from typing import Optional, Tuple

import torch
from torch import nn
import torch.nn.functional as F

from ..config import DiaConfig
from .cache import KVCache
from .precision import Precision
from .layers import (
    AttentionShape,
    MultiStreamEmbedding,
    Mlp,
    Attention,
)


class TransformerDecoder(nn.Module):
    """Inference-time port of dia_v2.model.Transformer."""

    def __init__(self, config: DiaConfig, precision: Precision):
        super().__init__()
        self.config = config
        self.precision = precision
        data_cfg = config.data
        dec_cfg = config.model.decoder

        self.audio_embeds = nn.ModuleList(
            [
                nn.Embedding(
                    data_cfg.audio_vocab_size,
                    dec_cfg.n_embd,
                )
                for _ in range(max(0, data_cfg.channels - 2))
            ]
        )
        self.text_embed = MultiStreamEmbedding(
            data_cfg.text_vocab_size,
            dec_cfg.n_embd,
            pad_id=data_cfg.text_pad_token_id,
            output_dtype=self.precision.compute,
            low_rank_dim=dec_cfg.low_rank_dim,
        )
        self.layers = nn.ModuleList([DecoderLayer(config, precision) for _ in range(dec_cfg.n_layer)])
        self.norm = nn.RMSNorm(dec_cfg.n_embd, eps=config.model.normalization_layer_epsilon, dtype=torch.float32)

        self.action_head = nn.Linear(dec_cfg.n_embd, data_cfg.action_vocab_size, bias=False)
        self.cb0_head = nn.Linear(dec_cfg.n_embd, data_cfg.audio_vocab_size, bias=False)

    def init_cache(self, batch_size: int, device: torch.device, max_steps: int) -> KVCache:
        heads = self.layers[0].attn.num_kv_heads
        head_dim = self.layers[0].attn.head_dim
        return KVCache.allocate(
            num_layers=len(self.layers),
            batch_size=batch_size,
            heads=heads,
            max_steps=max_steps,
            head_dim=head_dim,
            device=device,
            dtype=self.precision.compute,
        )

    def forward_step(
        self,
        tokens: torch.Tensor,
        positions: torch.Tensor,
        cache: KVCache,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, KVCache]:
        if cache is None:
            raise ValueError("Transformer cache must be initialized")

        B, C, T1 = tokens.shape
        if T1 != 1:
            raise ValueError("forward_step expects sequence length 1")
        num_audio_channels = max(0, C - 2)

        hidden_t = self.text_embed(tokens[:, 0, :], tokens[:, 1, :])
        for idx in range(num_audio_channels):
            audio_emb = self.audio_embeds[idx](tokens[:, idx + 2, :])
            hidden_t.add_(audio_emb)
        hidden_t = hidden_t.to(self.precision.compute)

        x = hidden_t
        for idx, layer in enumerate(self.layers):
            slot = cache.get_slot(idx)
            x, _ = layer.decode_step(x, positions, slot)

        hidden_norm = self.norm(x)
        action_logits = self.action_head(hidden_norm.to(torch.float32)).to(self.precision.logits)
        cb0_logits = self.cb0_head(hidden_norm.to(torch.float32)).to(self.precision.logits)
        return hidden_norm, action_logits, cb0_logits, cache

    def _embed(self, tokens: torch.Tensor) -> torch.Tensor:
        B, C, T1 = tokens.shape
        if T1 != 1:
            raise ValueError("_embed expects sequence length 1")
        num_audio_channels = max(0, C - 2)
        text_hidden = self.text_embed(tokens[:, 0, :], tokens[:, 1, :])
        audio_terms: list[torch.Tensor] = []
        for idx in range(num_audio_channels):
            audio_emb = self.audio_embeds[idx](tokens[:, idx + 2, :])
            audio_terms.append(audio_emb)
        hidden = text_hidden
        for term in audio_terms:
            hidden = hidden + term
        final = hidden.to(self.precision.compute)
        return final


class DecoderLayer(nn.Module):
    def __init__(self, config: DiaConfig, precision: Precision):
        super().__init__()
        dec = config.model.decoder
        eps = config.model.normalization_layer_epsilon
        self.pre_norm = nn.RMSNorm(dec.n_embd, eps=eps, dtype=torch.float32)
        self.attn = Attention(config, dec.n_embd, precision.compute)
        self.post_norm = nn.RMSNorm(dec.n_embd, eps=eps, dtype=torch.float32)
        self.mlp = Mlp(
            dec.n_embd,
            dec.n_hidden,
            precision.compute,
            tuple(config.model.linear.mlp_activations),
        )

    def decode_step(
        self,
        x: torch.Tensor,
        pos: torch.Tensor,
        cache_slot,
    ) -> Tuple[torch.Tensor, object]:
        residual = x
        x_norm = self.pre_norm(x)
        attn_out, _ = self.attn(x_norm, pos, cache_slot)
        x = residual + attn_out
        residual2 = x
        x_norm2 = self.post_norm(x)
        mlp_out = self.mlp(x_norm2)
        return residual2 + mlp_out, cache_slot
dia2/engine.py
ADDED
@@ -0,0 +1,230 @@
from __future__ import annotations

from pathlib import Path
from typing import Optional, Sequence

from .assets import resolve_assets
from .runtime.context import RuntimeContext, build_runtime
from .runtime.generator import (
    build_initial_state,
    decode_audio,
    run_generation_loop,
    warmup_with_prefix,
)
from .runtime.script_parser import parse_script
from .audio.grid import undelay_frames, write_wav
from .runtime.voice_clone import build_prefix_plan
from .generation import (
    GenerationConfig,
    GenerationResult,
    merge_generation_config,
    normalize_script,
)
from .runtime.logger import RuntimeLogger


class Dia2:
    def __init__(
        self,
        *,
        repo: Optional[str] = None,
        config_path: Optional[str | Path] = None,
        weights_path: Optional[str | Path] = None,
        tokenizer_id: Optional[str | Path] = None,
        mimi_id: Optional[str] = None,
        device: str = "cuda",
        dtype: str = "auto",
        default_config: Optional[GenerationConfig] = None,
    ) -> None:
        bundle = resolve_assets(
            repo=repo,
            config_path=config_path,
            weights_path=weights_path,
        )
        self._config_path = bundle.config_path
        self._weights_path = bundle.weights_path
        self._tokenizer_id = (str(tokenizer_id) if tokenizer_id else None) or bundle.tokenizer_id
        self._repo_id = bundle.repo_id
        self._mimi_id = mimi_id or bundle.mimi_id
        self.device = device
        self._dtype_pref = dtype or "auto"
        self.default_config = default_config or GenerationConfig()
        self._runtime: Optional[RuntimeContext] = None

    @classmethod
    def from_repo(
        cls,
        repo: str,
        *,
        device: str = "cuda",
        dtype: str = "auto",
        tokenizer_id: Optional[str] = None,
        mimi_id: Optional[str] = None,
    ) -> "Dia2":
        return cls(repo=repo, device=device, dtype=dtype, tokenizer_id=tokenizer_id, mimi_id=mimi_id)

    @classmethod
    def from_local(
        cls,
        config_path: str | Path,
        weights_path: str | Path,
        *,
        device: str = "cuda",
        dtype: str = "auto",
        tokenizer_id: Optional[str | Path] = None,
        mimi_id: Optional[str] = None,
    ) -> "Dia2":
        return cls(
            config_path=config_path,
            weights_path=weights_path,
            tokenizer_id=tokenizer_id,
            device=device,
            dtype=dtype,
            mimi_id=mimi_id,
        )

    def set_device(self, device: str, *, dtype: Optional[str] = None) -> None:
        desired_dtype = dtype or self._dtype_pref
        if self.device == device and desired_dtype == self._dtype_pref:
            return
        self.device = device
        self._dtype_pref = desired_dtype
        self._runtime = None

    def close(self) -> None:
        self._runtime = None

    def _ensure_runtime(self) -> RuntimeContext:
        if self._runtime is None:
            self._runtime = self._build_runtime()
        return self._runtime

    def generate(
        self,
        script: str | Sequence[str],
        *,
        config: Optional[GenerationConfig] = None,
        output_wav: Optional[str | Path] = None,
        prefix_speaker_1: Optional[str] = None,
        prefix_speaker_2: Optional[str] = None,
        include_prefix: Optional[bool] = None,
        verbose: bool = False,
        **overrides,
    ):
        runtime = self._ensure_runtime()
        logger = RuntimeLogger(verbose)
        merged_overrides = dict(overrides)
        if prefix_speaker_1 is not None:
            merged_overrides["prefix_speaker_1"] = prefix_speaker_1
        if prefix_speaker_2 is not None:
            merged_overrides["prefix_speaker_2"] = prefix_speaker_2
        if include_prefix is not None:
            merged_overrides["include_prefix"] = include_prefix
        merged = merge_generation_config(base=config or self.default_config, overrides=merged_overrides)
        max_context = runtime.config.runtime.max_context_steps
        text = normalize_script(script)
        prefix_plan = build_prefix_plan(runtime, merged.prefix)
        entries = []
        if prefix_plan is not None:
            entries.extend(prefix_plan.entries)
        entries.extend(parse_script([text], runtime.tokenizer, runtime.constants, runtime.frame_rate))
        runtime.machine.initial_padding = merged.initial_padding
        logger.event(
            f"starting generation: max_context={max_context} cfg_scale={merged.cfg_scale:.2f} "
            f"device={self.device} dtype={self._dtype_pref}"
        )
        state = runtime.machine.new_state(entries)
        cfg_active = merged.cfg_scale != 1.0
        if cfg_active:
            logger.event(f"classifier-free guidance enabled (scale={merged.cfg_scale:.2f})")
        else:
            logger.event("classifier-free guidance disabled (scale=1.0)")
        gen_state = build_initial_state(
            runtime,
            prefix=prefix_plan,
        )
        include_prefix_audio = bool(prefix_plan and merged.prefix and merged.prefix.include_audio)
        start_step = 0
        if prefix_plan is not None:
            logger.event(f"warming up with prefix ({prefix_plan.aligned_frames} frames)")
            start_step = warmup_with_prefix(runtime, prefix_plan, state, gen_state)
            if include_prefix_audio:
                logger.event("prefix audio will be kept in output")
            else:
                logger.event("prefix audio trimmed from output")
        first_word_frame, audio_buf = run_generation_loop(
            runtime,
            state=state,
            generation=gen_state,
            config=merged,
            start_step=start_step,
            logger=logger,
        )
        aligned = undelay_frames(audio_buf[0], runtime.audio_delays, runtime.constants.audio_pad).unsqueeze(0)
        crop = 0 if include_prefix_audio else max(first_word_frame, 0)
        if crop > 0 and crop < aligned.shape[-1]:
            aligned = aligned[:, :, crop:]
        elif crop >= aligned.shape[-1]:
            crop = 0
        logger.event(f"decoding {aligned.shape[-1]} Mimi frames")
        waveform = decode_audio(runtime, aligned)
        if output_wav is not None:
            write_wav(str(output_wav), waveform.detach().cpu().numpy(), runtime.mimi.sample_rate)
            duration = waveform.shape[-1] / max(runtime.mimi.sample_rate, 1)
            logger.event(f"saved {output_wav} ({duration:.2f}s)")
        frame_rate = max(runtime.frame_rate, 1.0)
        prefix_entry_count = len(prefix_plan.entries) if prefix_plan is not None else 0
        transcript_entries = state.transcript
        if prefix_plan is not None and not include_prefix_audio:
            if len(transcript_entries) > prefix_entry_count:
                transcript_entries = transcript_entries[prefix_entry_count:]
            else:
                transcript_entries = []
        timestamps = []
        for word, step in transcript_entries:
            adj = step - crop
            if adj < 0:
                continue
            timestamps.append((word, adj / frame_rate))
        logger.event(f"generation finished in {logger.elapsed():.2f}s")
        return GenerationResult(aligned, waveform, runtime.mimi.sample_rate, timestamps)

    def save_wav(self, script: str | Sequence[str], path: str | Path, **kwargs):
        return self.generate(script, output_wav=path, **kwargs)

    @property
    def sample_rate(self) -> int:
        return self._ensure_runtime().mimi.sample_rate

    @property
    def tokenizer_id(self) -> Optional[str]:
        if self._tokenizer_id:
            return self._tokenizer_id
        if self._runtime is not None:
            return getattr(self._runtime.tokenizer, "name_or_path", None)
        return self._repo_id

    @property
    def dtype(self) -> str:
        return self._dtype_pref

    @property
    def max_context_steps(self) -> int:
        return self._ensure_runtime().config.runtime.max_context_steps

    @property
    def repo(self) -> Optional[str]:
        return self._repo_id

    def _build_runtime(self) -> RuntimeContext:
        runtime, tokenizer_ref, mimi_ref = build_runtime(
            config_path=self._config_path,
            weights_path=self._weights_path,
            tokenizer_id=self._tokenizer_id,
            repo_id=self._repo_id,
            mimi_id=self._mimi_id,
            device=self.device,
            dtype_pref=self._dtype_pref,
        )
        self._tokenizer_id = tokenizer_ref
        self._mimi_id = mimi_ref
        return runtime
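A minimal usage sketch of the Dia2 wrapper above (not part of the upload); the import path assumes dia2/__init__.py re-exports Dia2, and the repo id and script text are placeholders:

# Illustrative only: load a checkpoint and synthesize a short script to a wav file.
from dia2 import Dia2  # assumed export; otherwise use `from dia2.engine import Dia2`

tts = Dia2.from_repo("<hf-repo-id>", device="cuda", dtype="auto")
result = tts.generate("Hello there.", output_wav="out.wav", verbose=True)
print(result.sample_rate, result.timestamps[:3])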
dia2/generation.py
ADDED
@@ -0,0 +1,158 @@
from __future__ import annotations

import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Mapping, Optional, Sequence, Tuple

import torch


@dataclass(frozen=True)
class SamplingConfig:
    temperature: float = 0.8
    top_k: int = 50


def _default_text_sampling() -> SamplingConfig:
    return SamplingConfig(temperature=0.6, top_k=50)


def _default_audio_sampling() -> SamplingConfig:
    return SamplingConfig(temperature=0.8, top_k=50)


@dataclass(frozen=True)
class PrefixConfig:
    speaker_1: Optional[str] = None
    speaker_2: Optional[str] = None
    include_audio: bool = False


@dataclass(frozen=True)
class GenerationConfig:
    text: SamplingConfig = field(default_factory=_default_text_sampling)
    audio: SamplingConfig = field(default_factory=_default_audio_sampling)
    cfg_scale: float = 2.0
    cfg_filter_k: int = 50
    initial_padding: int = 2
    prefix: Optional["PrefixConfig"] = None
    use_cuda_graph: bool = False


@dataclass(frozen=True)
class GenerationResult:
    audio_tokens: torch.Tensor
    waveform: torch.Tensor
    sample_rate: int
    timestamps: List[Tuple[str, float]]


def normalize_script(script: str | Sequence[str]) -> str:
    if isinstance(script, str):
        return script.strip()
    return "\n".join(line.strip() for line in script)


def load_script_text(path: str | Path) -> str:
    if path == "-":
        return sys.stdin.read().strip()
    path_obj = Path(path)
    if path_obj.exists():
        return path_obj.read_text().strip()
    return str(path).strip()


def validate_generation_params(
    *,
    temperature: float,
    top_k: int,
    cfg_scale: float,
) -> tuple[float, int, float]:
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if top_k <= 0:
        raise ValueError("top_k must be positive")
    if cfg_scale <= 0:
        raise ValueError("cfg_scale must be positive")
    return temperature, top_k, cfg_scale


def build_generation_config(
    *,
    temperature: float,
    top_k: int,
    cfg_scale: float,
) -> GenerationConfig:
    sampling = SamplingConfig(temperature=temperature, top_k=top_k)
    return GenerationConfig(
        text=sampling,
        audio=sampling,
        cfg_scale=cfg_scale,
    )


def merge_generation_config(
    *,
    base: GenerationConfig,
    overrides: Mapping[str, object],
) -> GenerationConfig:
    clean_overrides = {k: v for k, v in overrides.items() if v is not None}
    text_temp = clean_overrides.pop("temp_text", None)
    text_topk = clean_overrides.pop("topk_text", None)
    audio_temp = clean_overrides.pop("temp_audio", None)
    audio_topk = clean_overrides.pop("topk_audio", None)
    prefix_speaker_1 = clean_overrides.pop("prefix_speaker_1", None)
    prefix_speaker_2 = clean_overrides.pop("prefix_speaker_2", None)
    include_prefix = clean_overrides.pop("include_prefix", None)

    text_sampling = base.text
    if text_temp is not None or text_topk is not None:
        text_sampling = SamplingConfig(
            temperature=text_temp if text_temp is not None else text_sampling.temperature,
            top_k=text_topk if text_topk is not None else text_sampling.top_k,
        )

    audio_sampling = base.audio
    if audio_temp is not None or audio_topk is not None:
        audio_sampling = SamplingConfig(
            temperature=audio_temp if audio_temp is not None else audio_sampling.temperature,
            top_k=audio_topk if audio_topk is not None else audio_sampling.top_k,
        )

    prefix_cfg = base.prefix
    if (
        prefix_speaker_1 is not None
        or prefix_speaker_2 is not None
        or include_prefix is not None
        or prefix_cfg is not None
    ):
        prefix_cfg = prefix_cfg or PrefixConfig()
        prefix_cfg = PrefixConfig(
            speaker_1=prefix_speaker_1 if prefix_speaker_1 is not None else prefix_cfg.speaker_1,
            speaker_2=prefix_speaker_2 if prefix_speaker_2 is not None else prefix_cfg.speaker_2,
            include_audio=include_prefix if include_prefix is not None else prefix_cfg.include_audio,
        )

    return GenerationConfig(
        text=text_sampling,
        audio=audio_sampling,
        cfg_scale=clean_overrides.pop("cfg_scale", base.cfg_scale),
        cfg_filter_k=clean_overrides.pop("cfg_filter_k", base.cfg_filter_k),
        initial_padding=clean_overrides.pop("initial_padding", base.initial_padding),
        prefix=prefix_cfg,
        use_cuda_graph=clean_overrides.pop("use_cuda_graph", base.use_cuda_graph),
    )


__all__ = [
    "SamplingConfig",
    "GenerationConfig",
    "GenerationResult",
    "PrefixConfig",
    "normalize_script",
    "load_script_text",
    "validate_generation_params",
    "build_generation_config",
    "merge_generation_config",
]
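A short sketch (not part of the upload) of how the override merging above behaves; the override keys are the ones popped by merge_generation_config and the values are illustrative:

base = GenerationConfig()
merged = merge_generation_config(
    base=base,
    overrides={"temp_audio": 0.9, "topk_text": 30, "cfg_scale": 1.5, "include_prefix": True},
)
assert merged.audio.temperature == 0.9 and merged.cfg_scale == 1.5
assert merged.prefix is not None and merged.prefix.include_audio is True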
dia2/runtime/__init__.py
ADDED
@@ -0,0 +1,7 @@
from .state_machine import Entry, StateMachine, TokenIds

__all__ = [
    "Entry",
    "StateMachine",
    "TokenIds",
]
dia2/runtime/audio_io.py
ADDED
@@ -0,0 +1,69 @@
from __future__ import annotations

from pathlib import Path
from typing import Union

import numpy as np
import sphn
import torch
import torch.nn.functional as F

from ..audio import MimiCodec

PathLike = Union[str, Path]


def load_mono_audio(path: PathLike, target_sr: int) -> np.ndarray:
    """Read an audio file, convert to mono float32, and resample to target_sr."""
    path = str(path)
    try:
        audio, sr = sphn.read_wav(path)
    except Exception:
        import soundfile as sf  # Local fallback

        audio, sr = sf.read(path, dtype="float32", always_2d=False)
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim == 2:
        audio = audio.mean(axis=1)
    if sr != target_sr:
        if hasattr(sphn, "resample_audio"):
            audio = sphn.resample_audio(audio, sr, target_sr).astype(np.float32)
        else:
            audio = _resample_linear(audio, sr, target_sr)
    return audio


def audio_to_tensor(audio: np.ndarray, device: torch.device) -> torch.Tensor:
    """Convert mono PCM samples into shape [1, 1, T] tensor."""
    tensor = torch.from_numpy(audio).to(device)
    if tensor.dim() == 1:
        tensor = tensor.unsqueeze(0)
    if tensor.dim() == 2:
        tensor = tensor.unsqueeze(0)
    return tensor


def encode_audio_tokens(mimi: MimiCodec, audio: np.ndarray) -> torch.Tensor:
    """Encode PCM audio into Mimi codebook tokens [C, T]."""
    waveform = audio_to_tensor(audio, mimi.device)
    with torch.inference_mode():
        codes, *_ = mimi.encode(waveform, return_dict=False)
    if isinstance(codes, (tuple, list)):
        codes = codes[0]
    # Mimi.encode returns [B, num_codebooks, T]; select batch 0.
    codes = codes[0].to(torch.long)
    return codes


def _resample_linear(audio: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
    if src_sr == dst_sr:
        return audio.astype(np.float32)
    length = audio.shape[0]
    new_length = max(1, int(round(length * dst_sr / src_sr)))
    tensor = torch.from_numpy(audio.astype(np.float32)).unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        resampled = F.interpolate(tensor, size=new_length, mode="linear", align_corners=False)
    return resampled.squeeze(0).squeeze(0).cpu().numpy().astype(np.float32)


__all__ = ["load_mono_audio", "audio_to_tensor", "encode_audio_tokens"]
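A hedged usage sketch for this module (not part of the diff): load and resample a clip, then encode it to Mimi tokens. The file name is a placeholder; the codec construction mirrors how context.py builds it.

```python
# Sketch only: "voice.wav" is a placeholder path; MimiCodec/DEFAULT_MIMI_MODEL_ID come from dia2.audio.
import torch

from dia2.audio import MimiCodec, DEFAULT_MIMI_MODEL_ID
from dia2.runtime.audio_io import load_mono_audio, encode_audio_tokens

mimi = MimiCodec.from_pretrained(DEFAULT_MIMI_MODEL_ID, device=torch.device("cpu"))
pcm = load_mono_audio("voice.wav", target_sr=mimi.sample_rate)  # mono float32 at the codec rate
codes = encode_audio_tokens(mimi, pcm)                          # LongTensor [num_codebooks, T]
print(codes.shape)
```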
dia2/runtime/context.py
ADDED
@@ -0,0 +1,132 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import warnings

import torch
from safetensors.torch import load_file
from transformers import AutoTokenizer, PreTrainedTokenizerBase

from ..config import DiaConfig, load_config
from ..core.model import Dia2Model
from ..core.precision import Precision, resolve_precision
from ..audio import MimiCodec, DEFAULT_MIMI_MODEL_ID
from .state_machine import StateMachine, TokenIds


@dataclass
class RuntimeContext:
    config: DiaConfig
    model: Dia2Model
    precision: Precision
    tokenizer: PreTrainedTokenizerBase
    mimi: MimiCodec
    device: torch.device
    machine: StateMachine
    transformer_step: callable
    depformer_step: callable
    constants: TokenIds
    audio_delays: list[int]
    audio_delay_tensor: torch.Tensor
    frame_rate: float


def build_runtime(
    *,
    config_path: str | Path,
    weights_path: str | Path,
    tokenizer_id: Optional[str],
    repo_id: Optional[str],
    mimi_id: Optional[str],
    device: str,
    dtype_pref: str,
) -> tuple[RuntimeContext, str, str]:
    device_obj = torch.device(device)
    if device_obj.type == "cuda":
        cuda_matmul = torch.backends.cuda.matmul
        cudnn_conv = torch.backends.cudnn.conv
        if hasattr(cuda_matmul, "fp32_precision"):
            cuda_matmul.fp32_precision = "tf32"
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="Please use the new API settings",
                )
                torch.backends.cuda.matmul.allow_tf32 = True
        else:  # pragma: no cover - compatibility with older PyTorch
            torch.backends.cuda.matmul.allow_tf32 = True
        if hasattr(cudnn_conv, "fp32_precision"):
            cudnn_conv.fp32_precision = "tf32"
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="Please use the new API settings",
                )
                torch.backends.cudnn.allow_tf32 = True
        else:  # pragma: no cover
            torch.backends.cudnn.allow_tf32 = True
    precision = resolve_precision(dtype_pref, device_obj)
    config = load_config(config_path)
    model = Dia2Model(config, precision)
    state = load_file(str(weights_path))
    model.load_state_dict(state)
    model = model.to(device_obj)

    tokenizer_ref = tokenizer_id or config.assets.tokenizer or repo_id
    if tokenizer_ref is None:
        raise ValueError("Tokenizer id is missing. Provide --tokenizer or add assets.tokenizer to the config.")
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_ref,
        use_fast=False,
        trust_remote_code=True,
    )

    mimi_ref = mimi_id or config.assets.mimi or DEFAULT_MIMI_MODEL_ID
    mimi = MimiCodec.from_pretrained(mimi_ref, device=device_obj)

    data_cfg = config.data
    constants = TokenIds(
        card=data_cfg.text_vocab_size,
        new_word=data_cfg.text_new_word_token_id,
        pad=data_cfg.text_pad_token_id,
        bos=getattr(tokenizer, "bos_token_id", 1) or 1,
        zero=data_cfg.text_zero_token_id,
        spk1=tokenizer.convert_tokens_to_ids("[S1]") if "[S1]" in tokenizer.get_vocab() else data_cfg.text_new_word_token_id,
        spk2=tokenizer.convert_tokens_to_ids("[S2]") if "[S2]" in tokenizer.get_vocab() else data_cfg.text_new_word_token_id,
        audio_pad=data_cfg.audio_pad_token_id,
        audio_bos=data_cfg.audio_bos_token_id,
    )
    machine = StateMachine(
        token_ids=constants,
        second_stream_ahead=data_cfg.second_stream_ahead,
        max_padding=6,
        initial_padding=0,
    )
    audio_delays = list(data_cfg.delay_pattern)
    audio_delay_tensor = torch.tensor(audio_delays, device=device_obj, dtype=torch.long) if audio_delays else torch.empty(0, dtype=torch.long, device=device_obj)
    frame_rate = getattr(mimi, "frame_rate", 75.0)

    runtime = RuntimeContext(
        config=config,
        precision=precision,
        model=model,
        tokenizer=tokenizer,
        mimi=mimi,
        device=device_obj,
        machine=machine,
        constants=constants,
        audio_delays=audio_delays,
        audio_delay_tensor=audio_delay_tensor,
        frame_rate=frame_rate,
        transformer_step=model.transformer.forward_step,
        depformer_step=model.depformer.forward_step,
    )
    return runtime, tokenizer_ref, mimi_ref


__all__ = [
    "RuntimeContext",
    "build_runtime",
]
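A hedged sketch of wiring the runtime together (not part of the diff). The checkpoint paths and the dtype_pref string are assumptions about the local layout, not fixed values; with tokenizer_id and repo_id left as None the call relies on config.assets.tokenizer being set.

```python
# Assumed local layout: config.json + model.safetensors under weights/; dtype_pref value is a guess.
import torch

from dia2.runtime.context import build_runtime

runtime, tokenizer_ref, mimi_ref = build_runtime(
    config_path="weights/config.json",
    weights_path="weights/model.safetensors",
    tokenizer_id=None,   # falls back to config.assets.tokenizer or repo_id
    repo_id=None,
    mimi_id=None,        # falls back to DEFAULT_MIMI_MODEL_ID
    device="cuda" if torch.cuda.is_available() else "cpu",
    dtype_pref="bfloat16",
)
print(type(runtime).__name__, tokenizer_ref, mimi_ref)
```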
dia2/runtime/generator.py
ADDED
@@ -0,0 +1,420 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Optional, Tuple

import torch

from ..core.cache import KVCache
from ..core.model import DecodeState
from ..generation import GenerationConfig
from ..audio.grid import delay_frames, mask_audio_logits, undelay_frames
from .context import RuntimeContext
from .state_machine import State, TokenIds
from .guidance import apply_classifier_guidance, sample_audio_logits
from .sampler import sample_token
from .voice_clone import PrefixPlan
from .logger import RuntimeLogger

_GRAPH_CUBLAS_READY = False


def _ensure_graph_cublas_ready(device: torch.device) -> None:
    global _GRAPH_CUBLAS_READY
    if _GRAPH_CUBLAS_READY or device.type != "cuda":
        return
    tmp = torch.empty((1, 1), device=device, dtype=torch.float32)
    torch.matmul(tmp, tmp)
    torch.cuda.synchronize()
    _GRAPH_CUBLAS_READY = True


@dataclass
class GenerationState:
    decode: DecodeState
    step_tokens: torch.Tensor
    audio_buf: torch.Tensor

    def trim_audio(self, limit: int, pad_token: int, ungenerated: int) -> torch.Tensor:
        trimmed = self.audio_buf[:, :, :limit]
        pad = torch.full_like(trimmed, pad_token)
        trimmed = torch.where(trimmed == ungenerated, pad, trimmed)
        self.audio_buf = trimmed
        return trimmed

    @property
    def transformer_cache(self) -> KVCache:
        return self.decode.transformer

    @transformer_cache.setter
    def transformer_cache(self, cache: KVCache) -> None:
        self.decode.transformer = cache

    @property
    def depformer_cache(self) -> KVCache:
        return self.decode.depformer

    @depformer_cache.setter
    def depformer_cache(self, cache: KVCache) -> None:
        self.decode.depformer = cache

    def reset_dep_cache(self) -> None:
        self.decode.depformer.reset()


@dataclass
class NetworkBuffers:
    text: torch.Tensor
    cb0: torch.Tensor
    dep: list[torch.Tensor]


def _allocate_network_buffers(runtime: RuntimeContext, branches: int) -> NetworkBuffers:
    device = runtime.device
    logits_dtype = runtime.precision.logits
    data_cfg = runtime.config.data
    text_logits = torch.empty((branches, 1, data_cfg.action_vocab_size), dtype=logits_dtype, device=device)
    cb0_logits = torch.empty((branches, 1, data_cfg.audio_vocab_size), dtype=logits_dtype, device=device)
    dep_vocab = runtime.model.depformer.audio_vocab_limit or data_cfg.audio_vocab_size
    dep_logits = [
        torch.empty((branches, 1, 1, dep_vocab), dtype=logits_dtype, device=device)
        for _ in range(runtime.model.depformer.num_depth)
    ]
    return NetworkBuffers(text=text_logits, cb0=cb0_logits, dep=dep_logits)


def build_initial_state(
    runtime: RuntimeContext,
    *,
    prefix: PrefixPlan | None = None,
) -> GenerationState:
    dep_q = runtime.model.depformer.num_audio_channels
    channels = 2 + dep_q
    branches = 2
    token_ids = runtime.constants
    step_tokens = torch.full(
        (branches, channels, 1),
        token_ids.pad,
        dtype=torch.long,
        device=runtime.device,
    )
    step_tokens[0, 0, 0] = token_ids.bos
    step_tokens[0, 1, 0] = token_ids.pad
    step_tokens[1, 0, 0] = token_ids.zero
    step_tokens[1, 1, 0] = token_ids.pad
    prefix_len = 0
    if prefix is not None:
        delayed = delay_frames(prefix.aligned_tokens, runtime.audio_delays, token_ids.audio_pad)
        prefix_len = delayed.shape[1]
    limit = runtime.config.runtime.max_context_steps
    total_steps = max(limit + prefix_len + 1, limit)
    decode_state = runtime.model.init_state(branches, runtime.device, total_steps)
    audio_buf = torch.full(
        (branches, dep_q, total_steps),
        token_ids.ungenerated,
        dtype=torch.long,
        device=runtime.device,
    )
    if prefix is not None:
        delayed = delay_frames(prefix.aligned_tokens, runtime.audio_delays, token_ids.audio_pad).to(runtime.device)
        audio_buf[0, :, : delayed.shape[1]] = delayed
        if branches > 1:
            audio_buf[1:, :, : delayed.shape[1]] = delayed
    return GenerationState(decode_state, step_tokens, audio_buf)


def _fill_audio_channels(
    step_tokens: torch.Tensor,
    audio_buf: torch.Tensor,
    delays: torch.Tensor,
    step: int,
    bos_token: int,
) -> None:
    channels = delays.numel()
    if channels == 0:
        return
    target = step_tokens[:, 2 : 2 + channels, 0]
    if step < audio_buf.shape[-1]:
        target.copy_(audio_buf[:, :channels, step])
    else:
        target.fill_(bos_token)
    mask = delays > step
    if mask.any().item():
        target[:, mask] = bos_token


def _execute_transformer_step(
    step_tokens: torch.Tensor,
    positions_view: torch.Tensor,
    generation: GenerationState,
    transformer_step,
    buffers: NetworkBuffers,
) -> torch.Tensor:
    hidden_t, text_logits_t, cb0_logits_t, present = transformer_step(
        step_tokens,
        positions_view,
        generation.transformer_cache,
    )
    buffers.text.copy_(text_logits_t)
    buffers.cb0.copy_(cb0_logits_t)
    generation.transformer_cache = present
    return hidden_t


def _execute_depformer_stage(
    stage_index: int,
    prev_audio: torch.Tensor,
    hidden_t: torch.Tensor,
    generation: GenerationState,
    depformer_step,
    main_tokens: Optional[torch.Tensor],
    second_tokens: Optional[torch.Tensor],
    buffers: NetworkBuffers,
) -> None:
    logits_stage, dep_present = depformer_step(
        prev_audio=prev_audio,
        transformer_out=hidden_t,
        stage_index=stage_index,
        cache=generation.depformer_cache,
        main_text=main_tokens if stage_index == 0 else None,
        second_text=second_tokens if stage_index == 0 else None,
    )
    target = buffers.dep[stage_index]
    if logits_stage.shape != target.shape:
        raise RuntimeError(
            f"depformer logits shape mismatch: {logits_stage.shape} vs {target.shape}"
        )
    target.copy_(logits_stage)
    generation.depformer_cache = dep_present


def run_generation_loop(
    runtime: RuntimeContext,
    *,
    state: State,
    generation: GenerationState,
    config: GenerationConfig,
    start_step: int = 0,
    logger: RuntimeLogger | None = None,
) -> tuple[Optional[int], torch.Tensor]:
    step_tokens = generation.step_tokens
    audio_buf = generation.audio_buf
    branches = step_tokens.shape[0]
    max_context = runtime.config.runtime.max_context_steps
    if max_context <= 0:
        raise ValueError("Runtime configuration must specify a positive max_context_steps")
    positions = torch.empty(1, 1, dtype=torch.long, device=runtime.device)
    main_tokens = torch.empty(branches, dtype=torch.long, device=runtime.device)
    aux_tokens = torch.empty(branches, dtype=torch.long, device=runtime.device)
    cfg_active = config.cfg_scale != 1.0
    token_ids = runtime.constants
    delay_tensor = runtime.audio_delay_tensor
    max_delay = int(delay_tensor.max().item()) if delay_tensor.numel() else 0
    flush_tail = max_delay + getattr(runtime.machine, "max_padding", 0)
    first_word_frame: Optional[int] = None
    eos_cutoff: Optional[int] = None
    last_step = start_step - 1
    use_graph = bool(config.use_cuda_graph and runtime.device.type == "cuda")
    transformer_step = runtime.transformer_step
    depformer_step = runtime.depformer_step
    buffers = _allocate_network_buffers(runtime, branches)
    positions_view = positions.expand(branches, -1)
    transformer_capture = None
    dep_captures: list[dict] | None = None
    if use_graph:
        _ensure_graph_cublas_ready(runtime.device)
    processed_steps = 0
    report_interval = 12
    with torch.inference_mode():
        for offset in range(max_context):
            t = start_step + offset
            if eos_cutoff is not None and t >= eos_cutoff:
                break
            if t + 1 >= audio_buf.shape[-1]:
                break
            generation.reset_dep_cache()
            positions.fill_(t)
            _fill_audio_channels(step_tokens, audio_buf, delay_tensor, t, token_ids.audio_bos)
            if branches > 1:
                step_tokens[1:, 0, 0] = token_ids.zero
                step_tokens[1:, 1, 0] = token_ids.pad
            if use_graph:
                if transformer_capture is None:
                    torch.cuda.synchronize()
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph):
                        hidden_ref = _execute_transformer_step(
                            step_tokens,
                            positions_view,
                            generation,
                            transformer_step,
                            buffers,
                        )
                    transformer_capture = (graph, hidden_ref)
                    if runtime.model.depformer.num_depth > 0:
                        dep_captures = []
                        for idx in range(runtime.model.depformer.num_depth):
                            capture = {
                                "graph": torch.cuda.CUDAGraph(),
                                "captured": False,
                                "prev_audio": torch.empty((branches,), dtype=torch.long, device=runtime.device),
                                "main_tokens": torch.empty((branches,), dtype=torch.long, device=runtime.device) if idx == 0 else None,
                                "second_tokens": torch.empty((branches,), dtype=torch.long, device=runtime.device) if idx == 0 else None,
                            }
                            dep_captures.append(capture)
                else:
                    transformer_capture[0].replay()
                hidden_t = transformer_capture[1]
            else:
                hidden_t = _execute_transformer_step(
                    step_tokens,
                    positions_view,
                    generation,
                    transformer_step,
                    buffers,
                )

            guided_text = apply_classifier_guidance(buffers.text, cfg_active, config.cfg_scale, config.cfg_filter_k)
            if guided_text.shape[0] > 1:
                guided_text = guided_text[:1]
            text_token = sample_token(
                guided_text,
                temp=config.text.temperature,
                top_k=config.text.top_k,
            ).item()

            main_token, aux_token, _ = runtime.machine.process(t, state, text_token)
            second_token = aux_token if aux_token != -1 else token_ids.pad
            if first_word_frame is None and main_token == token_ids.new_word:
                first_word_frame = t - config.initial_padding
            step_tokens[:, 0, 0] = main_token
            step_tokens[:, 1, 0] = second_token

            guided_cb0 = apply_classifier_guidance(buffers.cb0, cfg_active, config.cfg_scale, config.cfg_filter_k)
            if guided_cb0.shape[0] > 1:
                guided_cb0 = guided_cb0[:1]
            masked_cb0 = mask_audio_logits(guided_cb0, token_ids.audio_pad, token_ids.audio_bos)
            codebook_token = sample_audio_logits(masked_cb0, config.audio.temperature, config.audio.top_k)
            audio_buf[:, 0, t + 1] = codebook_token

            prev_audio = codebook_token.expand(branches)
            main_tokens.fill_(main_token)
            aux_tokens.fill_(second_token)
            for stage in range(runtime.model.depformer.num_depth):
                if use_graph and dep_captures is not None:
                    capture = dep_captures[stage]
                    capture["prev_audio"].copy_(prev_audio)
                    if capture["main_tokens"] is not None and stage == 0:
                        capture["main_tokens"].copy_(main_tokens)
                        capture["second_tokens"].copy_(aux_tokens)
                    if not capture["captured"]:
                        torch.cuda.synchronize()
                        with torch.cuda.graph(capture["graph"]):
                            _execute_depformer_stage(
                                stage_index=stage,
                                prev_audio=capture["prev_audio"],
                                hidden_t=hidden_t,
                                generation=generation,
                                depformer_step=depformer_step,
                                main_tokens=capture["main_tokens"],
                                second_tokens=capture["second_tokens"],
                                buffers=buffers,
                            )
                        capture["captured"] = True
                    else:
                        capture["graph"].replay()
                else:
                    _execute_depformer_stage(
                        stage_index=stage,
                        prev_audio=prev_audio,
                        hidden_t=hidden_t,
                        generation=generation,
                        depformer_step=depformer_step,
                        main_tokens=main_tokens,
                        second_tokens=aux_tokens,
                        buffers=buffers,
                    )
                dep_logits = apply_classifier_guidance(buffers.dep[stage], cfg_active, config.cfg_scale, config.cfg_filter_k)
                if dep_logits.shape[0] > 1:
                    dep_logits = dep_logits[:1]
                stage_token = sample_audio_logits(
                    dep_logits,
                    config.audio.temperature,
                    config.audio.top_k,
                )
                audio_buf[:, stage + 1, t + 1] = stage_token
                prev_audio = stage_token.expand(branches)
            last_step = t
            if eos_cutoff is None and state.end_step is not None:
                eos_cutoff = state.end_step + flush_tail
            processed_steps = offset + 1
            if logger and processed_steps % report_interval == 0:
                logger.progress(processed_steps, max_context)

    if logger and processed_steps and processed_steps % report_interval != 0:
        logger.progress(processed_steps, max_context)

    if first_word_frame is None:
        first_word_frame = start_step
    if last_step < start_step:
        limit = min(start_step + 1, audio_buf.shape[-1])
    else:
        limit = min(last_step + 2, audio_buf.shape[-1])
    trimmed = generation.trim_audio(limit, token_ids.audio_pad, token_ids.ungenerated)
    return first_word_frame, trimmed


def decode_audio(runtime: RuntimeContext, tokens: torch.Tensor) -> torch.Tensor:
    if tokens.shape[-1] == 0:
        return torch.zeros(0, device=runtime.device)
    with torch.inference_mode():
        pcm = runtime.mimi.decode(tokens.to(runtime.device))
    return pcm[0, 0]


def warmup_with_prefix(
    runtime: RuntimeContext,
    plan: PrefixPlan,
    state: State,
    generation: GenerationState,
) -> int:
    step_tokens = generation.step_tokens
    model_state = generation.decode
    branches = step_tokens.shape[0]
    device = runtime.device
    tokens = plan.aligned_tokens.to(device)
    new_word_steps = set(plan.new_word_steps)
    positions = torch.empty(1, 1, dtype=torch.long, device=device)

    with torch.inference_mode():
        for t in range(plan.aligned_frames):
            positions.fill_(t)
            channels = tokens.shape[0]
            for cb in range(channels):
                delay = runtime.audio_delays[cb] if cb < len(runtime.audio_delays) else 0
                idx = t - delay
                value = tokens[cb, idx] if idx >= 0 else runtime.constants.audio_bos
                step_tokens[:, 2 + cb, 0] = value
            hidden, text_logits, cb0_logits, present = runtime.model.transformer.forward_step(
                step_tokens,
                positions.expand(branches, -1),
                model_state.transformer,
            )
            model_state.transformer = present

            forced = runtime.constants.new_word if t in new_word_steps else runtime.constants.pad
            main_token, aux_token, _ = runtime.machine.process(t, state, forced, is_forced=True)
            second_token = runtime.constants.pad if aux_token == -1 else aux_token
            step_tokens[0, 0, 0] = main_token
            step_tokens[0, 1, 0] = second_token
            if branches > 1:
                step_tokens[1:, 0, 0] = runtime.constants.zero
                step_tokens[1:, 1, 0] = runtime.constants.pad

    return max(plan.aligned_frames - 1, 0)


__all__ = [
    "build_initial_state",
    "run_generation_loop",
    "decode_audio",
    "warmup_with_prefix",
    "GenerationState",
]
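run_generation_loop captures the transformer step into a CUDA graph once and then replays it with refreshed input buffers on later steps (each depformer stage gets its own graph). A minimal, self-contained sketch of that capture/replay pattern in plain PyTorch, separate from the Dia2 model:

```python
# Sketch of the capture/replay pattern used above; the tensors are arbitrary placeholders.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    x = torch.randn(8, 8, device=device)
    w = torch.randn(8, 8, device=device)
    torch.matmul(x, w)                 # warm up cuBLAS outside the capture (cf. _ensure_graph_cublas_ready)
    torch.cuda.synchronize()

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        y = x @ w                      # static input/output buffers are recorded

    x.copy_(torch.randn(8, 8, device=device))  # refresh inputs in place...
    graph.replay()                              # ...then replay the recorded kernels
    print(y.sum().item())
```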
dia2/runtime/guidance.py
ADDED
@@ -0,0 +1,38 @@
from __future__ import annotations

import torch

from .sampler import sample_token


def apply_classifier_guidance(
    logits: torch.Tensor,
    cfg_active: bool,
    scale: float,
    top_k: int,
) -> torch.Tensor:
    if not cfg_active:
        return logits
    conditional = logits[0:1]
    unconditional = logits[1:2]
    cond32 = conditional.to(torch.float32)
    uncond32 = unconditional.to(torch.float32)
    guided = torch.lerp(uncond32, cond32, scale)
    if top_k > 0 and guided.shape[-1] > 0:
        k = min(top_k, guided.shape[-1])
        # sorted=True so the last returned value is the k-th largest, i.e. the cut-off.
        threshold = torch.topk(guided, k=k, dim=-1, sorted=True).values[..., -1:]
        mask = guided >= threshold
        neg_inf = torch.full_like(cond32, float("-inf"))
        cond32 = torch.where(mask, cond32, neg_inf)
    return cond32.to(conditional.dtype)


def sample_audio_logits(logits: torch.Tensor, temp: float, top_k: int) -> torch.Tensor:
    """Sample a single audio token (shape [1]) from logits."""
    return (
        sample_token(
            logits,
            temp=temp,
            top_k=top_k,
        ).view(1)
    )
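A small sketch of how the two stacked branches feed classifier-free guidance: row 0 holds the conditional logits, row 1 the unconditional ones; the guided result keeps only the top-k entries before a token is drawn. The tensors here are random placeholders.

```python
import torch

from dia2.runtime.guidance import apply_classifier_guidance, sample_audio_logits

logits = torch.randn(2, 1, 16)  # [2 branches, 1 step, vocab]
guided = apply_classifier_guidance(logits, cfg_active=True, scale=2.0, top_k=8)
token = sample_audio_logits(guided, temp=0.8, top_k=4)
print(guided.shape, token.shape)  # torch.Size([1, 1, 16]) and torch.Size([1])
```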
dia2/runtime/logger.py
ADDED
@@ -0,0 +1,33 @@
from __future__ import annotations

import time
from typing import Optional


class RuntimeLogger:
    def __init__(self, enabled: bool) -> None:
        self.enabled = enabled
        self.start_time = time.perf_counter()
        self.last_time = self.start_time
        self.last_step = 0

    def event(self, message: str) -> None:
        if self.enabled:
            print(f"[dia2] {message}")

    def progress(self, step: int, total: Optional[int] = None) -> None:
        if not self.enabled:
            return
        now = time.perf_counter()
        delta_t = max(now - self.last_time, 1e-6)
        delta_steps = max(step - self.last_step, 1)
        speed = delta_steps / delta_t
        if total is None:
            self.event(f"step {step} :: {speed:.1f} toks/s")
        else:
            self.event(f"step {step}/{total} :: {speed:.1f} toks/s")
        self.last_time = now
        self.last_step = step

    def elapsed(self) -> float:
        return time.perf_counter() - self.start_time


__all__ = ["RuntimeLogger"]
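A quick usage sketch (not part of the diff): progress() prints a steps-per-second estimate between calls, and enabled=False silences everything.

```python
from dia2.runtime.logger import RuntimeLogger

log = RuntimeLogger(enabled=True)
log.event("starting warmup")
for step in range(1, 25):
    if step % 12 == 0:
        log.progress(step, total=24)
print(f"elapsed: {log.elapsed():.3f}s")
```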
dia2/runtime/sampler.py
ADDED
@@ -0,0 +1,37 @@
from __future__ import annotations

import torch


def sample_token(
    logits: torch.Tensor,
    *,
    temp: float,
    top_k: int = 0,
) -> torch.Tensor:
    logits32 = logits.to(torch.float32)
    if temp <= 0.0:
        return torch.argmax(logits32, dim=-1, keepdim=True)
    probs = torch.softmax(logits32 / max(temp, 1e-6), dim=-1)
    probs = torch.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0)
    probs = torch.clamp_min(probs, 0.0)
    flat = probs.reshape(-1, probs.shape[-1])
    norm = flat.sum(dim=-1, keepdim=True)
    zero_mask = norm <= 0
    norm = norm.clamp_min(1e-12)
    flat = flat / norm
    if zero_mask.any():
        filler = torch.zeros_like(flat)
        filler[..., 0] = 1.0
        mask = zero_mask.expand_as(flat)
        flat = torch.where(mask, filler, flat)
    vocab = flat.shape[-1]
    if top_k > 0 and top_k < vocab:
        topv, indices = torch.topk(flat, top_k, dim=-1)
        topv = topv / topv.sum(dim=-1, keepdim=True).clamp_min(1e-12)
        draws = torch.multinomial(topv, num_samples=1)
        picks = torch.gather(indices, dim=-1, index=draws)
    else:
        picks = torch.multinomial(flat, num_samples=1)
    picks = picks.reshape(*probs.shape[:-1], 1)
    return picks
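A minimal usage sketch (not part of the diff): temp <= 0 falls back to greedy argmax, otherwise the distribution is renormalized and, with top_k set, restricted to the k most likely ids before sampling.

```python
import torch

from dia2.runtime.sampler import sample_token

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])        # [batch, vocab]
greedy = sample_token(logits, temp=0.0)                # argmax -> index 0
stochastic = sample_token(logits, temp=1.0, top_k=2)   # multinomial over the top 2 ids
print(greedy.shape, stochastic.shape)                  # both torch.Size([1, 1])
```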
dia2/runtime/script_parser.py
ADDED
@@ -0,0 +1,69 @@
from __future__ import annotations

import re
from typing import List, Optional, Sequence

from .state_machine import Entry


def parse_script(
    script: Sequence[str],
    tokenizer,
    constants,
    frame_rate: float,
) -> List[Entry]:
    entries: List[Entry] = []
    speaker_tokens = [constants.spk1, constants.spk2]
    padding_between = 1
    event_re = re.compile(r"(?:<break\s+time=\"([0-9]+(?:\.[0-9]*)?)s\"\s*/?>)|(?:\s+)")
    last_speaker_idx = [None]

    def add_entry(idx: int, word: str, *, pending: Optional[int], first_content: List[bool]):
        tokens: List[int]
        if pending is not None:
            prefix = "[S1]" if pending == constants.spk1 else "[S2]"
            tokens = tokenizer.encode(f"{prefix} {word}", add_special_tokens=False)
        else:
            tokens = tokenizer.encode(word, add_special_tokens=False)
        if first_content[0]:
            if speaker_tokens:
                speaker_idx = idx % len(speaker_tokens)
                speaker_token = speaker_tokens[speaker_idx]
                if speaker_token is not None and last_speaker_idx[0] != speaker_idx:
                    if not tokens or tokens[0] != speaker_token:
                        tokens.insert(0, speaker_token)
                    last_speaker_idx[0] = speaker_idx
            first_content[0] = False
        padding = max(0, padding_between + len(tokens) - 1)
        entries.append(Entry(tokens=tokens, text=word, padding=padding))

    for idx, line in enumerate(script):
        normalized = line.replace("’", "'").replace(":", " ")
        remaining = normalized
        first_content = [True]
        pending_speaker: Optional[int] = None
        while remaining:
            match = event_re.search(remaining)
            if match is None:
                segment = remaining
                remaining = ""
            else:
                segment = remaining[: match.start()]
                remaining = remaining[match.end() :]
            if segment:
                for raw_word in segment.split():
                    if raw_word in ("[S1]", "[S2]"):
                        pending_speaker = (
                            constants.spk1 if raw_word == "[S1]" else constants.spk2
                        )
                        continue
                    add_entry(idx, raw_word, pending=pending_speaker, first_content=first_content)
                    pending_speaker = None
            if match and match.group(1):
                seconds = float(match.group(1))
                padding = int(round(seconds * frame_rate))
                if padding > 0:
                    entries.append(Entry(tokens=[], text="", padding=padding))
            if remaining:
                continue
    return entries
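An illustrative call to parse_script (not part of the diff) using a stand-in tokenizer and constants object; the real runtime passes the Hugging Face tokenizer and the TokenIds built in context.py, and the frame rate here is an arbitrary example value.

```python
from types import SimpleNamespace

from dia2.runtime.script_parser import parse_script

class ToyTokenizer:
    def encode(self, text, add_special_tokens=False):
        # crude stand-in: one fake id per whitespace-separated token
        return [hash(tok) % 1000 for tok in text.split()]

constants = SimpleNamespace(spk1=10, spk2=11)
script = ['[S1] Hello there <break time="0.5s"/> [S2] Hi!']
entries = parse_script(script, ToyTokenizer(), constants, frame_rate=12.5)
for e in entries:
    print(e.text, e.tokens, e.padding)   # the empty-text entry carries the break padding
```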
dia2/runtime/state_machine.py
ADDED
@@ -0,0 +1,170 @@
from __future__ import annotations

from collections import deque
from dataclasses import dataclass, field
from typing import Deque, Iterable, List, Sequence, Tuple


@dataclass
class TokenIds:
    card: int
    new_word: int
    pad: int
    bos: int
    zero: int
    spk1: int
    spk2: int
    audio_pad: int
    audio_bos: int
    ungenerated: int = -2


@dataclass
class Entry:
    tokens: List[int]
    text: str
    padding: int = 0


@dataclass
class State:
    entries: Deque[Entry]
    padding_budget: int
    forced_padding: int
    pending_tokens: Deque[int] = field(default_factory=deque)
    lookahead_tokens: Deque[int] = field(default_factory=deque)
    end_step: int | None = None
    consumption_times: List[int] = field(default_factory=list)
    transcript: List[Tuple[str, int]] = field(default_factory=list)

    def peek_tokens(self, count: int) -> List[int]:
        """Return tokens from upcoming entries (used for second-stream lookahead)."""
        assert count > 0
        for entry in self.entries:
            if entry.tokens:
                count -= 1
                if count == 0:
                    return entry.tokens
        return []


class StateMachine:
    def __init__(
        self,
        token_ids: TokenIds,
        *,
        second_stream_ahead: int = 0,
        max_padding: int = 6,
        initial_padding: int = 0,
    ) -> None:
        self.token_ids = token_ids
        self.second_stream_ahead = second_stream_ahead
        self.max_padding = max_padding
        self.initial_padding = initial_padding

    def new_state(self, entries: Iterable[Entry]) -> State:
        return State(
            entries=deque(entries),
            padding_budget=self.initial_padding,
            forced_padding=self.initial_padding,
        )

    def process(
        self,
        step: int,
        state: State,
        token: int,
        is_forced: bool = False,
    ) -> Tuple[int, int, bool]:
        token = self._sanitize_token(token)
        token = self._enforce_token_constraints(state, token, is_forced)
        token, consumed_new_word = self._handle_new_word(step, state, token)
        output_token = self._select_output_token(state, token)
        final_main, final_second = self._maybe_multiplex_second_stream(
            state, output_token
        )
        return final_main, final_second, consumed_new_word

    def _sanitize_token(self, token: int) -> int:
        if token == 1:
            token = self.token_ids.new_word
        elif token == 0:
            token = self.token_ids.pad
        if token not in (self.token_ids.new_word, self.token_ids.pad):
            return self.token_ids.pad
        return token

    def _enforce_token_constraints(
        self, state: State, token: int, is_forced: bool
    ) -> int:
        if state.pending_tokens:
            return self.token_ids.pad
        if is_forced:
            return token
        if state.forced_padding > 0:
            if token != self.token_ids.pad:
                token = self.token_ids.pad
            return token
        if state.padding_budget <= 0 and token != self.token_ids.new_word:
            return self.token_ids.new_word
        return token

    def _handle_new_word(
        self, step: int, state: State, token: int
    ) -> Tuple[int, bool]:
        if token != self.token_ids.new_word:
            return token, False
        if state.entries:
            entry = state.entries.popleft()
            state.consumption_times.append(step)
            if entry.tokens:
                state.transcript.append((entry.text, step))
                state.pending_tokens.extend(entry.tokens)
                if self.second_stream_ahead:
                    state.lookahead_tokens.extend(
                        state.peek_tokens(self.second_stream_ahead)
                    )
                state.padding_budget = self.max_padding
            else:
                token = self.token_ids.pad
            state.forced_padding = entry.padding
            return token, True
        token = self.token_ids.pad
        if self.second_stream_ahead and state.end_step is None:
            token = self.token_ids.new_word
        if state.end_step is None:
            state.end_step = step
        return token, False

    def _select_output_token(self, state: State, token: int) -> int:
        if token == self.token_ids.pad:
            if state.padding_budget > 0:
                state.padding_budget -= 1
            if state.forced_padding > 0:
                state.forced_padding -= 1
            if state.pending_tokens:
                return state.pending_tokens.popleft()
            return self.token_ids.pad
        if token == self.token_ids.new_word:
            return self.token_ids.new_word
        if token == self.token_ids.zero:
            return token
        raise RuntimeError(f"Invalid token {token}")

    def _maybe_multiplex_second_stream(
        self, state: State, output: int
    ) -> Tuple[int, int]:
        if not self.second_stream_ahead:
            return output, output
        second = -1
        if output == self.token_ids.new_word:
            second = self.token_ids.new_word
            if state.pending_tokens:
                output = state.pending_tokens.popleft()
            else:
                output = self.token_ids.pad
        elif state.lookahead_tokens:
            second = state.lookahead_tokens.popleft()
        else:
            second = self.token_ids.pad
        return output, second
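A small illustrative driver for the text-stream state machine (not part of the diff). The token ids below are made-up placeholders, not the model's real vocabulary; it just shows how a NEW_WORD prediction pops an Entry and how the entry's tokens are then drained one per step.

```python
from dia2.runtime.state_machine import Entry, StateMachine, TokenIds

ids = TokenIds(
    card=1000, new_word=3, pad=4, bos=1, zero=5,
    spk1=10, spk2=11, audio_pad=200, audio_bos=201,
)
machine = StateMachine(ids, second_stream_ahead=0, max_padding=6, initial_padding=0)
state = machine.new_state([Entry(tokens=[42, 43], text="hello", padding=1)])

# Feed one NEW_WORD prediction, then PADs; tokens 42 and 43 come out on later steps.
for step, predicted in enumerate([ids.new_word, ids.pad, ids.pad, ids.pad]):
    main, second, consumed = machine.process(step, state, predicted)
    print(step, main, second, consumed)
```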
dia2/runtime/voice_clone.py
ADDED
@@ -0,0 +1,190 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Callable, List, Optional, Sequence, TYPE_CHECKING

import numpy as np
import torch

from ..generation import PrefixConfig
from .audio_io import encode_audio_tokens, load_mono_audio
from .state_machine import Entry

if TYPE_CHECKING:  # pragma: no cover
    from .context import RuntimeContext


@dataclass
class WhisperWord:
    text: str
    start: float
    end: float


@dataclass
class PrefixPlan:
    entries: List[Entry]
    new_word_steps: List[int]
    aligned_tokens: torch.Tensor
    aligned_frames: int


def build_prefix_plan(
    runtime: "RuntimeContext",
    prefix: Optional[PrefixConfig],
    *,
    transcribe_fn: Optional[Callable[[str, torch.device], List[WhisperWord]]] = None,
    load_audio_fn: Optional[Callable[[str, int], np.ndarray]] = None,
    encode_fn: Optional[Callable[[np.ndarray], torch.Tensor]] = None,
) -> Optional[PrefixPlan]:
    if prefix is None:
        return None
    if not prefix.speaker_1:
        if prefix.speaker_2:
            raise ValueError("speaker_2 requires speaker_1 to be provided")
        return None

    transcribe = transcribe_fn or (lambda path, device: transcribe_words(path, device))
    load_audio = load_audio_fn or (lambda path, sr: load_mono_audio(path, sr))
    encode_audio = encode_fn or (lambda audio: encode_audio_tokens(runtime.mimi, audio))

    entries1, steps1, tokens1 = _process_prefix_audio(
        runtime=runtime,
        audio_path=prefix.speaker_1,
        speaker_token=runtime.constants.spk1,
        transcribe=transcribe,
        load_audio=load_audio,
        encode_audio=encode_audio,
    )
    offset = 3  # Match legacy BOS/PAD offset
    entries = list(entries1)
    new_word_steps = [step + offset for step in steps1]
    audio_tokens = tokens1.to(runtime.device)

    if prefix.speaker_2:
        entries2, steps2, tokens2 = _process_prefix_audio(
            runtime=runtime,
            audio_path=prefix.speaker_2,
            speaker_token=runtime.constants.spk2,
            transcribe=transcribe,
            load_audio=load_audio,
            encode_audio=encode_audio,
        )
        spk1_frames = audio_tokens.shape[-1]
        new_word_steps.extend(step + spk1_frames for step in steps2)
        entries.extend(entries2)
        audio_tokens = torch.cat([audio_tokens, tokens2.to(runtime.device)], dim=1)

    return PrefixPlan(
        entries=entries,
        new_word_steps=new_word_steps,
        aligned_tokens=audio_tokens,
        aligned_frames=audio_tokens.shape[-1],
    )


def _process_prefix_audio(
    runtime: "RuntimeContext",
    audio_path: str,
    speaker_token: int,
    *,
    transcribe: Callable[[str, torch.device], List[WhisperWord]],
    load_audio: Callable[[str, int], np.ndarray],
    encode_audio: Callable[[np.ndarray], torch.Tensor],
) -> tuple[List[Entry], List[int], torch.Tensor]:
    words = transcribe(audio_path, runtime.device)
    entries, steps = words_to_entries(
        words=words,
        tokenizer=runtime.tokenizer,
        speaker_token=speaker_token,
        frame_rate=runtime.frame_rate,
    )
    audio = load_audio(audio_path, runtime.mimi.sample_rate)
    tokens = encode_audio(audio)
    return entries, steps, tokens


def transcribe_words(
    audio_path: str,
    device: torch.device,
    language: Optional[str] = None,
) -> List[WhisperWord]:
    import whisper_timestamped as wts  # Imported lazily

    model = wts.load_model("openai/whisper-large-v3", device=str(device))
    result = wts.transcribe(model, audio_path, language=language)

    words: List[WhisperWord] = []
    for segment in result.get("segments", []):
        for word in segment.get("words", []):
            text = (word.get("text") or word.get("word") or "").strip()
            if not text:
                continue
            words.append(
                WhisperWord(
                    text=text,
                    start=float(word.get("start", 0.0)),
                    end=float(word.get("end", 0.0)),
                )
            )
    return words


def words_to_entries(
    *,
    words: Sequence[WhisperWord],
    tokenizer,
    speaker_token: int,
    frame_rate: float,
) -> tuple[List[Entry], List[int]]:
    entries: List[Entry] = []
    new_word_steps: List[int] = []
    if not words:
        return entries, new_word_steps

    convert = getattr(tokenizer, "convert_tokens_to_ids", None)
    speaker_prefix: Optional[str] = None
    if callable(convert):
        s1_id = convert("[S1]")
        s2_id = convert("[S2]")
        if speaker_token == s1_id:
            speaker_prefix = "[S1]"
        elif speaker_token == s2_id:
            speaker_prefix = "[S2]"
    pending_prefix: Optional[str] = speaker_prefix
    current_pos = 0

    for idx, word in enumerate(words):
        tokens = _encode_word(word.text, tokenizer, pending_prefix)
        pending_prefix = None
        start_frame = max(current_pos + 1, int(round(word.start * frame_rate)))
        end_frame = start_frame + len(tokens)
        new_word_steps.append(start_frame - 1)

        if idx < len(words) - 1:
            next_start = int(round(words[idx + 1].start * frame_rate))
            next_word_start = max(end_frame + 1, next_start)
        else:
            end_time = int(round(words[-1].end * frame_rate))
            next_word_start = max(end_frame + 1, end_time)

        padding = max(0, next_word_start - start_frame - 1)
        entries.append(Entry(tokens=tokens, text=word.text, padding=padding))
        current_pos = end_frame

    return entries, new_word_steps


def _encode_word(text: str, tokenizer, prefix: Optional[str]) -> List[int]:
    if prefix:
        return tokenizer.encode(f"{prefix} {text}", add_special_tokens=False)
    return tokenizer.encode(text, add_special_tokens=False)


__all__ = [
    "PrefixPlan",
    "WhisperWord",
    "build_prefix_plan",
    "transcribe_words",
    "words_to_entries",
]
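An illustrative use of words_to_entries (not part of the diff), with made-up Whisper timestamps and a toy tokenizer; the real path uses whisper_timestamped output and the HF tokenizer, and 12.5 is only an example frame rate.

```python
from dia2.runtime.voice_clone import WhisperWord, words_to_entries

class ToyTokenizer:
    def convert_tokens_to_ids(self, token):
        return {"[S1]": 10, "[S2]": 11}.get(token, 0)

    def encode(self, text, add_special_tokens=False):
        return [hash(tok) % 1000 for tok in text.split()]

words = [
    WhisperWord(text="hello", start=0.10, end=0.35),
    WhisperWord(text="world", start=0.40, end=0.70),
]
entries, new_word_steps = words_to_entries(
    words=words, tokenizer=ToyTokenizer(), speaker_token=10, frame_rate=12.5
)
print(new_word_steps, [(e.text, e.padding) for e in entries])
```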
input.txt
ADDED
@@ -0,0 +1 @@
[S1] Um, like, I don't know, I've never actually, like, been on a real vacation, you know? [S2] Oh, seriously? That's wild. I've, uh, only been on, like, one trip myself, and it was kinda stressful. [S1] Yeah, I always see people going places on, like, Instagram, but then I'm just, um, at home thinking, "Maybe next year." [S2] Honestly, same. I, like, plan stuff in my head but then forget or just, you know, bail at the last minute. [S1] So, we should, like, totally go somewhere together one day. [S2] For real, that would be awesome.
pyproject.toml
ADDED
@@ -0,0 +1,45 @@
[build-system]
requires = ["setuptools>=70.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "dia2"
version = "0.1.0"
description = "Dia2 CUDA-only text-to-speech runtime"
readme = "README.md"
requires-python = ">=3.10"
authors = [{ name = "Dia Contributors" }]
license = { file = "LICENSE" }
dependencies = [
    "torch>=2.8.0",
    "numpy>=2.1.0,<3.0",
    "transformers>=4.55.3",
    "safetensors==0.5.3",
    "huggingface-hub>=0.24.7",
    "sphn>=0.2.0",
    "soundfile>=0.12.1",
    "whisper-timestamped>=1.14.2",
    "gradio>=4.44.1",
]

[project.optional-dependencies]
dev = [
    "ruff>=0.6.9",
    "pyright>=1.1.385",
]

[tool.uv]
package = true

[tool.uv.sources]
torch = [
    { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]

[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
explicit = true

[tool.setuptools]
packages = ["dia2"]
uv.lock
ADDED
The diff for this file is too large to render. See raw diff.