Spaces:

SakibAhmed
/

Piper-TTS-API-and-Frontend

Paused

App Files Files Community

SakibAhmed committed on Nov 5

Commit

c0b6368

verified ·

1 Parent(s): 590e0ea

Upload 8 files

Browse files

Files changed (8) hide show

Dockerfile +20 -0
README.md +1 -11
app.py +147 -0
postman.json +42 -0
requirements.txt +3 -0
templates/index.html +62 -0
voices/en_GB-alba-medium.onnx +3 -0
voices/en_GB-alba-medium.onnx.json +493 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Dockerfile
+# Container image for the Piper TTS Flask app; the server listens on port 7860.
+FROM python:3.11
+WORKDIR /app
+# Install dependencies before copying the sources so this layer stays cached
+# when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY . .
+# Run as an unprivileged user (UID 1000) rather than root.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+EXPOSE 7860
+# Run one Flask server in the foreground. Exec form makes it PID 1 so it
+# receives stop signals directly; `flask run` picks up app.py from WORKDIR.
+CMD ["python", "-m", "flask", "run", "--host=0.0.0.0", "--port=7860"]

README.md CHANGED Viewed

@@ -1,11 +1 @@
----
-title: Piper TTS API And Frontend
-emoji: 👀
-colorFrom: blue
-colorTo: gray
-sdk: docker
-pinned: false
-short_description: Piper-TTS-API-and-Frontend
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


1	+ # Piper-TTS-API-and-Frontend

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import io
+import logging
+import wave
+from pathlib import Path
+import struct
+from flask import Flask, Response, jsonify, render_template, request, send_file, stream_with_context
+from flask_cors import CORS
+from piper import PiperVoice
+# Configure logging: root logger at INFO level, plus a logger for this module.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Flask application object; the route decorators below register on it.
+app = Flask(__name__)
+CORS(app)  # This will enable CORS for all routes.
+# In-memory cache of PiperVoice instances, keyed by voice name, so a model
+# that loaded successfully is reused by later requests.
+tts_instances = {}
+# Directory where voice models are stored: the "voices" folder next to this file.
+VOICES_DIR = Path(__file__).parent / "voices"
+def get_tts_instance(voice):
+    """
+    Retrieves a cached PiperVoice instance or creates a new one.
+    Loads the model and its required .onnx.json config file.
+    """
+    if voice not in tts_instances:
+        logger.info(f"Creating new PiperVoice instance for voice: {voice}")
+        try:
+            model_path, config_path = None, None
+            possible_paths = [
+                VOICES_DIR / f"{voice}.onnx",
+                Path(__file__).parent / f"{voice}.onnx",
+                Path(f"{voice}.onnx"),
+            ]
+            for path in possible_paths:
+                if path.exists():
+                    model_path = str(path)
+                    potential_config_path = path.with_suffix(".onnx.json")
+                    if potential_config_path.exists():
+                        config_path = str(potential_config_path)
+                        logger.info(f"Found model at: {model_path}")
+                        logger.info(f"Found config at: {config_path}")
+                        break
+            if not model_path or not config_path:
+                logger.error(f"Voice model or config not found for '{voice}'. Ensure both '.onnx' and '.onnx.json' are present.")
+                return None
+            tts_instances[voice] = PiperVoice.load(model_path, config_path=config_path)
+        except Exception as e:
+            logger.error(f"Failed to create PiperVoice instance for voice {voice}: {e}", exc_info=True)
+            return None
+    return tts_instances[voice]
+@app.route('/')
+def index():
+    """Serves the index.html frontend."""
+    return render_template('index.html')
+@app.route('/api/tts', methods=['GET'])
+def synthesize_audio_full():
+    """
+    Generates the full audio file and returns it.
+    """
+    text = request.args.get('text')
+    voice = request.args.get('voice', 'en_GB-alba-medium')
+    if not text:
+        return jsonify({"error": "Text to synthesize is required."}), 400
+    tts_instance = get_tts_instance(voice)
+    if not tts_instance:
+        return jsonify({"error": f"Could not load voice model for '{voice}'."}), 500
+    try:
+        wav_io = io.BytesIO()
+        with wave.open(wav_io, 'wb') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(tts_instance.config.sample_rate)
+            # THE CORRECT FIX, BASED ON YOUR PROVIDED `piper_tts.py`
+            # The AudioChunk object has a specific attribute for the raw bytes.
+            for audio_chunk in tts_instance.synthesize(text):
+                wav_file.writeframes(audio_chunk.audio_int16_bytes)
+        wav_io.seek(0)
+        return send_file(
+            wav_io,
+            mimetype='audio/wav',
+            as_attachment=True,
+            download_name='output.wav'
+        )
+    except Exception as e:
+        logger.error(f"Error during full synthesis: {e}", exc_info=True)
+        return jsonify({"error": f"Failed to synthesize audio: {str(e)}"}), 500
+def generate_audio_stream(tts_instance, text):
+    """A generator function that streams the synthesized audio."""
+    try:
+        # 1. Create and yield the WAV header.
+        def create_wav_header(sample_rate, bits_per_sample=16, channels=1):
+            datasize = 2**32 - 1 # Use max value for streaming
+            o = [b'RIFF', struct.pack('<I', datasize + 36), b'WAVE', b'fmt ',
+                 struct.pack('<I', 16), struct.pack('<H', 1), struct.pack('<H', channels),
+                 struct.pack('<I', sample_rate),
+                 struct.pack('<I', sample_rate * channels * bits_per_sample // 8),
+                 struct.pack('<H', channels * bits_per_sample // 8),
+                 struct.pack('<H', bits_per_sample), b'data', struct.pack('<I', datasize)]
+            return b"".join(o)
+        header = create_wav_header(tts_instance.config.sample_rate)
+        yield header
+        # 2. THE CORRECT FIX, APPLIED TO STREAMING
+        # Yield the raw bytes from the .audio_int16_bytes attribute.
+        for audio_chunk in tts_instance.synthesize(text):
+            yield audio_chunk.audio_int16_bytes
+    except Exception as e:
+        logger.error(f"Error during stream generation: {e}", exc_info=True)
+@app.route('/api/tts-stream', methods=['GET'])
+def synthesize_audio_stream():
+    """
+    Streams the synthesized audio back to the client as it's generated.
+    """
+    text = request.args.get('text')
+    voice = request.args.get('voice', 'en_GB-alba-medium')
+    if not text:
+        return jsonify({"error": "Text to synthesize is required."}), 400
+    tts_instance = get_tts_instance(voice)
+    if not tts_instance:
+        return jsonify({"error": f"Could not load voice model for '{voice}'."}), 500
+    stream_generator = generate_audio_stream(tts_instance, text)
+    return Response(stream_with_context(stream_generator), mimetype='audio/wav')
+# Direct-execution entry point (`python app.py`): starts Flask's built-in
+# server on port 5001 with debug mode enabled.
+if __name__ == '__main__':
+    app.run(debug=True, port=5001)

postman.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+	"info": {
+		"_postman_id": "YOUR_COLLECTION_ID",
+		"name": "NOW Piper TTS API",
+		"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
+	},
+	"item": [
+		{
+			"name": "Synthesize Audio",
+			"request": {
+				"method": "GET",
+				"header": [],
+				"url": {
+					"raw": "http://127.0.0.1:5001/api/tts?text=Hello%2C%20this%20is%20a%20test.&voice=en_US-lessac-medium",
+					"protocol": "http",
+					"host": [
+						"127",
+						"0",
+						"0",
+						"1"
+					],
+					"port": "5001",
+					"path": [
+						"api",
+						"tts"
+					],
+					"query": [
+						{
+							"key": "text",
+							"value": "Hello, this is a test."
+						},
+						{
+							"key": "voice",
+							"value": "en_US-lessac-medium"
+						}
+					]
+				}
+			},
+			"response": []
+		}
+	]
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+Flask==3.1.2
+Flask_Cors==5.0.0
+piper_tts==1.3.0

templates/index.html ADDED Viewed

	@@ -0,0 +1,62 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Piper TTS</title>
+    <style>
+        body { font-family: sans-serif; margin: 2em; }
+        textarea { width: 100%; height: 100px; }
+        button { margin-top: 1em; }
+        audio { margin-top: 1em; }
+    </style>
+</head>
+<body>
+    <h1>Piper Text-to-Speech</h1>
+    <textarea id="text-input" placeholder="Enter text to synthesize..."></textarea>
+    <br>
+    <label for="voice-select">Select Voice:</label>
+    <select id="voice-select">
+        <!-- Each value must be the name of a model in voices/ (file name without ".onnx"). -->
+        <option value="en_GB-alba-medium">English (GB) - Alba Medium</option>
+        <!-- Add more voice options here as you download models -->
+    </select>
+    <br>
+    <button id="synthesize-button">Synthesize</button>
+    <br>
+    <audio id="audio-player" controls></audio>
+    <script>
+        const textInput = document.getElementById('text-input');
+        const voiceSelect = document.getElementById('voice-select');
+        const synthesizeButton = document.getElementById('synthesize-button');
+        const audioPlayer = document.getElementById('audio-player');
+        // Object URL of the clip currently loaded in the player, if any.
+        let currentAudioUrl = null;
+        // On click: request the full WAV from /api/tts and play it.
+        synthesizeButton.addEventListener('click', async () => {
+            const text = textInput.value;
+            const voice = voiceSelect.value;
+            if (!text) {
+                alert('Please enter some text to synthesize.');
+                return;
+            }
+            try {
+                const response = await fetch(`/api/tts?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}`);
+                if (response.ok) {
+                    const blob = await response.blob();
+                    const previousUrl = currentAudioUrl;
+                    currentAudioUrl = URL.createObjectURL(blob);
+                    audioPlayer.src = currentAudioUrl;
+                    // Release the previous clip so its blob is not kept alive.
+                    if (previousUrl) {
+                        URL.revokeObjectURL(previousUrl);
+                    }
+                    audioPlayer.play();
+                } else {
+                    // Error responses from the API are JSON: {"error": "..."}.
+                    const error = await response.json();
+                    alert(`Error: ${error.error}`);
+                }
+            } catch (error) {
+                console.error('Error fetching audio:', error);
+                alert('An unexpected error occurred.');
+            }
+        });
+    </script>
+</body>
+</html>

voices/en_GB-alba-medium.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:401369c4a81d09fdd86c32c5c864440811dbdcc66466cde2d64f7133a66ad03b
+size 63201294

voices/en_GB-alba-medium.onnx.json ADDED Viewed

	@@ -0,0 +1,493 @@

+{
+  "audio": {
+    "sample_rate": 22050,
+    "quality": "medium"
+  },
+  "espeak": {
+    "voice": "en-gb-x-rp"
+  },
+  "inference": {
+    "noise_scale": 0.667,
+    "length_scale": 1,
+    "noise_w": 0.8
+  },
+  "phoneme_type": "espeak",
+  "phoneme_map": {},
+  "phoneme_id_map": {
+    "_": [
+      0
+    ],
+    "^": [
+      1
+    ],
+    "$": [
+      2
+    ],
+    " ": [
+      3
+    ],
+    "!": [
+      4
+    ],
+    "'": [
+      5
+    ],
+    "(": [
+      6
+    ],
+    ")": [
+      7
+    ],
+    ",": [
+      8
+    ],
+    "-": [
+      9
+    ],
+    ".": [
+      10
+    ],
+    ":": [
+      11
+    ],
+    ";": [
+      12
+    ],
+    "?": [
+      13
+    ],
+    "a": [
+      14
+    ],
+    "b": [
+      15
+    ],
+    "c": [
+      16
+    ],
+    "d": [
+      17
+    ],
+    "e": [
+      18
+    ],
+    "f": [
+      19
+    ],
+    "h": [
+      20
+    ],
+    "i": [
+      21
+    ],
+    "j": [
+      22
+    ],
+    "k": [
+      23
+    ],
+    "l": [
+      24
+    ],
+    "m": [
+      25
+    ],
+    "n": [
+      26
+    ],
+    "o": [
+      27
+    ],
+    "p": [
+      28
+    ],
+    "q": [
+      29
+    ],
+    "r": [
+      30
+    ],
+    "s": [
+      31
+    ],
+    "t": [
+      32
+    ],
+    "u": [
+      33
+    ],
+    "v": [
+      34
+    ],
+    "w": [
+      35
+    ],
+    "x": [
+      36
+    ],
+    "y": [
+      37
+    ],
+    "z": [
+      38
+    ],
+    "æ": [
+      39
+    ],
+    "ç": [
+      40
+    ],
+    "ð": [
+      41
+    ],
+    "ø": [
+      42
+    ],
+    "ħ": [
+      43
+    ],
+    "ŋ": [
+      44
+    ],
+    "œ": [
+      45
+    ],
+    "ǀ": [
+      46
+    ],
+    "ǁ": [
+      47
+    ],
+    "ǂ": [
+      48
+    ],
+    "ǃ": [
+      49
+    ],
+    "ɐ": [
+      50
+    ],
+    "ɑ": [
+      51
+    ],
+    "ɒ": [
+      52
+    ],
+    "ɓ": [
+      53
+    ],
+    "ɔ": [
+      54
+    ],
+    "ɕ": [
+      55
+    ],
+    "ɖ": [
+      56
+    ],
+    "ɗ": [
+      57
+    ],
+    "ɘ": [
+      58
+    ],
+    "ə": [
+      59
+    ],
+    "ɚ": [
+      60
+    ],
+    "ɛ": [
+      61
+    ],
+    "ɜ": [
+      62
+    ],
+    "ɞ": [
+      63
+    ],
+    "ɟ": [
+      64
+    ],
+    "ɠ": [
+      65
+    ],
+    "ɡ": [
+      66
+    ],
+    "ɢ": [
+      67
+    ],
+    "ɣ": [
+      68
+    ],
+    "ɤ": [
+      69
+    ],
+    "ɥ": [
+      70
+    ],
+    "ɦ": [
+      71
+    ],
+    "ɧ": [
+      72
+    ],
+    "ɨ": [
+      73
+    ],
+    "ɪ": [
+      74
+    ],
+    "ɫ": [
+      75
+    ],
+    "ɬ": [
+      76
+    ],
+    "ɭ": [
+      77
+    ],
+    "ɮ": [
+      78
+    ],
+    "ɯ": [
+      79
+    ],
+    "ɰ": [
+      80
+    ],
+    "ɱ": [
+      81
+    ],
+    "ɲ": [
+      82
+    ],
+    "ɳ": [
+      83
+    ],
+    "ɴ": [
+      84
+    ],
+    "ɵ": [
+      85
+    ],
+    "ɶ": [
+      86
+    ],
+    "ɸ": [
+      87
+    ],
+    "ɹ": [
+      88
+    ],
+    "ɺ": [
+      89
+    ],
+    "ɻ": [
+      90
+    ],
+    "ɽ": [
+      91
+    ],
+    "ɾ": [
+      92
+    ],
+    "ʀ": [
+      93
+    ],
+    "ʁ": [
+      94
+    ],
+    "ʂ": [
+      95
+    ],
+    "ʃ": [
+      96
+    ],
+    "ʄ": [
+      97
+    ],
+    "ʈ": [
+      98
+    ],
+    "ʉ": [
+      99
+    ],
+    "ʊ": [
+      100
+    ],
+    "ʋ": [
+      101
+    ],
+    "ʌ": [
+      102
+    ],
+    "ʍ": [
+      103
+    ],
+    "ʎ": [
+      104
+    ],
+    "ʏ": [
+      105
+    ],
+    "ʐ": [
+      106
+    ],
+    "ʑ": [
+      107
+    ],
+    "ʒ": [
+      108
+    ],
+    "ʔ": [
+      109
+    ],
+    "ʕ": [
+      110
+    ],
+    "ʘ": [
+      111
+    ],
+    "ʙ": [
+      112
+    ],
+    "ʛ": [
+      113
+    ],
+    "ʜ": [
+      114
+    ],
+    "ʝ": [
+      115
+    ],
+    "ʟ": [
+      116
+    ],
+    "ʡ": [
+      117
+    ],
+    "ʢ": [
+      118
+    ],
+    "ʲ": [
+      119
+    ],
+    "ˈ": [
+      120
+    ],
+    "ˌ": [
+      121
+    ],
+    "ː": [
+      122
+    ],
+    "ˑ": [
+      123
+    ],
+    "˞": [
+      124
+    ],
+    "β": [
+      125
+    ],
+    "θ": [
+      126
+    ],
+    "χ": [
+      127
+    ],
+    "ᵻ": [
+      128
+    ],
+    "ⱱ": [
+      129
+    ],
+    "0": [
+      130
+    ],
+    "1": [
+      131
+    ],
+    "2": [
+      132
+    ],
+    "3": [
+      133
+    ],
+    "4": [
+      134
+    ],
+    "5": [
+      135
+    ],
+    "6": [
+      136
+    ],
+    "7": [
+      137
+    ],
+    "8": [
+      138
+    ],
+    "9": [
+      139
+    ],
+    "̧": [
+      140
+    ],
+    "̃": [
+      141
+    ],
+    "̪": [
+      142
+    ],
+    "̯": [
+      143
+    ],
+    "̩": [
+      144
+    ],
+    "ʰ": [
+      145
+    ],
+    "ˤ": [
+      146
+    ],
+    "ε": [
+      147
+    ],
+    "↓": [
+      148
+    ],
+    "#": [
+      149
+    ],
+    "\"": [
+      150
+    ],
+    "↑": [
+      151
+    ],
+    "̺": [
+      152
+    ],
+    "̻": [
+      153
+    ]
+  },
+  "num_symbols": 256,
+  "num_speakers": 1,
+  "speaker_id_map": {},
+  "piper_version": "1.0.0",
+  "language": {
+    "code": "en_GB",
+    "family": "en",
+    "region": "GB",
+    "name_native": "English",
+    "name_english": "English",
+    "country_english": "Great Britain"
+  },
+  "dataset": "alba"
+}