github-actions[bot] committed on
Commit
68a99fc
Β·
1 Parent(s): 4cf286c

Auto-deploy from GitHub: c1cbfa3a37f6853e24d067af55ebc1ab447d9fc0

Browse files
.gitattributes CHANGED
@@ -1,35 +1,5 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.wav filter=lfs diff=lfs merge=lfs -text
2
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
3
+ *.flac filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  *.pth filter=lfs diff=lfs merge=lfs -text
5
+ *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ # Set working directory
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ # ffmpeg and espeak-ng are required by the TTS engine; git/curl for pip VCS installs
8
+ RUN apt-get update && apt-get install -y \
9
+ ffmpeg \
10
+ git \
11
+ curl \
12
+ espeak-ng \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Copy application files
16
+ COPY . .
17
+
18
+ # Install Python dependencies
19
+ RUN pip install --no-cache-dir -r requirements.txt
20
+
21
+ # Create necessary directories
22
+ RUN mkdir -p uploads temp_dir
23
+
24
+ # Expose port
25
+ EXPOSE 7860
26
+
27
+ # Run only the Flask app (worker starts automatically on first upload)
28
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,31 @@
1
  ---
2
- title: TTS
3
- emoji: πŸ“š
4
  colorFrom: blue
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: TTS Text-to-Speech Generator
3
+ emoji: 🎡
4
  colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # TTS Text-to-Speech Generator
12
+
13
+ A Python-based text-to-speech service with a neobrutalist web interface.
14
+
15
+ ## Features
16
+ - πŸ“ Text-to-Speech generation
17
+ - πŸ€– Multiple voices and speeds
18
+ - πŸ’Ύ SQLite database for queue management
19
+ - 🎨 Neobrutalist UI with smooth animations
20
+ - πŸ”„ Real-time status updates
21
+
22
+ ## Usage
23
+ Access the web interface at the Space URL above.
24
+
25
+ ## API Endpoints
26
+ - POST `/api/generate` - Generate audio from text
27
+ - GET `/api/files` - Get all files
28
+ - GET `/api/download/<id>` - Download generated audio
29
+
30
+ ---
31
+ *Auto-deployed from GitHub*
app.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, send_from_directory, send_file
2
+ from flask_cors import CORS
3
+ import sqlite3
4
+ import os
5
+ import uuid
6
+ from datetime import datetime
7
+ from werkzeug.utils import secure_filename
8
+ import threading
9
+ import subprocess
10
+ import time
11
+ import shutil
12
+
13
+ app = Flask(__name__)
14
+ CORS(app)
15
+
16
+ UPLOAD_FOLDER = 'uploads'
17
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
18
+ os.makedirs('temp_dir', exist_ok=True)
19
+
20
+ # Worker state
21
+ worker_thread = None
22
+ worker_running = False
23
+
24
def init_db():
    """Create the ``tasks`` table in tts_tasks.db if it does not exist.

    One row per TTS request; ``status`` moves through
    not_started -> processing -> completed/failed.
    """
    conn = sqlite3.connect('tts_tasks.db')
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS tasks
                     (id TEXT PRIMARY KEY,
                      text TEXT NOT NULL,
                      voice TEXT,
                      speed REAL,
                      status TEXT NOT NULL,
                      output_file TEXT,
                      created_at TEXT NOT NULL,
                      processed_at TEXT,
                      error TEXT)''')
        conn.commit()
    finally:
        # Release the handle even if the DDL fails (original leaked it).
        conn.close()
39
+
40
def start_worker():
    """Spawn the background TTS worker thread (idempotent).

    Safe to call on every request: does nothing once the worker is up.
    """
    global worker_thread, worker_running

    if worker_running:
        return

    worker_running = True
    worker_thread = threading.Thread(target=worker_loop, daemon=True)
    worker_thread.start()
    print("βœ… Worker thread started")
49
+
50
def worker_loop():
    """Main worker loop that processes TTS tasks"""
    print("πŸ€– TTS Worker started. Monitoring for new tasks...")

    # Subprocess working directory, interpreter, and DB poll cadence.
    CWD = "./"
    PYTHON_PATH = "python3" # Or just python
    POLL_INTERVAL = 2 # seconds

    # Runs until the module-level flag is cleared (set by start_worker).
    while worker_running:
        try:
            # Get next unprocessed task (oldest first, one at a time).
            conn = sqlite3.connect('tts_tasks.db')
            conn.row_factory = sqlite3.Row
            c = conn.cursor()
            c.execute('''SELECT * FROM tasks
                         WHERE status = 'not_started'
                         ORDER BY created_at ASC
                         LIMIT 1''')
            row = c.fetchone()
            conn.close()

            if row:
                task_id = row['id']
                text = row['text']
                voice = row['voice'] or '8'  # Default voice
                speed = row['speed'] or 1.0

                print(f"\n{'='*60}")
                print(f"🎡 Processing Task: {task_id}")
                print(f"πŸ“ Text: {text[:50]}...")
                print(f"{'='*60}")

                # Update status to processing
                update_status(task_id, 'processing')

                try:
                    # Write text to content.txt.
                    # NOTE(review): content.txt and output_audio.wav are shared
                    # fixed paths in CWD β€” only safe with a single worker.
                    with open('content.txt', 'w', encoding='utf-8') as f:
                        f.write(text)

                    # Run TTS command
                    # python3 -m tts_runner.runner --model kokoro --voice <voice> --speed <speed>
                    print(f"πŸ”„ Running TTS...")
                    command = [
                        PYTHON_PATH, "-m", "tts_runner.runner",
                        "--model", "kokoro",
                        "--voice", str(voice),
                        "--speed", str(speed)
                    ]

                    # check=True raises CalledProcessError on non-zero exit,
                    # which is caught below and recorded as a failed task.
                    subprocess.run(
                        command,
                        check=True,
                        cwd=CWD,
                        env={
                            **os.environ,
                            'PYTHONUNBUFFERED': '1',
                            'CUDA_LAUNCH_BLOCKING': '1'
                        }
                    )

                    # Check for output file (presumably written by the runner
                    # to this fixed name β€” see tts_runner).
                    output_filename = "output_audio.wav"
                    if os.path.exists(output_filename):
                        # Move to uploads folder under the task id.
                        target_filename = f"{task_id}.wav"
                        target_path = os.path.join(UPLOAD_FOLDER, target_filename)
                        shutil.move(output_filename, target_path)

                        print(f"βœ… Successfully processed: {target_filename}")

                        # Update database with success
                        update_status(task_id, 'completed', output_file=target_filename)
                    else:
                        raise Exception("Output audio file not found")

                except Exception as e:
                    # Any failure (subprocess error, I/O, missing output)
                    # marks the task failed with the error message.
                    print(f"❌ Failed to process: {task_id}")
                    print(f"Error: {str(e)}")
                    update_status(task_id, 'failed', error=str(e))

            else:
                # No tasks to process, sleep for a bit
                time.sleep(POLL_INTERVAL)

        except Exception as e:
            # DB errors etc. must not kill the worker thread; back off and retry.
            print(f"⚠️ Worker error: {str(e)}")
            time.sleep(POLL_INTERVAL)
138
+
139
def update_status(task_id, status, output_file=None, error=None):
    """Update the status of a task in the database.

    Args:
        task_id: Primary key of the task row.
        status: New state ('processing', 'completed', 'failed', ...).
        output_file: Generated audio filename (used only when completed).
        error: Failure reason (used only when failed).
    """
    now = datetime.now().isoformat()
    conn = sqlite3.connect('tts_tasks.db')
    try:
        c = conn.cursor()
        if status == 'completed':
            c.execute('''UPDATE tasks
                         SET status = ?, output_file = ?, processed_at = ?
                         WHERE id = ?''',
                      (status, output_file, now, task_id))
        elif status == 'failed':
            c.execute('''UPDATE tasks
                         SET status = ?, error = ?, processed_at = ?
                         WHERE id = ?''',
                      (status, str(error), now, task_id))
        else:
            # Intermediate states (e.g. 'processing') leave processed_at unset.
            c.execute('UPDATE tasks SET status = ? WHERE id = ?',
                      (status, task_id))
        conn.commit()
    finally:
        # Close even when the UPDATE raises (original leaked the handle).
        conn.close()
159
+
160
@app.route('/')
def index():
    """Serve the single-page UI (index.html) from the app directory."""
    return send_from_directory('.', 'index.html')
163
+
164
@app.route('/api/generate', methods=['POST'])
def generate_audio():
    """Queue a new TTS task.

    Expects JSON ``{"text": str, "voice": str?, "speed": float?}``.
    Returns 201 with the new task id, or 400 for missing/empty text.
    """
    data = request.json
    if not data or 'text' not in data:
        return jsonify({'error': 'No text provided'}), 400

    text = data['text']
    voice = data.get('voice', '8')  # worker also falls back to '8'
    speed = data.get('speed', 1.0)

    if not text.strip():
        return jsonify({'error': 'Text cannot be empty'}), 400

    task_id = str(uuid.uuid4())

    conn = sqlite3.connect('tts_tasks.db')
    try:
        conn.execute('''INSERT INTO tasks
                        (id, text, voice, speed, status, created_at)
                        VALUES (?, ?, ?, ?, ?, ?)''',
                     (task_id, text, voice, speed, 'not_started',
                      datetime.now().isoformat()))
        conn.commit()
    finally:
        # Close even if the insert fails so handles don't leak
        # (original left the connection open on error).
        conn.close()

    # Start worker on first request
    start_worker()

    return jsonify({
        'id': task_id,
        'status': 'not_started',
        'message': 'Task queued successfully'
    }), 201
196
+
197
@app.route('/api/files', methods=['GET'])
def get_files():
    """Return every task as a JSON array, newest first."""
    conn = sqlite3.connect('tts_tasks.db')
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()
    cur.execute('SELECT * FROM tasks ORDER BY created_at DESC')
    rows = cur.fetchall()
    conn.close()

    wanted = ('id', 'text', 'status', 'output_file',
              'created_at', 'processed_at', 'error')
    files = [{key: row[key] for key in wanted} for row in rows]

    return jsonify(files)
219
+
220
@app.route('/api/download/<task_id>', methods=['GET'])
def download_file(task_id):
    """Stream the finished WAV for a task as an attachment, or 404."""
    conn = sqlite3.connect('tts_tasks.db')
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()
    cur.execute('SELECT * FROM tasks WHERE id = ?', (task_id,))
    row = cur.fetchone()
    conn.close()

    # Unknown task, or a task that never produced audio.
    if row is None or not row['output_file']:
        return jsonify({'error': 'File not found'}), 404

    file_path = os.path.join(UPLOAD_FOLDER, row['output_file'])
    if not os.path.exists(file_path):
        return jsonify({'error': 'File missing on server'}), 404

    return send_file(file_path, as_attachment=True,
                     download_name=f"tts_{task_id}.wav")
237
+
238
@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: report service identity and worker state."""
    payload = {
        'status': 'healthy',
        'service': 'tts-generator',
        'worker_running': worker_running,
    }
    return jsonify(payload)
245
+
246
if __name__ == '__main__':
    init_db()

    banner = "=" * 60
    print("\n" + banner)
    print("πŸš€ TTS Generator API Server")
    print(banner)
    print("πŸ“Œ Worker will start automatically on first request")
    print(banner + "\n")

    # Use PORT environment variable for Hugging Face compatibility
    port = int(os.environ.get('PORT', 7860))
    app.run(debug=False, host='0.0.0.0', port=port)
index.html ADDED
@@ -0,0 +1,687 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>TTS Generator</title>
8
+ <style>
9
+ * {
10
+ margin: 0;
11
+ padding: 0;
12
+ box-sizing: border-box;
13
+ }
14
+
15
+ :root {
16
+ --bg: #0a0e27;
17
+ --surface: #141b3d;
18
+ --primary: #00ff88;
19
+ --secondary: #ff00ff;
20
+ --accent: #00d4ff;
21
+ --error: #ff1744;
22
+ --text: #ffffff;
23
+ --border: 4px;
24
+ }
25
+
26
+ body {
27
+ font-family: 'Space Grotesk', 'Courier New', monospace;
28
+ background: var(--bg);
29
+ color: var(--text);
30
+ min-height: 100vh;
31
+ overflow-x: hidden;
32
+ position: relative;
33
+ }
34
+
35
+ body::before {
36
+ content: '';
37
+ position: fixed;
38
+ top: 0;
39
+ left: 0;
40
+ width: 100%;
41
+ height: 100%;
42
+ background:
43
+ radial-gradient(circle at 20% 50%, rgba(0, 255, 136, 0.1) 0%, transparent 50%),
44
+ radial-gradient(circle at 80% 80%, rgba(255, 0, 255, 0.1) 0%, transparent 50%),
45
+ radial-gradient(circle at 40% 20%, rgba(0, 212, 255, 0.1) 0%, transparent 50%);
46
+ pointer-events: none;
47
+ z-index: 0;
48
+ }
49
+
50
+ .container {
51
+ max-width: 1400px;
52
+ margin: 0 auto;
53
+ padding: 2rem;
54
+ position: relative;
55
+ z-index: 1;
56
+ }
57
+
58
+ header {
59
+ text-align: center;
60
+ margin-bottom: 3rem;
61
+ animation: slideDown 0.6s cubic-bezier(0.68, -0.55, 0.265, 1.55);
62
+ }
63
+
64
+ @keyframes slideDown {
65
+ from {
66
+ opacity: 0;
67
+ transform: translateY(-50px);
68
+ }
69
+
70
+ to {
71
+ opacity: 1;
72
+ transform: translateY(0);
73
+ }
74
+ }
75
+
76
+ h1 {
77
+ font-size: clamp(2rem, 5vw, 4rem);
78
+ font-weight: 900;
79
+ background: linear-gradient(135deg, var(--primary) 0%, var(--accent) 50%, var(--secondary) 100%);
80
+ -webkit-background-clip: text;
81
+ -webkit-text-fill-color: transparent;
82
+ background-clip: text;
83
+ text-transform: uppercase;
84
+ letter-spacing: -2px;
85
+ margin-bottom: 1rem;
86
+ position: relative;
87
+ display: inline-block;
88
+ }
89
+
90
+ h1::after {
91
+ content: '';
92
+ position: absolute;
93
+ bottom: -10px;
94
+ left: 50%;
95
+ transform: translateX(-50%);
96
+ width: 60%;
97
+ height: 6px;
98
+ background: linear-gradient(90deg, transparent, var(--primary), transparent);
99
+ animation: glow 2s ease-in-out infinite;
100
+ }
101
+
102
+ @keyframes glow {
103
+
104
+ 0%,
105
+ 100% {
106
+ opacity: 0.5;
107
+ }
108
+
109
+ 50% {
110
+ opacity: 1;
111
+ }
112
+ }
113
+
114
+ .subtitle {
115
+ font-size: 1.2rem;
116
+ color: var(--accent);
117
+ letter-spacing: 2px;
118
+ }
119
+
120
+ .input-section {
121
+ background: var(--surface);
122
+ border: var(--border) solid var(--primary);
123
+ box-shadow: 8px 8px 0 var(--primary);
124
+ padding: 2rem;
125
+ margin-bottom: 3rem;
126
+ position: relative;
127
+ transition: all 0.3s ease;
128
+ animation: slideUp 0.6s cubic-bezier(0.68, -0.55, 0.265, 1.55) 0.2s both;
129
+ }
130
+
131
+ @keyframes slideUp {
132
+ from {
133
+ opacity: 0;
134
+ transform: translateY(50px);
135
+ }
136
+
137
+ to {
138
+ opacity: 1;
139
+ transform: translateY(0);
140
+ }
141
+ }
142
+
143
+ .input-section:hover {
144
+ transform: translate(-2px, -2px);
145
+ box-shadow: 12px 12px 0 var(--primary);
146
+ }
147
+
148
+ textarea {
149
+ width: 100%;
150
+ height: 150px;
151
+ background: rgba(0, 212, 255, 0.05);
152
+ border: 3px solid var(--accent);
153
+ color: var(--text);
154
+ padding: 1rem;
155
+ font-family: inherit;
156
+ font-size: 1.1rem;
157
+ resize: vertical;
158
+ margin-bottom: 1.5rem;
159
+ transition: all 0.3s ease;
160
+ }
161
+
162
+ textarea:focus {
163
+ outline: none;
164
+ border-color: var(--primary);
165
+ background: rgba(0, 255, 136, 0.05);
166
+ }
167
+
168
+ .controls {
169
+ display: flex;
170
+ gap: 1rem;
171
+ margin-bottom: 1.5rem;
172
+ flex-wrap: wrap;
173
+ }
174
+
175
+ .control-group {
176
+ flex: 1;
177
+ min-width: 200px;
178
+ }
179
+
180
+ label {
181
+ display: block;
182
+ margin-bottom: 0.5rem;
183
+ color: var(--accent);
184
+ font-weight: bold;
185
+ }
186
+
187
+ select, input[type="number"] {
188
+ width: 100%;
189
+ padding: 0.8rem;
190
+ background: var(--bg);
191
+ border: 2px solid var(--accent);
192
+ color: var(--text);
193
+ font-family: inherit;
194
+ font-size: 1rem;
195
+ }
196
+
197
+ .btn {
198
+ background: var(--primary);
199
+ color: var(--bg);
200
+ border: var(--border) solid var(--bg);
201
+ padding: 1rem 2rem;
202
+ font-size: 1.1rem;
203
+ font-weight: 900;
204
+ text-transform: uppercase;
205
+ cursor: pointer;
206
+ transition: all 0.2s ease;
207
+ box-shadow: 4px 4px 0 var(--bg);
208
+ letter-spacing: 1px;
209
+ position: relative;
210
+ width: 100%;
211
+ }
212
+
213
+ .btn:hover:not(:disabled) {
214
+ transform: translate(-2px, -2px);
215
+ box-shadow: 6px 6px 0 var(--bg);
216
+ }
217
+
218
+ .btn:active:not(:disabled) {
219
+ transform: translate(2px, 2px);
220
+ box-shadow: 2px 2px 0 var(--bg);
221
+ }
222
+
223
+ .btn:disabled {
224
+ opacity: 0.6;
225
+ cursor: not-allowed;
226
+ }
227
+
228
+ .btn-secondary {
229
+ background: var(--accent);
230
+ }
231
+
232
+ .btn-small {
233
+ padding: 0.5rem 1rem;
234
+ font-size: 0.85rem;
235
+ box-shadow: 3px 3px 0 var(--bg);
236
+ text-decoration: none;
237
+ display: inline-block;
238
+ color: var(--bg);
239
+ }
240
+
241
+ .btn-small:hover:not(:disabled) {
242
+ box-shadow: 4px 4px 0 var(--bg);
243
+ transform: translate(-2px, -2px);
244
+ }
245
+
246
+ .table-section {
247
+ animation: slideUp 0.6s cubic-bezier(0.68, -0.55, 0.265, 1.55) 0.4s both;
248
+ }
249
+
250
+ .table-wrapper {
251
+ overflow-x: auto;
252
+ background: var(--surface);
253
+ border: var(--border) solid var(--secondary);
254
+ box-shadow: 8px 8px 0 var(--secondary);
255
+ }
256
+
257
+ table {
258
+ width: 100%;
259
+ border-collapse: collapse;
260
+ }
261
+
262
+ thead {
263
+ background: linear-gradient(135deg, var(--primary), var(--accent));
264
+ }
265
+
266
+ th {
267
+ padding: 1.5rem 1rem;
268
+ text-align: left;
269
+ font-weight: 900;
270
+ text-transform: uppercase;
271
+ letter-spacing: 1px;
272
+ color: var(--bg);
273
+ border-right: 3px solid var(--bg);
274
+ }
275
+
276
+ th:last-child {
277
+ border-right: none;
278
+ }
279
+
280
+ tbody tr {
281
+ border-bottom: 2px solid rgba(0, 212, 255, 0.2);
282
+ transition: all 0.3s ease;
283
+ animation: fadeIn 0.5s ease;
284
+ }
285
+
286
+ @keyframes fadeIn {
287
+ from {
288
+ opacity: 0;
289
+ }
290
+
291
+ to {
292
+ opacity: 1;
293
+ }
294
+ }
295
+
296
+ tbody tr:hover {
297
+ background: rgba(0, 255, 136, 0.1);
298
+ }
299
+
300
+ td {
301
+ padding: 1.5rem 1rem;
302
+ color: var(--text);
303
+ }
304
+
305
+ .status {
306
+ display: inline-block;
307
+ padding: 0.5rem 1rem;
308
+ border: 3px solid;
309
+ font-weight: 900;
310
+ text-transform: uppercase;
311
+ font-size: 0.85rem;
312
+ letter-spacing: 1px;
313
+ }
314
+
315
+ .status-not_started {
316
+ background: var(--bg);
317
+ border-color: var(--accent);
318
+ color: var(--accent);
319
+ }
320
+
321
+ .status-processing {
322
+ background: var(--bg);
323
+ border-color: var(--primary);
324
+ color: var(--primary);
325
+ animation: pulse 1.5s ease-in-out infinite;
326
+ }
327
+
328
+ @keyframes pulse {
329
+
330
+ 0%,
331
+ 100% {
332
+ opacity: 1;
333
+ }
334
+
335
+ 50% {
336
+ opacity: 0.6;
337
+ }
338
+ }
339
+
340
+ .status-completed {
341
+ background: var(--primary);
342
+ border-color: var(--primary);
343
+ color: var(--bg);
344
+ }
345
+
346
+ .status-failed {
347
+ background: var(--error);
348
+ border-color: var(--error);
349
+ color: var(--text);
350
+ }
351
+
352
+ .text-cell {
353
+ max-width: 300px;
354
+ overflow: hidden;
355
+ text-overflow: ellipsis;
356
+ white-space: nowrap;
357
+ }
358
+
359
+ .empty-state {
360
+ text-align: center;
361
+ padding: 4rem 2rem;
362
+ color: var(--accent);
363
+ font-size: 1.2rem;
364
+ }
365
+
366
+ .refresh-btn {
367
+ position: fixed;
368
+ bottom: 2rem;
369
+ right: 2rem;
370
+ width: 60px;
371
+ height: 60px;
372
+ border-radius: 50%;
373
+ background: var(--secondary);
374
+ border: var(--border) solid var(--bg);
375
+ box-shadow: 4px 4px 0 var(--bg);
376
+ cursor: pointer;
377
+ transition: all 0.3s ease;
378
+ display: flex;
379
+ align-items: center;
380
+ justify-content: center;
381
+ font-size: 1.5rem;
382
+ z-index: 1000;
383
+ }
384
+
385
+ .refresh-btn:hover {
386
+ transform: rotate(180deg) scale(1.1);
387
+ box-shadow: 6px 6px 0 var(--bg);
388
+ }
389
+
390
+ /* Loader styles */
391
+ .loader-overlay {
392
+ position: fixed;
393
+ top: 0;
394
+ left: 0;
395
+ width: 100%;
396
+ height: 100%;
397
+ background: rgba(10, 14, 39, 0.95);
398
+ display: flex;
399
+ align-items: center;
400
+ justify-content: center;
401
+ z-index: 9999;
402
+ animation: fadeIn 0.3s ease;
403
+ }
404
+
405
+ .loader {
406
+ width: 80px;
407
+ height: 80px;
408
+ border: 6px solid var(--surface);
409
+ border-top: 6px solid var(--primary);
410
+ border-right: 6px solid var(--accent);
411
+ border-bottom: 6px solid var(--secondary);
412
+ border-radius: 50%;
413
+ animation: spin 1s linear infinite;
414
+ }
415
+
416
+ @keyframes spin {
417
+ 0% {
418
+ transform: rotate(0deg);
419
+ }
420
+
421
+ 100% {
422
+ transform: rotate(360deg);
423
+ }
424
+ }
425
+
426
+ .loader-text {
427
+ position: absolute;
428
+ margin-top: 120px;
429
+ font-size: 1.2rem;
430
+ font-weight: 900;
431
+ color: var(--primary);
432
+ text-transform: uppercase;
433
+ letter-spacing: 2px;
434
+ }
435
+
436
+ @media (max-width: 768px) {
437
+ .container {
438
+ padding: 1rem;
439
+ }
440
+
441
+ .input-section,
442
+ .table-wrapper {
443
+ box-shadow: 4px 4px 0 var(--primary);
444
+ }
445
+
446
+ th,
447
+ td {
448
+ padding: 1rem 0.5rem;
449
+ font-size: 0.9rem;
450
+ }
451
+
452
+ .text-cell {
453
+ max-width: 150px;
454
+ }
455
+ }
456
+
457
+ .notification {
458
+ position: fixed;
459
+ top: 2rem;
460
+ right: 2rem;
461
+ padding: 1.5rem 2rem;
462
+ background: var(--primary);
463
+ color: var(--bg);
464
+ border: var(--border) solid var(--bg);
465
+ box-shadow: 6px 6px 0 var(--bg);
466
+ font-weight: 900;
467
+ z-index: 2000;
468
+ animation: slideInRight 0.5s ease, slideOutRight 0.5s ease 3.5s;
469
+ }
470
+
471
+ @keyframes slideInRight {
472
+ from {
473
+ transform: translateX(400px);
474
+ opacity: 0;
475
+ }
476
+
477
+ to {
478
+ transform: translateX(0);
479
+ opacity: 1;
480
+ }
481
+ }
482
+
483
+ @keyframes slideOutRight {
484
+ to {
485
+ transform: translateX(400px);
486
+ opacity: 0;
487
+ }
488
+ }
489
+ </style>
490
+ </head>
491
+
492
+ <body>
493
+ <div class="container">
494
+ <header>
495
+ <h1>TTS Generator</h1>
496
+ <p class="subtitle">Text β€’ Process β€’ Audio</p>
497
+ </header>
498
+
499
+ <div class="input-section">
500
+ <h2 style="margin-bottom: 1.5rem; color: var(--primary);">Generate Audio</h2>
501
+
502
+ <textarea id="textInput" placeholder="Enter text to convert to speech..."></textarea>
503
+
504
+ <div class="controls">
505
+ <div class="control-group">
506
+ <label>Voice</label>
507
+ <select id="voiceSelect">
508
+ <option value="8">Default Voice</option>
509
+ <option value="0">Voice 0</option>
510
+ <option value="1">Voice 1</option>
511
+ <option value="2">Voice 2</option>
512
+ <option value="3">Voice 3</option>
513
+ <option value="4">Voice 4</option>
514
+ <option value="5">Voice 5</option>
515
+ <option value="6">Voice 6</option>
516
+ <option value="7">Voice 7</option>
517
+ <option value="9">Voice 9</option>
518
+ </select>
519
+ </div>
520
+ <div class="control-group">
521
+ <label>Speed</label>
522
+ <input type="number" id="speedInput" value="1.0" step="0.1" min="0.5" max="2.0">
523
+ </div>
524
+ </div>
525
+
526
+ <button class="btn" id="generateBtn">
527
+ πŸš€ Generate Audio
528
+ </button>
529
+ </div>
530
+
531
+ <div class="table-section">
532
+ <h2 style="margin-bottom: 1.5rem; color: var(--secondary);">Processing Queue</h2>
533
+ <div class="table-wrapper">
534
+ <table>
535
+ <thead>
536
+ <tr>
537
+ <th>Text</th>
538
+ <th>Status</th>
539
+ <th>Audio</th>
540
+ <th>Created</th>
541
+ <th>Processed</th>
542
+ </tr>
543
+ </thead>
544
+ <tbody id="filesTable">
545
+ <tr>
546
+ <td colspan="5" class="empty-state">No tasks yet. Start by generating audio!
547
+ </td>
548
+ </tr>
549
+ </tbody>
550
+ </table>
551
+ </div>
552
+ </div>
553
+ </div>
554
+
555
+ <button class="refresh-btn" id="refreshBtn" title="Refresh">πŸ”„</button>
556
+
557
+ <!-- Loader -->
558
+ <div class="loader-overlay" id="loader" style="display: none;">
559
+ <div>
560
+ <div class="loader"></div>
561
+ <div class="loader-text">Queuing...</div>
562
+ </div>
563
+ </div>
564
+
565
+ <script>
566
+ const API_URL = '/api';
567
+
568
+ // Elements
569
+ const textInput = document.getElementById('textInput');
570
+ const voiceSelect = document.getElementById('voiceSelect');
571
+ const speedInput = document.getElementById('speedInput');
572
+ const generateBtn = document.getElementById('generateBtn');
573
+ const loader = document.getElementById('loader');
574
+ const refreshBtn = document.getElementById('refreshBtn');
575
+
576
+ // Generate button
577
+ generateBtn.addEventListener('click', async () => {
578
+ const text = textInput.value.trim();
579
+ if (!text) {
580
+ showNotification('Please enter some text!', 'error');
581
+ return;
582
+ }
583
+
584
+ const voice = voiceSelect.value;
585
+ const speed = parseFloat(speedInput.value);
586
+
587
+ // Show loader
588
+ loader.style.display = 'flex';
589
+ generateBtn.disabled = true;
590
+
591
+ try {
592
+ const response = await fetch(`${API_URL}/generate`, {
593
+ method: 'POST',
594
+ headers: {
595
+ 'Content-Type': 'application/json'
596
+ },
597
+ body: JSON.stringify({
598
+ text,
599
+ voice,
600
+ speed
601
+ })
602
+ });
603
+
604
+ const data = await response.json();
605
+
606
+ if (response.ok) {
607
+ showNotification('Task queued successfully! πŸŽ‰');
608
+ textInput.value = '';
609
+ loadFiles();
610
+ } else {
611
+ showNotification(data.error || 'Generation failed', 'error');
612
+ }
613
+ } catch (error) {
614
+ showNotification('Network error: ' + error.message, 'error');
615
+ } finally {
616
+ // Hide loader
617
+ loader.style.display = 'none';
618
+ generateBtn.disabled = false;
619
+ }
620
+ });
621
+
622
+ // Load files
623
+ async function loadFiles() {
624
+ try {
625
+ const response = await fetch(`${API_URL}/files`);
626
+ const files = await response.json();
627
+
628
+ const tbody = document.getElementById('filesTable');
629
+
630
+ if (files.length === 0) {
631
+ tbody.innerHTML = '<tr><td colspan="5" class="empty-state">No tasks yet. Start by generating audio!</td></tr>';
632
+ return;
633
+ }
634
+
635
+ tbody.innerHTML = files.map(file => {
636
+ return `
637
+ <tr>
638
+ <td class="text-cell" title="${file.text}">${file.text}</td>
639
+ <td><span class="status status-${file.status}">${file.status.replace('_', ' ')}</span></td>
640
+ <td>
641
+ ${file.status === 'completed' && file.output_file ?
642
+ `<a href="${API_URL}/download/${file.id}" class="btn btn-small btn-secondary" target="_blank">⬇️ Download</a>`
643
+ : 'β€”'}
644
+ </td>
645
+ <td>${new Date(file.created_at).toLocaleString()}</td>
646
+ <td>${file.processed_at ? new Date(file.processed_at).toLocaleString() : 'β€”'}</td>
647
+ </tr>
648
+ `;
649
+ }).join('');
650
+ } catch (error) {
651
+ console.error('Error loading files:', error);
652
+ }
653
+ }
654
+
655
+ // Refresh button
656
+ refreshBtn.addEventListener('click', () => {
657
+ loadFiles();
658
+ const icon = refreshBtn.textContent;
659
+ refreshBtn.textContent = '⏳';
660
+ setTimeout(() => refreshBtn.textContent = icon, 500);
661
+ });
662
+
663
+ // Auto refresh every 5 seconds
664
+ setInterval(loadFiles, 5000);
665
+
666
+ // Initial load
667
+ loadFiles();
668
+
669
+ // Notification system
670
// Notification system: transient toast in the top-right corner.
function showNotification(message, type = 'success') {
    const toast = document.createElement('div');
    toast.className = 'notification';
    if (type === 'error') {
        toast.style.background = 'var(--error)';
        toast.style.borderColor = 'var(--error)';
    }
    toast.textContent = message;
    document.body.appendChild(toast);

    // 4s lifetime matches the CSS slide-out that starts at 3.5s.
    setTimeout(() => toast.remove(), 4000);
}
684
+ </script>
685
+ </body>
686
+
687
+ </html>
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Flask==3.0.0
2
+ flask-cors==4.0.0
3
+ werkzeug==3.0.1
4
+
5
+ git+https://github.com/jebin2/TTS.git#egg=tts-runner[kokoro]
setup.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# setup.py
import os
from setuptools import setup, find_packages

# Read README.md for the long description shown on PyPI.
this_directory = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

# Base dependencies
BASE_DEPS = [
    'numpy',
    'torch',
    'pydub',
    'sounddevice',
    'python-dotenv',
    # 'textual',  # From requirement_tui.txt
    # 'pyperclip',  # From requirement_tui.txt
    'scipy'  # Implicit dependency for wavfile reading in base
]

# Optional extras (engines)
extras_require = {
    "chatterbox": [
        "chatterbox-tts",
        "spacy",
        "peft"
    ],
    "kitten": [
        "kittentts",
        "spacy"
    ],
    "kokoro": [
        "kokoro>=0.9.4",
        "soundfile"
    ],
}

# "all" extra: union of every engine's deps.
# sorted() makes the list deterministic (set iteration order is not),
# so generated package metadata does not churn between builds.
all_deps = []
for deps in extras_require.values():
    all_deps.extend(deps)
extras_require["all"] = sorted(set(all_deps))

setup(
    name="tts-runner",
    version="1.0.0",
    author="Jebin Einstein",
    author_email="jebin@gmail.com",
    description="A flexible, multi-engine Text-to-Speech runner with TUI",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/jebin2/TTS",

    packages=find_packages(),
    include_package_data=True,

    install_requires=BASE_DEPS,
    extras_require=extras_require,

    entry_points={
        "console_scripts": [
            "tts-runner=tts_runner.runner:main",
            "tts-tui=tts_runner.tui:main",
        ],
    },

    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Multimedia :: Sound/Audio :: Speech",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],

    python_requires=">=3.10",
)
tts_runner/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # tts_runner/__init__.py
2
+ __version__ = "1.0.0"
tts_runner/base.py ADDED
@@ -0,0 +1,660 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Standard library
from pathlib import Path
import os
import queue
import shutil
import sys
import threading
import time
import traceback
from functools import reduce

# Third-party
from pydub import AudioSegment
from dotenv import load_dotenv

# Local
from . import common

# Load environment overrides from a local .env file when one exists.
# (The original imported `os` twice; the duplicate import is removed.)
if os.path.exists(".env"):
    print("Loaded load_dotenv")
    load_dotenv(".env")
20
+ class BaseTTS:
21
+ def __init__(self, type, stream_audio=False, setup_signals=True):
22
+ """Initialize BaseTTS with environment settings and configuration."""
23
+ if os.getenv("USE_CPU_IF_POSSIBLE", None):
24
+ self.device = "cpu"
25
+ else:
26
+ self.device = "cuda" if common.is_gpu_available() else "cpu"
27
+ print(f'Using device:: {self.device}')
28
+ # Environment setup
29
+ os.environ["TORCH_USE_CUDA_DSA"] = "1"
30
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
31
+ os.environ["HF_HUB_TIMEOUT"] = "120"
32
+
33
+ base_dir = os.path.dirname(os.path.abspath(__file__))
34
+
35
+ # File paths and directories
36
+ self.content_file = Path("content.txt")
37
+ self.final_output_audio = "output_audio.wav"
38
+ self.final_output_timestamps = "output_timestamps.json"
39
+ self.temp_output_dir = Path("temp_audio_chunks")
40
+
41
+ if not self.temp_output_dir.exists():
42
+ self.temp_output_dir.mkdir(parents=True, exist_ok=True)
43
+
44
+ # Voice and speed configuration
45
+ self.default_voice_index = 8
46
+ self.default_speed = 0.8
47
+
48
+ self.voices = [
49
+ None,
50
+ os.path.join(base_dir, 'voices/Main-4.wav'),
51
+ os.path.join(base_dir, 'voices/Ellen-TTS-10.wav'),
52
+ os.path.join(base_dir, 'voices/kratos(ambient)_en.wav'),
53
+ os.path.join(base_dir, 'voices/20250329-audio-american-male.wav'),
54
+ os.path.join(base_dir, 'voices/Ellen13y TTS-14.wav'),
55
+ os.path.join(base_dir, 'voices/Simple guy.wav'),
56
+ None,
57
+ os.path.join(base_dir, 'voices/bbc_news.wav'),
58
+ os.path.join(base_dir, 'voices/en_woman.wav'),
59
+ os.path.join(base_dir, 'voices/voice_preview_david castlemore - newsreader and educator.mp3'),
60
+ os.path.join(base_dir, 'voices/voice_preview_kelly - storytelling & motivational content.mp3'),
61
+ os.path.join(base_dir, 'voices/voice_preview_motivational coach - leader.mp3'),
62
+ os.path.join(base_dir, 'voices/voice_preview_sevan bomar - black motivational speaker.mp3',)
63
+ ]
64
+
65
+ # General settings
66
+ self.type = type
67
+ self.save_audio_file = True
68
+ self.stream_audio = stream_audio
69
+
70
+ # Audio streaming configuration
71
+ self.audio_queue = queue.Queue()
72
+ self.is_streaming = False
73
+ self.stream_thread = None
74
+ self.sample_rate = 24000 # Default, can be overridden by subclasses
75
+ self.last_playing_audio_duration_seconds = 0.1
76
+
77
+ # Text streaming configuration
78
+ self.text_queue = queue.Queue()
79
+ self.text_processing_thread = None
80
+ self.is_text_streaming = False
81
+ self.text_chunk_size = 10 # Number of words per chunk
82
+ self.current_voice = None
83
+ self.current_speed = None
84
+
85
+ # Text buffering for streaming input
86
+ self.temp_feed_words = []
87
+
88
+ # Emergency stop control
89
+ self.emergency_stop = False
90
+ if setup_signals:
91
+ self.setup_signal_handler()
92
+
93
+ # ===== UTILITY METHODS =====
94
+
95
+ def cleanup_temp_files(self):
96
+ """Clean up temporary audio files."""
97
+ if self.temp_output_dir.exists():
98
+ shutil.rmtree(self.temp_output_dir)
99
+ if os.path.exists(self.final_output_audio):
100
+ os.remove(self.final_output_audio)
101
+ if os.path.exists(self.final_output_timestamps):
102
+ os.remove(self.final_output_timestamps)
103
+ print("Temporary files cleaned up")
104
+
105
+ def setup_output_directory(self):
106
+ """Create clean output directory for audio chunks."""
107
+ if self.temp_output_dir.exists():
108
+ shutil.rmtree(self.temp_output_dir)
109
+ self.temp_output_dir.mkdir(exist_ok=True)
110
+
111
+ def read_content_file(self):
112
+ """Read content from the content file."""
113
+ with open(self.content_file, 'r', encoding='utf-8') as file:
114
+ return file.read().strip()
115
+
116
+ def validate_voice_index(self, args) -> str:
117
+ """Validate and return voice file path."""
118
+ voice_index = self.default_voice_index
119
+ try:
120
+ voice_index = int(getattr(args, 'voice'))
121
+ if not 0 <= voice_index < len(self.voices):
122
+ print(f"Invalid voice index {voice_index}, using default voice")
123
+ voice_index = self.default_voice_index
124
+ except:
125
+ voice_index = self.default_voice_index
126
+
127
+ print(f"Voice Value: {self.voices[voice_index]}")
128
+ return self.voices[voice_index]
129
+
130
+ def validate_speed(self, args) -> float:
131
+ """Validate and return speed value."""
132
+ speed_value = self.default_speed
133
+ try:
134
+ speed_value = float(getattr(args, 'speed'))
135
+ if speed_value <= 0:
136
+ print(f"Invalid speed {speed_value}, using default speed")
137
+ speed_value = self.default_speed
138
+ except:
139
+ speed_value = self.default_speed
140
+
141
+ print(f"Speed Value: {speed_value}")
142
+ return speed_value
143
+
144
+ def combine_audio_files(self, audio_files):
145
+ """Combine multiple audio files into one.
146
+
147
+ Args:
148
+ audio_files: List of audio file paths to combine
149
+
150
+ Returns:
151
+ True if successful, False otherwise
152
+ """
153
+ if not audio_files:
154
+ raise ValueError("No audio files to combine")
155
+
156
+ print(f"Combining {len(audio_files)} audio files...")
157
+ combined = reduce(
158
+ lambda acc, file_name: acc + AudioSegment.from_wav(file_name),
159
+ audio_files,
160
+ AudioSegment.empty()
161
+ )
162
+
163
+ # Export combined audio
164
+ combined.export(self.final_output_audio, format="wav")
165
+ print(f"Combined audio saved as {self.final_output_audio}")
166
+ return True
167
+
168
+ def split_sentences(self, text, max_chars=300):
169
+ """Common split method for all frameworks. Override in subclasses if needed.
170
+
171
+ Args:
172
+ text (str): Text to split
173
+ max_chars (int): Maximum characters per chunk
174
+
175
+ Returns:
176
+ list: List of text chunks
177
+ """
178
+ words = text.split()
179
+ chunks = []
180
+ current = ""
181
+
182
+ for word in words:
183
+ test_chunk = current + " " + word if current else word
184
+ if len(test_chunk) <= max_chars:
185
+ current = test_chunk
186
+ else:
187
+ if current:
188
+ chunks.append(current)
189
+ current = word
190
+
191
+ if current:
192
+ chunks.append(current)
193
+
194
+ return chunks
195
+
196
+ def generate_chunk_audio_file(self, audio, chunk_index) -> Path:
197
+ import soundfile as sf
198
+ chunk_file = self.temp_output_dir / f"chunk_{chunk_index:04d}.wav"
199
+ sf.write(chunk_file, audio, self.sample_rate)
200
+ return chunk_file
201
+
202
+ # ===== EMERGENCY STOP METHODS =====
203
+
204
+ def setup_signal_handler(self):
205
+ """Setup signal handler for Ctrl+C to stop everything immediately."""
206
+ import signal
207
+ signal.signal(signal.SIGINT, self.emergency_stop_handler)
208
+ signal.signal(signal.SIGTERM, self.emergency_stop_handler)
209
+
210
+ def emergency_stop_handler(self, signum, frame):
211
+ """Handle Ctrl+C - stop everything immediately."""
212
+ print("\nπŸ›‘ Emergency stop triggered! Stopping all operations...")
213
+ self.emergency_stop = True
214
+
215
+ # Stop audio playback immediately
216
+ try:
217
+ import sounddevice as sd
218
+ sd.stop()
219
+ except:
220
+ pass
221
+
222
+ # Stop streaming
223
+ self.force_stop_streaming()
224
+ self.force_stop_text_streaming()
225
+
226
+ print("βœ… Emergency stop completed. Exiting...")
227
+ sys.exit(0)
228
+
229
+ def force_stop_streaming(self):
230
+ """Force stop audio streaming immediately without waiting."""
231
+ if self.is_streaming:
232
+ self.is_streaming = False
233
+
234
+ # Clear the queue
235
+ try:
236
+ while not self.audio_queue.empty():
237
+ self.audio_queue.get_nowait()
238
+ except:
239
+ pass
240
+
241
+ # Send poison pill
242
+ try:
243
+ self.audio_queue.put(None)
244
+ except:
245
+ pass
246
+
247
+ print("πŸ”‡ Audio streaming force stopped")
248
+
249
+ def force_stop_text_streaming(self):
250
+ """Force stop text streaming immediately without waiting."""
251
+ if self.is_text_streaming:
252
+ self.is_text_streaming = False
253
+
254
+ # Clear the text queue
255
+ try:
256
+ while not self.text_queue.empty():
257
+ self.text_queue.get_nowait()
258
+ except:
259
+ pass
260
+
261
+ # Send poison pill
262
+ try:
263
+ self.text_queue.put(None)
264
+ except:
265
+ pass
266
+
267
+ print("πŸ“ Text streaming force stopped")
268
+
269
+ def check_emergency_stop(self):
270
+ """Check if emergency stop was triggered. Call this in loops."""
271
+ if self.emergency_stop:
272
+ raise KeyboardInterrupt("Emergency stop triggered")
273
+
274
+ # ===== AUDIO STREAMING METHODS =====
275
+
276
+ def _audio_stream_worker(self):
277
+ """Worker thread that plays audio chunks as they arrive."""
278
+ while self.is_streaming and not self.emergency_stop:
279
+ try:
280
+ audio_data = self.audio_queue.get(timeout=0.1)
281
+ if audio_data is None or self.emergency_stop: # Poison pill or emergency stop
282
+ break
283
+
284
+ self.last_playing_audio_duration_seconds = len(audio_data) / self.sample_rate
285
+ # Play audio chunk
286
+ import sounddevice as sd
287
+ sd.play(audio_data, samplerate=self.sample_rate)
288
+
289
+ # Check for emergency stop while playing
290
+ while sd.get_stream().active and not self.emergency_stop:
291
+ time.sleep(0.01)
292
+
293
+ if self.emergency_stop:
294
+ sd.stop()
295
+ break
296
+
297
+ except queue.Empty:
298
+ continue
299
+ except Exception as e:
300
+ if not self.emergency_stop:
301
+ print(f"Audio playback error: {e}")
302
+ break
303
+
304
+ def start_audio_streaming(self):
305
+ """Start the audio streaming thread."""
306
+ try:
307
+ import sounddevice as sd
308
+ if not self.is_streaming and not self.emergency_stop:
309
+ self.is_streaming = True
310
+ self.stream_thread = threading.Thread(target=self._audio_stream_worker)
311
+ self.stream_thread.daemon = True
312
+ self.stream_thread.start()
313
+ print("πŸ”Š Audio streaming started")
314
+ except:
315
+ self.stream_audio = False
316
+ print("πŸ”‡ No sounddevice available.")
317
+ pass
318
+
319
+ def stop_audio_streaming(self):
320
+ """Stop the audio streaming thread."""
321
+ if self.is_streaming:
322
+ self.is_streaming = False
323
+ self.audio_queue.put(None) # Poison pill
324
+ if self.stream_thread:
325
+ self.stream_thread.join(timeout=2) # Don't wait forever
326
+ print("πŸ”‡ Audio streaming stopped")
327
+
328
+ def queue_audio_for_streaming(self, audio_data, sample_rate=None):
329
+ """Queue audio data for streaming playback.
330
+
331
+ Args:
332
+ audio_data: Audio data (numpy array, tensor, or file path)
333
+ sample_rate (int, optional): Sample rate of the audio data
334
+ """
335
+ if self.is_streaming and not self.emergency_stop:
336
+ # Convert audio data to numpy array if needed
337
+ processed_audio = self._prepare_audio_for_streaming(audio_data, sample_rate)
338
+ if processed_audio is not None:
339
+ self.audio_queue.put(processed_audio)
340
+ return len(processed_audio) / self.sample_rate
341
+ return 0
342
+
343
+ def _prepare_audio_for_streaming(self, audio_data, sample_rate=None):
344
+ """Prepare audio data for streaming by converting to numpy array.
345
+
346
+ Args:
347
+ audio_data: Raw audio data (numpy array, tensor, or file path)
348
+ sample_rate (int, optional): Sample rate of the audio data
349
+
350
+ Returns:
351
+ numpy.ndarray: Audio data ready for streaming
352
+ """
353
+ import numpy as np
354
+
355
+ try:
356
+ # If it's already a numpy array, just ensure it's float32
357
+ if isinstance(audio_data, np.ndarray):
358
+ # Ensure audio is in the right format for sounddevice
359
+ audio = audio_data.astype(np.float32)
360
+ # Ensure values are in [-1, 1] range
361
+ if audio.max() > 1.0 or audio.min() < -1.0:
362
+ audio = audio / np.max(np.abs(audio))
363
+ return audio
364
+
365
+ # If it's a torch tensor, convert to numpy
366
+ elif hasattr(audio_data, 'cpu'): # torch tensor
367
+ audio = audio_data.cpu().numpy().astype(np.float32)
368
+ # Ensure values are in [-1, 1] range
369
+ if audio.max() > 1.0 or audio.min() < -1.0:
370
+ audio = audio / np.max(np.abs(audio))
371
+ return audio
372
+
373
+ # If it's a file path, load it
374
+ elif isinstance(audio_data, (str, Path)):
375
+ from scipy.io import wavfile
376
+ sr, audio = wavfile.read(str(audio_data))
377
+ # Convert to float32 and normalize
378
+ if audio.dtype == np.int16:
379
+ audio = audio.astype(np.float32) / 32768.0
380
+ elif audio.dtype == np.int32:
381
+ audio = audio.astype(np.float32) / 2147483648.0
382
+ else:
383
+ audio = audio.astype(np.float32)
384
+
385
+ # Update sample rate if provided
386
+ if sample_rate is None:
387
+ self.sample_rate = sr
388
+
389
+ return audio
390
+
391
+ else:
392
+ print(f"⚠️ Unsupported audio data type: {type(audio_data)}")
393
+ return None
394
+
395
+ except Exception as e:
396
+ print(f"❌ Error preparing audio for streaming: {e}")
397
+ return None
398
+
399
+ def wait_for_audio_streaming_complete(self):
400
+ """Wait for all queued audio to finish playing."""
401
+ time.sleep(0.5) # Small delay to ensure last chunk starts
402
+ while not self.audio_queue.empty() and not self.emergency_stop:
403
+ time.sleep(0.1)
404
+
405
+ time.sleep(self.last_playing_audio_duration_seconds)
406
+
407
+ # ===== TEXT STREAMING METHODS =====
408
+
409
+ def _text_processing_worker(self):
410
+ """Worker thread that processes text chunks from the queue."""
411
+ chunk_counter = 0
412
+
413
+ while self.is_text_streaming and not self.emergency_stop:
414
+ try:
415
+ text_chunk = self.text_queue.get(timeout=0.1)
416
+ if text_chunk is None or self.emergency_stop: # Poison pill or emergency stop
417
+ break
418
+
419
+ if not text_chunk.strip(): # Skip empty chunks
420
+ continue
421
+
422
+ print(f"πŸ“ Processing text chunk {chunk_counter + 1}: '{text_chunk[:50]}...'")
423
+
424
+ # Generate audio for this text chunk
425
+ try:
426
+ audio_files = self.generate_audio_files(
427
+ text_chunk,
428
+ self.current_voice,
429
+ self.current_speed,
430
+ chunk_id=chunk_counter
431
+ )
432
+
433
+ if audio_files:
434
+ print(f"βœ… Generated audio for chunk {chunk_counter + 1}")
435
+ else:
436
+ print(f"⚠️ No audio generated for chunk {chunk_counter + 1}")
437
+
438
+ except Exception as e:
439
+ print(f"❌ Error processing chunk {chunk_counter + 1}: {e}")
440
+
441
+ chunk_counter += 1
442
+
443
+ except queue.Empty:
444
+ continue
445
+ except Exception as e:
446
+ if not self.emergency_stop:
447
+ print(f"Text processing error: {e}")
448
+ break
449
+
450
+ print(f"πŸ“ Text processing completed. Processed {chunk_counter} chunks.")
451
+
452
+ def start_text_streaming(self, voice, speed):
453
+ """Start the text processing streaming thread."""
454
+ if not self.is_text_streaming and not self.emergency_stop:
455
+ self.current_voice = voice
456
+ self.current_speed = speed
457
+ self.is_text_streaming = True
458
+ self.text_processing_thread = threading.Thread(target=self._text_processing_worker)
459
+ self.text_processing_thread.daemon = True
460
+ self.text_processing_thread.start()
461
+ print("πŸ“ Text streaming started")
462
+
463
+ def stop_text_streaming(self):
464
+ """Stop the text processing streaming thread."""
465
+ if self.is_text_streaming:
466
+ self.is_text_streaming = False
467
+ self.text_queue.put(None) # Poison pill
468
+ if self.text_processing_thread:
469
+ self.text_processing_thread.join(timeout=5) # Wait a bit longer for text processing
470
+ print("πŸ“ Text streaming stopped")
471
+
472
+ def add_text_chunk(self, text_chunk):
473
+ """Add a text chunk to the processing queue.
474
+
475
+ Args:
476
+ text_chunk (str): Text chunk to process
477
+ """
478
+ if self.is_text_streaming and not self.emergency_stop and text_chunk.strip():
479
+ # Ensure chunk ends with punctuation for better TTS pronunciation
480
+ cleaned_chunk = text_chunk.strip()
481
+ if not any(cleaned_chunk.endswith(p) for p in ['.', '!', '?', ':', ';', ',']):
482
+ cleaned_chunk += '.' # Add period if no punctuation
483
+
484
+ self.text_queue.put(cleaned_chunk)
485
+ print(f"πŸ“ Queued text chunk: '{cleaned_chunk[:30]}...'")
486
+ else:
487
+ if not self.is_text_streaming:
488
+ print("⚠️ Text streaming not started. Call start_text_streaming() first.")
489
+
490
+ def add_text_by_words(self, text, words_per_chunk=None):
491
+ """Split text into word chunks and add to queue.
492
+
493
+ Args:
494
+ text (str): Full text to split and queue
495
+ words_per_chunk (int, optional): Number of words per chunk. Uses self.text_chunk_size if None.
496
+ """
497
+ if words_per_chunk is None:
498
+ words_per_chunk = self.text_chunk_size
499
+
500
+ words = text.split()
501
+
502
+ for i in range(0, len(words), words_per_chunk):
503
+ chunk = ' '.join(words[i:i + words_per_chunk])
504
+ self.add_text_chunk(chunk)
505
+
506
+ print(f"πŸ“ Split text into {(len(words) + words_per_chunk - 1) // words_per_chunk} chunks of {words_per_chunk} words each")
507
+
508
+ def wait_for_text_processing_complete(self):
509
+ """Wait for all queued text chunks to be processed."""
510
+ print("πŸ“ Waiting for text processing to complete...")
511
+ while not self.text_queue.empty() and not self.emergency_stop:
512
+ time.sleep(0.1)
513
+ time.sleep(1) # Extra time for last chunk to process
514
+ print("πŸ“ Text processing queue empty")
515
+
516
+ # ===== STREAMING TEXT INPUT METHODS =====
517
+
518
+ def feed_text_chunk(self, text_chunk):
519
+ """Feed a single text chunk for processing with smart buffering.
520
+
521
+ Args:
522
+ text_chunk (str): Text chunk to process
523
+ """
524
+ # Add new words to the buffer
525
+ self.temp_feed_words.extend(text_chunk.split())
526
+
527
+ # Combine all buffered words and split into sentences/chunks
528
+ all_words = " ".join(self.temp_feed_words)
529
+ sentences = self.split_sentences(all_words)
530
+ total_sentences = len(sentences)
531
+
532
+ # Process all complete sentences except the last one (which might be incomplete)
533
+ for i, sentence in enumerate(sentences):
534
+ if i + 1 != total_sentences: # Not the last sentence
535
+ print(f"πŸ“ Feeding chunk: {sentence}")
536
+ self.add_text_chunk(sentence)
537
+
538
+ # Keep the last sentence in buffer (might be incomplete)
539
+ self.temp_feed_words = sentences[-1].split() if sentences else []
540
+
541
+ def flush_remaining_words(self):
542
+ """Flush any remaining words in the buffer. Call this when done feeding text."""
543
+ if self.temp_feed_words:
544
+ chunk_text = " ".join(self.temp_feed_words)
545
+ print(f"πŸ“ Flushing final chunk: {chunk_text}")
546
+ self.add_text_chunk(chunk_text)
547
+ self.temp_feed_words = []
548
+
549
+ # ===== HIGH-LEVEL STREAMING METHODS =====
550
+
551
+ def stream_real_time_text(self, args):
552
+ """Initialize streaming for real-time text input.
553
+
554
+ Args:
555
+ args: Arguments containing voice, speed, etc.
556
+ """
557
+ speed = self.validate_speed(args)
558
+ voice = self.validate_voice_index(args)
559
+
560
+ # Setup directories
561
+ self.cleanup_temp_files()
562
+ self.setup_output_directory()
563
+
564
+ # Start both streaming systems
565
+ if self.stream_audio:
566
+ self.start_audio_streaming()
567
+
568
+ self.start_text_streaming(voice, speed)
569
+
570
+ print("πŸš€ Real-time text streaming initialized!")
571
+ print("πŸ“ Use feed_text_chunk() to add text incrementally")
572
+ print("πŸ“ Use add_text_chunk() to add individual chunks")
573
+ print("πŸ“ Use add_text_by_words() to split and add text automatically")
574
+ print("πŸ›‘ Use stop_all_streaming() when done")
575
+
576
+ def stop_all_streaming(self):
577
+ """Stop all streaming operations and cleanup."""
578
+ print("πŸ›‘ Stopping all streaming operations...")
579
+
580
+ # Flush any remaining words first
581
+ self.flush_remaining_words()
582
+
583
+ # Wait for queues to empty
584
+ self.wait_for_text_processing_complete()
585
+ self.wait_for_audio_streaming_complete()
586
+
587
+ # Stop streaming threads
588
+ self.stop_text_streaming()
589
+ self.stop_audio_streaming()
590
+
591
+ print("βœ… All streaming operations stopped")
592
+
593
+ # ===== BACKWARD COMPATIBILITY METHODS =====
594
+
595
+ def start_streaming(self):
596
+ """Start audio streaming (backward compatibility)."""
597
+ self.start_audio_streaming()
598
+
599
+ def stop_streaming(self):
600
+ """Stop audio streaming (backward compatibility)."""
601
+ self.stop_audio_streaming()
602
+
603
+ def wait_for_streaming_complete(self):
604
+ """Wait for audio streaming to complete (backward compatibility)."""
605
+ self.wait_for_audio_streaming_complete()
606
+
607
+ # ===== ABSTRACT METHODS =====
608
+
609
+ def generate_audio_files(self, text: str, voice: str, speed: float, chunk_id: int = None):
610
+ """Generate audio files. To be implemented by subclasses.
611
+
612
+ Args:
613
+ text (str): Text to convert to audio
614
+ voice (str): Voice file path
615
+ speed (float): Speed multiplier
616
+ chunk_id (int, optional): Unique identifier for this chunk (for streaming)
617
+ """
618
+ raise NotImplementedError("Subclasses must implement generate_audio_files")
619
+
620
+ # ===== MAIN METHODS =====
621
+
622
+ def save_audio(self, args) -> bool:
623
+ """Generate and save complete audio file (batch mode).
624
+
625
+ Args:
626
+ args: Arguments containing voice, speed, etc.
627
+
628
+ Returns:
629
+ True if successful, False otherwise
630
+ """
631
+ # Read content
632
+ text = self.read_content_file()
633
+ if not text:
634
+ raise ValueError("Warning: Content file is empty")
635
+
636
+ speed = self.validate_speed(args)
637
+ voice = self.validate_voice_index(args)
638
+
639
+ # Clean up temporary files
640
+ self.cleanup_temp_files()
641
+
642
+ # Setup output directory
643
+ self.setup_output_directory()
644
+
645
+ # Generate audio files (with optional streaming)
646
+ if self.stream_audio:
647
+ self.start_audio_streaming()
648
+
649
+ audio_files = self.generate_audio_files(text, voice, speed)
650
+
651
+ if not audio_files:
652
+ raise ValueError("Error: No audio files generated")
653
+
654
+ # Combine audio files
655
+ success = self.combine_audio_files(audio_files)
656
+
657
+ self.wait_for_audio_streaming_complete()
658
+ self.stop_audio_streaming()
659
+
660
+ return success
tts_runner/common.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import os
3
+ import shutil
4
+ import string
5
+ import secrets
6
+ import hashlib
7
+ import random
8
+ import time
9
+ import re
10
+
11
def get_files_count(directory_path):
    """Return the number of entries directly inside *directory_path*."""
    return sum(1 for _ in os.scandir(directory_path))
13
+
14
def generate_random_string(length=10):
    """Return a cryptographically random string of ASCII letters."""
    alphabet = string.ascii_letters
    return ''.join(secrets.choice(alphabet) for _ in range(length))
18
+
19
def generate_random_string_from_input(input_string, length=16):
    """Deterministically derive a pseudo-random alphanumeric string.

    The same *input_string* always produces the same output (the RNG is
    seeded with the SHA-256 digest of the input).

    Bug fix: the original called ``random.seed`` on the module-global RNG,
    silently clobbering random state for every other caller in the process.
    A private ``random.Random`` instance yields the identical sequence
    without that side effect.
    """
    # Hash the input string to get a consistent seed value.
    hashed_string = hashlib.sha256(input_string.encode()).hexdigest()

    # Private RNG: same seeding semantics as random.seed(), no global impact.
    rng = random.Random(hashed_string)

    characters = string.ascii_letters + string.digits
    return ''.join(rng.choice(characters) for _ in range(length))
32
+
33
def is_mostly_black(frame, black_threshold=20, percentage_threshold=0.9, sample_rate=10):
    """
    Fast black frame detection using pixel sampling.

    Args:
        frame: OpenCV BGR frame (NumPy array)
        black_threshold: grayscale value below which a pixel is considered black
        percentage_threshold: fraction of black pixels to consider frame mostly black
        sample_rate: sample every N-th pixel in both dimensions (higher = faster)
    Returns:
        True if mostly black, False otherwise
    """
    # Robustness fix: answer the trivial cases BEFORE importing cv2, so a
    # None/empty frame can be classified even when OpenCV is not installed.
    if frame is None or frame.size == 0:
        return True
    import cv2
    import numpy as np
    # Convert to grayscale
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # Sample every N-th pixel in both dimensions
    sampled = gray[::sample_rate, ::sample_rate]
    black_count = np.sum(sampled < black_threshold)
    total_count = sampled.size
    return (black_count / total_count) >= percentage_threshold
56
+
57
def only_alpha(text: str) -> str:
    """Drop every character outside a-z/A-Z and lowercase the remainder."""
    # Equivalent to re.sub(r'[^a-zA-Z]', '', text).lower(): ASCII letters only.
    return ''.join(ch for ch in text if ch.isascii() and ch.isalpha()).lower()
60
+
61
def manage_gpu(size_gb: float = 0, gpu_index: int = 0, action: str = "check"):
    """
    Manage GPU memory:
    - check -> just prints memory + process table
    - clear_cache -> clears PyTorch cache
    - kill -> kills all GPU processes

    Returns:
        True when the GPU has more than *size_gb* GB free; False on any
        failure (pynvml missing, no GPU, NVML error) — best-effort contract.
    """
    try:
        import pynvml, signal, gc
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)

        free_gb = info.free / 1024**3
        total_gb = info.total / 1024**3

        print(f"\nGPU {gpu_index}: Free {free_gb:.2f} GB / Total {total_gb:.2f} GB")

        # Show processes
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        print("\nActive GPU Processes:")
        print(f"{'PID':<8} {'Process Name':<40} {'Used (GB)':<10}")
        print("-" * 60)
        for p in processes:
            used_gb = p.usedGpuMemory / 1024**3
            proc_name = pynvml.nvmlSystemGetProcessName(p.pid).decode(errors="ignore")
            print(f"{p.pid:<8} {proc_name:<40} {used_gb:.2f}")

        if action == "clear_cache":
            try:
                import torch
                gc.collect()
                gc.collect()
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()
                torch.cuda.synchronize()
                time.sleep(1)
                print("\n🧹 Cleared PyTorch CUDA cache")
            except ImportError:
                print("\n⚠️ PyTorch not installed, cannot clear cache.")

        elif action == "kill":
            for p in processes:
                proc_name = pynvml.nvmlSystemGetProcessName(p.pid).decode(errors="ignore")
                try:
                    os.kill(p.pid, signal.SIGKILL)
                    print(f"❌ Killed {p.pid} ({proc_name})")
                except Exception as e:
                    print(f"⚠️ Could not kill {p.pid}: {e}")
            # Recurse once to clear caches now that the processes are gone.
            manage_gpu(action="clear_cache")
            gc.collect()
            gc.collect()
        return free_gb > size_gb
    except Exception:
        # Bug fix: the original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; keep the best-effort False-on-failure contract
        # for ordinary errors only.
        return False
115
+
116
def is_gpu_available(verbose=True):
    """Return True when CUDA is present AND currently usable.

    A tiny test allocation distinguishes "CUDA compiled in but the device is
    busy or in exclusive mode" from a genuinely usable GPU.

    Raises:
        RuntimeError: re-raised for CUDA errors other than busy/unavailable.
    """
    import torch
    if not torch.cuda.is_available():
        if verbose:
            print("CUDA not available.")
        return False

    try:
        # Try a tiny allocation to check if GPU is free & usable
        torch.empty(1, device="cuda")
        if verbose:
            print(f"CUDA available. Using device: {torch.cuda.get_device_name(0)}")
        return True
    except RuntimeError as e:
        if "CUDA-capable device(s) is/are busy or unavailable" in str(e) or \
           "CUDA error" in str(e):
            if verbose:
                # Bug fix: message previously read the garbled "Please CPU."
                print("CUDA detected but busy/unavailable. Please use CPU.")
            return False
        raise  # re-raise if it's some other unexpected error
tts_runner/engines/__init__.py ADDED
File without changes
tts_runner/engines/chatterbox.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import List
3
+ import spacy
4
+ import torchaudio as ta
5
+ import torch
6
+ from ..base import BaseTTS
7
+
8
+ class ChatterboxTTSProcessor(BaseTTS):
9
+ """Text-to-Speech processor using ChatterboxTTS."""
10
+
11
+ def __init__(self, stream_audio=False):
12
+ super().__init__("Chatterbox", stream_audio=stream_audio)
13
+ print("Initializing Chatterbox...")
14
+ from chatterbox.tts import ChatterboxTTS
15
+ print("Loading Modal...")
16
+ self.model = ChatterboxTTS.from_pretrained(device=self.device)
17
+
18
+ self.nlp=None
19
+ try:
20
+ self.nlp = spacy.load("en_core_web_sm")
21
+ except OSError:
22
+ from spacy.cli import download
23
+ download("en_core_web_sm")
24
+ self.nlp = spacy.load("en_core_web_sm")
25
+ print("Model loaded successfully")
26
+
27
+ def tokenize_sentences(self, text):
28
+ """Split text into sentences using spaCy.
29
+
30
+ Args:
31
+ text: Input text to tokenize
32
+
33
+ Returns:
34
+ List of sentence strings
35
+ """
36
+ doc = self.nlp(text)
37
+ return [sent.text.strip() for sent in doc.sents if sent.text.strip()]
38
+
39
+ def norm_and_token_count(self, text):
40
+ """Get normalized text and token count.
41
+
42
+ Args:
43
+ text: Input text to normalize and count tokens
44
+
45
+ Returns:
46
+ Tuple of (normalized_text, token_count)
47
+ """
48
+ from chatterbox.tts import punc_norm
49
+ with torch.inference_mode():
50
+ normalized = punc_norm(text)
51
+ tokens = self.model.tokenizer.text_to_tokens(normalized)
52
+ token_count = tokens.shape[1]
53
+
54
+ # Clear tokens from GPU memory immediately
55
+ if hasattr(tokens, 'cpu'):
56
+ tokens = tokens.cpu()
57
+ del tokens
58
+ return normalized, token_count
59
+
60
+ def split_sentences(self, text, max_tokens=200):
61
+ """Split text into chunks based on token count.
62
+
63
+ Args:
64
+ text: Input text to split
65
+ max_tokens: Maximum tokens per chunk
66
+
67
+ Returns:
68
+ List of text chunks
69
+ """
70
+ sentences = self.tokenize_sentences(text)
71
+ chunks = []
72
+ current = ""
73
+
74
+ for sentence in sentences:
75
+ # Check if sentence alone exceeds max tokens
76
+ _, sentence_tokens = self.norm_and_token_count(sentence)
77
+ if sentence_tokens > max_tokens:
78
+ # If current chunk has content, save it first
79
+ if current:
80
+ chunks.append(current.strip())
81
+ current = ""
82
+
83
+ # Split long sentence by words if it's too long
84
+ words = sentence.split()
85
+ temp_chunk = ""
86
+
87
+ for word in words:
88
+ test_chunk = (temp_chunk + " " + word).strip() if temp_chunk else word
89
+ _, test_tokens = self.norm_and_token_count(test_chunk)
90
+
91
+ if test_tokens <= max_tokens:
92
+ temp_chunk = test_chunk
93
+ else:
94
+ if temp_chunk:
95
+ chunks.append(temp_chunk.strip())
96
+ temp_chunk = word
97
+
98
+ if temp_chunk:
99
+ current = temp_chunk.strip()
100
+ continue
101
+
102
+ # Try adding sentence to current chunk
103
+ candidate = (current + " " + sentence).strip() if current else sentence.strip()
104
+ _, token_count = self.norm_and_token_count(candidate)
105
+
106
+ if token_count <= max_tokens:
107
+ current = candidate
108
+ else:
109
+ # Current chunk is full, save it and start new one
110
+ if current:
111
+ chunks.append(current.strip())
112
+ current = sentence.strip()
113
+
114
+ # Don't forget the last chunk
115
+ if current:
116
+ chunks.append(current.strip())
117
+
118
+ return chunks
119
+
120
+ def generate_chunk_audio_file(self, sentence: str, chunk_index: int, voice: str, speed: float) -> Path:
121
+ wav = self.model.generate(
122
+ sentence,
123
+ audio_prompt_path=voice,
124
+ temperature=speed
125
+ )
126
+
127
+ # Save sentence to numbered file
128
+ chunk_file = self.temp_output_dir / f"chunk_{chunk_index:04d}.wav"
129
+ ta.save(str(chunk_file), wav, self.model.sr)
130
+ del wav
131
+
132
+ if self.stream_audio:
133
+ self.queue_audio_for_streaming(str(chunk_file))
134
+ return chunk_file
135
+
136
+ def generate_audio_files(self, text: str, voice: str, speed: float, chunk_id: int = None):
137
+ sentences = self.split_sentences(text)
138
+ audio_files = []
139
+ total_sentences = len(sentences)
140
+
141
+ print(f"Processing {total_sentences} text sentences...")
142
+ with torch.inference_mode():
143
+ for i, sentence in enumerate(sentences):
144
+ if self.save_audio_file:
145
+ chunk_file = self.generate_chunk_audio_file(sentence, chunk_id if chunk_id else i, voice, speed)
146
+ audio_files.append(chunk_file)
147
+ print(f"Sentence {i + 1}/{total_sentences} processed -> {chunk_file.name} -> {sentence}")
148
+
149
+ return audio_files
tts_runner/engines/kitten.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from pathlib import Path
3
+ from ..base import BaseTTS
4
+
5
class KittenTTSProcessor(BaseTTS):
    """Text-to-Speech processor using KittenTTS with streaming support."""

    def __init__(self, stream_audio=False):
        super().__init__("Kitten", stream_audio=stream_audio)
        self.default_voice_index = 7
        # Available expressive voices (male/female pairs).
        self.voices = [
            'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f',
            'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f',
        ]
        print("Initialising Kitten...")
        from kittentts import KittenTTS  # deferred: heavy import only when this engine is used
        print("Loading Model...")  # BUGFIX: log typo, was "Loading Modal..."
        self.pipeline = KittenTTS("KittenML/kitten-tts-nano-0.2")
        print("Model loaded successfully")

    def generate_audio_files(self, text: str, voice: str, speed: float, chunk_id: int = None):
        """Synthesize each sentence of *text*; return the list of saved chunk Paths.

        ``speed`` is accepted for interface parity with the other engines but is
        not used by KittenTTS here.
        """
        sentences = self.split_sentences(text)
        audio_files = []
        total_sentences = len(sentences)

        print(f"Processing {total_sentences} text sentences...")
        for i, sentence in enumerate(sentences):
            audio = self.pipeline.generate(sentence, voice=voice)
            if self.stream_audio:
                self.queue_audio_for_streaming(audio)
            if self.save_audio_file:
                # BUGFIX: treat chunk_id=0 as a valid index (was falsy-checked).
                index = chunk_id if chunk_id is not None else i
                chunk_file = self.generate_chunk_audio_file(audio, index)
                audio_files.append(chunk_file)
                print(f"Sentence {i + 1} processed -> {chunk_file.name} -> {sentence}")

        return audio_files
tts_runner/engines/kokoro.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import List
3
+ from pathlib import Path
4
+ from ..base import BaseTTS
5
+
6
class KokoroTTSProcessor(BaseTTS):
    """Text-to-Speech processor using KokoroTTS."""

    def __init__(self, stream_audio=False, setup_signals=True):
        super().__init__("Kokoro", stream_audio=stream_audio, setup_signals=setup_signals)
        self.default_voice_index = 8
        self.default_speed = 1
        self.voices = [
            'af',  # Default voice is a 50-50 mix of Bella & Sarah
            'af_bella', 'af_sarah', 'am_adam', 'am_michael',
            'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
            'af_nicole', 'af_sky', 'af_heart', 'am_echo'
        ]
        print("Initialising Kokoro...")
        from kokoro import KPipeline  # deferred: heavy import only when this engine is used
        print("Loading Model...")  # BUGFIX: log typo, was "Loading Modal..."
        self.pipeline = KPipeline(lang_code='a', device=self.device)
        print("Model loaded successfully")

    def generate_audio_files(self, text: str, voice: str, speed: float, chunk_id: int = None):
        """Run the Kokoro pipeline over *text*, saving audio chunks and word timestamps.

        Side effects: writes per-chunk WAVs (via ``generate_chunk_audio_file``),
        streams audio when enabled, invokes ``word_callback`` for UI word
        highlighting, and dumps all word timings to ``final_output_timestamps``.

        Returns:
            List of saved chunk Paths (empty when ``save_audio_file`` is off).
        """
        generator = self.pipeline(
            text,
            voice=voice,
            speed=speed,
            split_pattern=r'\n+'
        )
        audio_files = []
        word_timestamps = []

        print("Processing text sentences...")

        for i, result in enumerate(generator):
            tokens = result.tokens
            audio = result.audio

            # Collect per-word timing both globally (JSON dump) and per-chunk (callback).
            callback_words = []
            sentence = ""
            for word in tokens:
                sentence += word.text
                word_data = {
                    "word": word.text,
                    "phonemes": word.phonemes,
                    "start_time": word.start_ts,
                    "end_time": word.end_ts,
                }
                word_timestamps.append(word_data)
                callback_words.append(word_data)

            if self.stream_audio:
                audio_duration = self.queue_audio_for_streaming(audio)

                # Call the callback if set (for UI highlighting)
                if getattr(self, 'word_callback', None):
                    self.word_callback(callback_words, audio_duration)
            if self.save_audio_file:
                # BUGFIX: treat chunk_id=0 as a valid index (was falsy-checked).
                index = chunk_id if chunk_id is not None else i
                chunk_file = self.generate_chunk_audio_file(audio, index)
                audio_files.append(chunk_file)
                print(f"Sentence {i + 1} processed -> {chunk_file.name} -> {sentence}")

        # Save timestamps to a JSON file for external consumers (e.g. subtitles).
        with open(self.final_output_timestamps, 'w') as f:
            json.dump(word_timestamps, f, indent=4)

        print(f'Timestamps saved as {self.final_output_timestamps}')

        return audio_files
tts_runner/runner.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Silence noisy third-party deprecation chatter before the heavy imports run.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
import logging
logging.getLogger().setLevel(logging.ERROR)

import argparse
import os
import sys
import time

# Module-level singleton: a long-running server process reuses one loaded engine.
TTS_ENGINE = None
# Synchronous CUDA kernel launches: clearer error locations at some speed cost.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
14
+
15
def server_mode(args):
    """Read "<speed>voice<index>" commands from stdin forever, synthesizing each.

    Each stdin line is split on the literal token "voice"; the left part is the
    speech speed and the right part the voice index.  Malformed parts fall back
    to speed=1 / voice=8.  The resulting output path is echoed to stdout.
    """
    while True:
        # BUGFIX: do not shadow the builtin `input`.
        line = sys.stdin.readline().strip()
        parts = line.split("voice")

        # BUGFIX: narrowed from bare `except:` so Ctrl-C / SystemExit propagate.
        try:
            args.speed = float(parts[0])
        except (ValueError, IndexError):
            args.speed = 1

        try:
            args.voice = int(parts[1])
        except (ValueError, IndexError):
            args.voice = 8

        output_path = initiate(args)

        print(output_path)
        sys.stdout.flush()
30
+
31
def current_env():
    """Detect current virtual environment."""
    venv_path = os.environ.get("VIRTUAL_ENV")
    if not venv_path:
        raise ValueError("Please set env first")
    return os.path.basename(venv_path)
37
+
38
def initiate(args):
    """Select a TTS engine, clear CUDA memory, then stream or save audio.

    Engine choice: explicit ``model`` ("kokoro" / "kitten" / anything else ->
    chatterbox), otherwise inferred from the active virtualenv name.

    Returns:
        True on completion, so ``main()`` can map the result to exit status 0.
    """
    # NOTE(review): dict-style args are only honored for `model`; the code below
    # still reads `args.stream_text` as an attribute β€” confirm dict callers.
    model = args.get('model') if isinstance(args, dict) else getattr(args, 'model', None)
    if not model:
        # No explicit model: infer the engine from the virtualenv name.
        if current_env() == "kokoro_env":
            from .engines.kokoro import KokoroTTSProcessor as TTSEngine
        elif current_env() == "kitten_env":
            from .engines.kitten import KittenTTSProcessor as TTSEngine
        else:
            from .engines.chatterbox import ChatterboxTTSProcessor as TTSEngine
    else:
        if model == "kokoro":
            from .engines.kokoro import KokoroTTSProcessor as TTSEngine
        elif model == "kitten":
            from .engines.kitten import KittenTTSProcessor as TTSEngine
        else:
            from .engines.chatterbox import ChatterboxTTSProcessor as TTSEngine

    global TTS_ENGINE
    if not TTS_ENGINE:
        TTS_ENGINE = TTSEngine(stream_audio=args.stream_text)

    # Best-effort GPU housekeeping between requests; never fatal.
    try:
        import torch
        import gc
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        gc.collect()
        gc.collect()
        time.sleep(1)
        print("\n🧹 Cleared PyTorch CUDA cache")
    except Exception:  # BUGFIX: bare `except:` also swallowed KeyboardInterrupt
        pass

    if args.stream_text:
        TTS_ENGINE.stream_real_time_text(args)
        text = TTS_ENGINE.read_content_file()
        for text_chunk in text.split():
            TTS_ENGINE.feed_text_chunk(text_chunk)
            time.sleep(0.1)  # Optional delay

        TTS_ENGINE.stop_all_streaming()
    else:
        TTS_ENGINE.save_audio(args)

    # BUGFIX: previously returned None, so `main()` always reported failure.
    return True
80
+
81
+
82
def main():
    """Main entry point: parse CLI flags and run in server or one-shot mode."""
    parser = argparse.ArgumentParser(
        description="Text-to-Speech processor"
    )
    parser.add_argument(
        "--server-mode",
        action="store_true",
        help="Run in server mode (read commands from stdin)"
    )
    parser.add_argument(
        "--speed",
        type=float,
        help="Speech speed"  # IDIOM: dropped pointless f-string prefix
    )
    parser.add_argument(
        "--voice",
        type=int,
        help="Voice index"  # IDIOM: dropped pointless f-string prefix
    )
    parser.add_argument(
        "--stream-text",
        action="store_true",
        help="Enable streaming text output"
    )
    parser.add_argument(
        "--model",
        help="model name"
    )

    args = parser.parse_args()

    if args.server_mode:
        server_mode(args)  # loops forever reading stdin
    else:
        success = initiate(args)
        return 0 if success else 1
120
+
121
+ if __name__ == "__main__":
122
+ main()
tts_runner/tui.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimalistic TTS TUI Reader with Word Highlighting
3
+ Requires: textual, pyperclip, kokoro-tts
4
+ Install: pip install textual pyperclip kokoro-tts
5
+ """
6
+ from textual.app import App, ComposeResult
7
+ from textual.widgets import TextArea, Button, Footer, RichLog, Static
8
+ from textual.containers import Horizontal, Vertical, Container
9
+ from textual.binding import Binding
10
+ from textual.reactive import reactive
11
+ from textual.widgets.text_area import Selection
12
+ from textual import work
13
+ import pyperclip
14
+ import threading
15
+ import time
16
+ import queue
17
+ import re
18
+ import bisect
19
+
20
+ from .kokoro_tts import KokoroTTSProcessor
21
+
22
+
23
class StatusBar(Static):
    """Custom status bar with TTS state"""
    # Docked single-row bar; colors match the app's dark theme below.
    DEFAULT_CSS = """
    StatusBar {
        dock: top;
        height: 1;
        background: #1a1a2e;
        color: #00ff9f;
        padding: 0 2;
        text-style: bold;
    }
    """

    # Reactive attribute: assigning to it triggers a re-render automatically.
    status_text = reactive("Ready")

    def render(self) -> str:
        # Textual calls this whenever status_text changes.
        return self.status_text
40
+
41
+
42
+ class TTSReader(App):
43
+ CSS = """
44
+ Screen {
45
+ background: #0f0f23;
46
+ }
47
+
48
+ StatusBar {
49
+ border-bottom: heavy #00ff9f;
50
+ }
51
+
52
+ #main_container {
53
+ height: 1fr;
54
+ margin: 2 3;
55
+ padding: 0;
56
+ }
57
+
58
+ #text_panel {
59
+ height: 1fr;
60
+ background: #1a1a2e;
61
+ border: heavy #00d4ff;
62
+ padding: 2;
63
+ }
64
+
65
+ TextArea {
66
+ height: 1fr;
67
+ background: #1a1a2e;
68
+ color: #e0e0e0;
69
+ border: none;
70
+ padding: 1;
71
+ scrollbar-gutter: stable;
72
+ scrollbar-color: #00ff9f #1a1a2e;
73
+ }
74
+
75
+ TextArea:focus {
76
+ border: none;
77
+ }
78
+
79
+ TextArea > .text-area--cursor {
80
+ background: #ff00ff;
81
+ color: #1a1a2e;
82
+ }
83
+
84
+ TextArea > .text-area--selection {
85
+ background: #ff00ff 40%;
86
+ }
87
+
88
+ #log_container {
89
+ height: 12;
90
+ margin: 0 3 2 3;
91
+ padding: 0;
92
+ }
93
+
94
+ #log_panel {
95
+ height: 1fr;
96
+ background: #1a1a2e;
97
+ border: heavy #ff00ff;
98
+ padding: 1 2;
99
+ }
100
+
101
+ RichLog {
102
+ height: 1fr;
103
+ background: transparent;
104
+ color: #00ff9f;
105
+ border: none;
106
+ padding: 0;
107
+ }
108
+
109
+ #controls {
110
+ height: auto;
111
+ dock: bottom;
112
+ background: #0f0f23;
113
+ padding: 2 3 3 3;
114
+ align: center middle;
115
+ }
116
+
117
+ #button_row {
118
+ width: auto;
119
+ height: auto;
120
+ align: center middle;
121
+ }
122
+
123
+ Button {
124
+ min-width: 14;
125
+ height: 3;
126
+ margin: 0 1;
127
+ border: heavy #00d4ff;
128
+ background: #1a1a2e;
129
+ color: #00d4ff;
130
+ text-style: bold;
131
+ }
132
+
133
+ Button:hover {
134
+ background: #00d4ff 20%;
135
+ color: #ffffff;
136
+ border: heavy #00ff9f;
137
+ }
138
+
139
+ Button:disabled {
140
+ opacity: 0.6;
141
+ border: heavy #00d4ff;
142
+ color: #00d4ff;
143
+ }
144
+
145
+ Footer {
146
+ background: #1a1a2e;
147
+ color: #00ff9f;
148
+ border-top: heavy #00d4ff;
149
+ }
150
+
151
+ Footer > .footer--highlight {
152
+ background: #ff00ff;
153
+ color: #ffffff;
154
+ }
155
+
156
+ Footer > .footer--key {
157
+ background: #00d4ff;
158
+ color: #0f0f23;
159
+ }
160
+
161
+ /* Smooth transitions */
162
+ Button {
163
+ transition: background 100ms, border 100ms, color 100ms;
164
+ }
165
+ """
166
+
167
+ BINDINGS = [
168
+ Binding("ctrl+v", "paste", "Paste", show=True),
169
+ Binding("ctrl+p", "toggle_play", "Play", show=True),
170
+ Binding("ctrl+s", "stop_audio", "Stop", show=True),
171
+ Binding("q", "quit", "Quit", show=True),
172
+ ]
173
+
174
+ is_playing = reactive(False)
175
+ tts_ready = reactive(False)
176
+
177
    def __init__(self, debug_mode=False):
        super().__init__()
        self.debug_mode = debug_mode            # when True, the RichLog panel is shown
        self.tts = None                         # KokoroTTSProcessor, set by _init_tts worker
        self.original_text = ""
        self._playback_worker = None            # thread running _tts_playback_thread
        self._highlight_worker = None           # thread running _highlight_loop
        self._word_queue = queue.Queue()        # word-timing items: playback -> highlighter
        self._stop_highlighting = threading.Event()
        self._pending_play_after_ready = False  # user hit Play before TTS finished loading
        self._word_spans = []                   # per-token (row, col) spans of the textarea text
        self._word_span_pos = 0                 # cursor into _word_spans during playback
189
+
190
    def compose(self) -> ComposeResult:
        """Build the widget tree: status bar, editor, optional debug log, buttons, footer."""
        yield StatusBar(id="status")

        with Vertical(id="main_container"):
            with Container(id="text_panel"):
                yield TextArea(
                    "",
                    id="text_input",
                    soft_wrap=True,
                    language="text",
                    theme="css"
                )

        # Debug log panel only exists when launched with --debug.
        if self.debug_mode:
            with Vertical(id="log_container"):
                with Container(id="log_panel"):
                    yield RichLog(id="log", wrap=True, markup=True, auto_scroll=True)

        with Horizontal(id="controls"):
            with Horizontal(id="button_row"):
                yield Button("Paste", id="paste")
                yield Button("Play", id="play")
                yield Button("Stop", id="stop")
                yield Button("Quit", id="quit")

        yield Footer()
216
+
217
    def on_mount(self):
        # Runs once the UI is ready: show loading state and start the TTS worker.
        self.update_status("β–Ά INITIALIZING...")
        self.update_controls()
        self.log_message("[dim]>>> Initializing TTS engine...[/dim]")
        self._init_tts()  # @work(thread=True): does not block the UI
222
+
223
    @work(thread=True)
    def _init_tts(self):
        """Load the TTS engine off the UI thread; auto-play if the user already hit Play."""
        try:
            self.tts = KokoroTTSProcessor(stream_audio=True, setup_signals=False)
            self.tts_ready = True
            # UI mutations from a worker thread must go through call_from_thread.
            self.call_from_thread(self.update_status, "Ready")
            self.call_from_thread(self.log_message, "[green]>>> TTS engine initialized[/green]")

            if self._pending_play_after_ready:
                self._pending_play_after_ready = False
                self.call_from_thread(self.action_toggle_play)

            self.call_from_thread(self.update_controls)
        except Exception as e:
            self.call_from_thread(self.update_status, "Error")
            self.call_from_thread(self.log_message, f"[red]>>> TTS initialization failed: {e}[/red]")
239
+
240
+ def update_status(self, text: str):
241
+ try:
242
+ status = self.query_one(StatusBar)
243
+ status.status_text = text
244
+ except Exception:
245
+ pass
246
+
247
+ # --- Actions ---
248
    def action_paste(self):
        """Replace the editor contents with the system clipboard text."""
        try:
            text = pyperclip.paste()
            if text:
                self.query_one("#text_input", TextArea).text = text
                self.log_message("[green]>>> Text pasted from clipboard[/green]")
                self.update_status("Text loaded")
        except Exception as e:
            # pyperclip raises when no clipboard backend exists (e.g. headless Linux).
            self.log_message(f"[red]>>> Paste failed: {e}[/red]")
257
+
258
    def action_toggle_play(self):
        """Toggle playback: stop if playing, otherwise play (or defer until TTS loads)."""
        textarea = self.query_one("#text_input", TextArea)
        text = textarea.text
        if text.strip():
            play_btn = self.query_one("#play", Button)
            stop_btn = self.query_one("#stop", Button)
            if self.is_playing:
                self.stop_audio()
            else:
                if not self.tts_ready:
                    # Engine still loading: remember the request; _init_tts will
                    # call this action again once ready.
                    self.log_message("[cyan]>>> TTS loading... will auto-play[/cyan]")
                    self.update_status("Loading...")
                    self._pending_play_after_ready = True
                    play_btn.disabled = True
                    stop_btn.disabled = True
                else:
                    self.play_audio()
275
+
276
    def action_stop_audio(self):
        # Keyboard-binding entry point (ctrl+s) for stopping playback.
        self.stop_audio()
278
+
279
    def action_quit(self):
        """Stop any playback and exit the application cleanly."""
        try:
            self.update_status("Exiting...")
        except Exception:
            pass
        self._ensure_tts_stopped()
        self.exit()  # cleanly exits the Textual app
286
+
287
+ def on_button_pressed(self, event: Button.Pressed):
288
+ mapping = {
289
+ "paste": self.action_paste,
290
+ "play": self.action_toggle_play,
291
+ "stop": self.action_stop_audio,
292
+ "quit": self.action_quit,
293
+ }
294
+ action = mapping.get(event.button.id)
295
+ if action:
296
+ action()
297
+
298
+ # --- Word span mapping ---
299
+ @staticmethod
300
+ def _normalize_token(s: str) -> str:
301
+ return re.sub(r"[^A-Za-z0-9']+", "", s).lower()
302
+
303
+ @staticmethod
304
+ def _line_starts(text: str):
305
+ starts = [0]
306
+ for i, ch in enumerate(text):
307
+ if ch == "\n":
308
+ starts.append(i + 1)
309
+ return starts
310
+
311
    def _build_word_spans(self, text: str):
        """Map every whitespace-delimited token of *text* to its (row, column) span.

        Used to translate TTS word callbacks into TextArea selections.
        """
        spans = []
        line_starts = self._line_starts(text)
        for m in re.finditer(r"\S+", text):
            abs_start, abs_end = m.start(), m.end()
            # Find the row containing abs_start, then convert offsets to columns.
            row = bisect.bisect_right(line_starts, abs_start) - 1
            start_col = abs_start - line_starts[row]
            end_col = abs_end - line_starts[row]
            spans.append({
                "token": m.group(),
                "row": row,
                "start_col": start_col,
                "end_col": end_col,
            })
        return spans
326
+
327
+ # --- Playback + Highlight ---
328
    def play_audio(self):
        """Start playback: spawn the highlighter and TTS threads for the current text."""
        if not self.tts_ready:
            self.log_message("[cyan]>>> TTS is still loading[/cyan]")
            return

        textarea = self.query_one("#text_input", TextArea)
        text = textarea.text
        if not text.strip():
            self.log_message("[cyan]>>> No text to read[/cyan]")
            return

        # Reset all playback state before spawning new workers.
        self._ensure_tts_stopped()
        self._word_spans = self._build_word_spans(text)
        self._word_span_pos = 0
        self.is_playing = True
        self._stop_highlighting.clear()

        # Drain any stale word-timing items from a previous run.
        while not self._word_queue.empty():
            try:
                self._word_queue.get_nowait()
            except queue.Empty:
                break

        textarea.focus()
        self.update_status("Playing...")

        # Consumer: paces word highlights against audio timing.
        self._highlight_worker = threading.Thread(target=self._highlight_loop, daemon=True)
        self._highlight_worker.start()

        # Producer: synthesizes audio and pushes word timings onto the queue.
        self._playback_worker = threading.Thread(
            target=self._tts_playback_thread, args=(text,), daemon=True
        )
        self._playback_worker.start()
361
+
362
    def _highlight_loop(self):
        """Worker thread: consume word-timing items and highlight each word in sync.

        Sleeps for the gap between consecutive word end times so the selection
        tracks the audio.  A None sentinel on the queue ends the loop.
        """
        prev_end_time = 0.0
        while not self._stop_highlighting.is_set():
            try:
                item = self._word_queue.get(timeout=0.1)  # timeout lets us re-check the stop flag
                if item is None:
                    break

                row, start_col, end_col, start_time, end_time = (
                    item["row"],
                    item["start_col"],
                    item["end_col"],
                    item["start_time"],
                    item["end_time"],
                )

                # UI mutation must happen on the main thread.
                self.call_from_thread(self._set_selection, row, start_col, end_col)

                # Timestamps reset at each audio chunk boundary; detect the wrap.
                if prev_end_time > end_time:
                    prev_end_time = -0.2  # add buffer when next audio plays
                duration = max(0.0, end_time - prev_end_time)
                prev_end_time = end_time

                time.sleep(duration)

            except queue.Empty:
                continue
            except Exception as e:
                self.call_from_thread(lambda: self.log_message(f"[red]>>> Highlight error: {e}[/red]"))
                break
392
+
393
    # NOTE(review): MATCH_WINDOW appears unused in this class β€” either remove it
    # or use it to widen the token search in _tts_playback_thread.
    MATCH_WINDOW = 12

    def _set_selection(self, row: int, start_col: int, end_col: int):
        """Highlight one word by selecting it and scrolling it into view (UI thread only)."""
        try:
            textarea = self.query_one("#text_input", TextArea)
            textarea.selection = Selection(start=(row, start_col), end=(row, end_col))
            textarea.focus()
            textarea.scroll_to(y=row, immediate=True)
        except Exception as e:
            self.log_message(f"[red]>>> Selection error: {e}[/red]")
403
+
404
+ def _tts_playback_thread(self, text: str):
405
+ try:
406
+ def word_cb(word_datas, audio_duration):
407
+ self.log_message(word_datas)
408
+ for wd_index, wd in enumerate(word_datas):
409
+ tts_word = wd.get("word", "")
410
+ if not tts_word or not any(ch.isalnum() for ch in tts_word):
411
+ continue
412
+
413
+ start_index = self._word_span_pos
414
+ end_index = min(start_index + 1, len(self._word_spans))
415
+
416
+ match_idx = None
417
+ for i in range(start_index, end_index):
418
+ if self._word_spans[i]["token"] == tts_word:
419
+ match_idx = i
420
+ break
421
+
422
+ if match_idx is None:
423
+ if self._word_span_pos < len(self._word_spans):
424
+ match_idx = self._word_span_pos
425
+ else:
426
+ continue
427
+
428
+ span = self._word_spans[match_idx]
429
+ self._word_span_pos = match_idx + 1
430
+
431
+ start_time = wd.get("start_time", 0.0)
432
+ end_time = wd.get("end_time", 0.0)
433
+ if start_time == None and end_time == None:
434
+ if wd_index + 1 == len(word_datas):
435
+ start_time = word_datas[wd_index - 1]["end_time"]
436
+ end_time = audio_duration
437
+ else:
438
+ start_time = word_datas[wd_index - 1]["end_time"]
439
+ end_time = word_datas[wd_index + 1]["start_time"]
440
+
441
+ self._word_queue.put(
442
+ {
443
+ "word": span["token"],
444
+ "row": span["row"],
445
+ "start_col": span["start_col"],
446
+ "end_col": span["end_col"],
447
+ "start_time": float(start_time) if start_time is not None else 0.0,
448
+ "end_time": float(end_time) if end_time is not None else 0.0,
449
+ }
450
+ )
451
+
452
+ self.tts.word_callback = word_cb
453
+ self.tts.start_audio_streaming()
454
+ self.tts.generate_audio_files(text, self.tts.voices[2], self.tts.default_speed)
455
+ self._word_queue.put(None)
456
+ self.tts.wait_for_audio_streaming_complete()
457
+ self.tts.stop_audio_streaming()
458
+ self.call_from_thread(self.update_status, "Completed")
459
+ self.call_from_thread(lambda: self.log_message("[green]>>> Playback complete[/green]"))
460
+ except Exception as e:
461
+ self.call_from_thread(lambda: self.log_message(f"[red]>>> Playback error: {e}[/red]"))
462
+ finally:
463
+ self.tts.word_callback = None
464
+ self._stop_highlighting.set()
465
+ self.is_playing = False
466
+ self.call_from_thread(self._cleanup_playback)
467
+
468
    def _ensure_tts_stopped(self):
        """Best-effort teardown of any in-flight TTS streaming and worker threads."""
        if self.tts:
            try:
                if hasattr(self.tts, "is_streaming") and self.tts.is_streaming:
                    if hasattr(self.tts, "force_stop_streaming"):
                        self.tts.force_stop_streaming()
                    # Drain any queued audio so a restart begins cleanly.
                    if hasattr(self.tts, "audio_queue"):
                        while not self.tts.audio_queue.empty():
                            try:
                                self.tts.audio_queue.get_nowait()
                            except Exception:
                                break
                    self.tts.is_streaming = False
            except Exception as e:
                self.log_message(f"[cyan]>>> Cleanup warning: {e}[/cyan]")

        self._stop_highlighting.set()
        # Short joins only: workers are daemons; don't block the UI thread long.
        if self._highlight_worker and self._highlight_worker.is_alive():
            self._highlight_worker.join(timeout=0.2)
        if self._playback_worker and self._playback_worker.is_alive():
            self._playback_worker.join(timeout=0.2)
        self.is_playing = False
490
+
491
    def stop_audio(self):
        """Stop playback and highlighting; no-op when nothing is playing."""
        if not self.is_playing:
            return
        self.is_playing = False
        self._stop_highlighting.set()
        self._ensure_tts_stopped()
        self._cleanup_playback()
        self.update_status("Stopped")
        self.log_message("[red]>>> Playback stopped[/red]")
500
+
501
    def _cleanup_playback(self):
        # Clear the word highlight and restore button enabled/disabled states.
        textarea = self.query_one("#text_input", TextArea)
        textarea.selection = Selection()
        self.update_controls()
505
+
506
+ # --- UI ---
507
+ def log_message(self, message):
508
+ if not self.debug_mode:
509
+ return
510
+ try:
511
+ self.query_one("#log", RichLog).write(message)
512
+ except Exception:
513
+ pass
514
+
515
    def watch_is_playing(self, is_playing):
        # Reactive watcher: runs whenever is_playing changes.
        self.update_controls()
        # NOTE(review): unlike update_controls, this query is unguarded β€” it may
        # raise if the watcher fires before the buttons are mounted. Confirm.
        play_btn = self.query_one("#play", Button)
        play_btn.label = "Play"  # no "Pause" state exists; label is always reset
519
+
520
+ def update_controls(self):
521
+ try:
522
+ play_btn = self.query_one("#play", Button)
523
+ stop_btn = self.query_one("#stop", Button)
524
+ play_btn.disabled = self.is_playing
525
+ stop_btn.disabled = not self.is_playing
526
+ except Exception:
527
+ pass
528
+
529
+
530
def main():
    """CLI entry point: run the reader, enabling the log panel with --debug."""
    import sys
    app = TTSReader(debug_mode="--debug" in sys.argv)
    app.run()
534
+
535
+ if __name__ == "__main__":
536
+ main()
tts_runner/voices/00007.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:697a4ec1903de653c6febe002e0f6fb6f2d0087cc3b1843efb745f8280466201
3
+ size 108844
tts_runner/voices/20250329-audio-american-female.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3cfd78a952c62b19af188a473948f20cc75ea429c172f7688b2cf1fecd13e2b
3
+ size 403244
tts_runner/voices/20250329-audio-american-male.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65daf63c956847c7e3b47c683090c237e6927b3443edc04e5545fdc0d9565502
3
+ size 712014
tts_runner/voices/Ellen-TTS-10.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc930644956f6e214eae4256d1f5328998dd22df8522d9710b77309d06702091
3
+ size 868072
tts_runner/voices/Ellen13y TTS-14.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abb2e9140f3fdf1af552ab42fccf5441f67c88a857589f1f504d557111f601b6
3
+ size 1219164
tts_runner/voices/Main-4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dab7226e72f1de2f7751c829466deb2f68f60ccbaf4f5265135073d6015a94ee
3
+ size 619208
tts_runner/voices/Simple guy.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77454f53011ec6565b72f62a7e6afdf8f564463990572c489af4d02c3622b14e
3
+ size 616364
tts_runner/voices/VEGETA_4_504_US.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d520d15eb7db5a1e100c529ba160ef6945260799ebf16ca3b79e64b29330ed7c
3
+ size 157302
tts_runner/voices/VEGETA_4_532_US.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:906c1ffae0f1661841c7d4d2970811dde8b64e92cf894db545bf5c3453e12e22
3
+ size 152976
tts_runner/voices/bbc_news.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5152e43ef1b2f72c95d64f216179b52d0b68d754785bb85b69ed9111036aa43
3
+ size 317214
tts_runner/voices/en_woman.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1d49dc69f3b0731ed7b10ddf51dfc8f73465d4323f45841d93583d8b1e4d3e6
3
+ size 313272
tts_runner/voices/kratos(ambient)_en.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e725a35b3aec489a95877c3940e2c6f6cfd24a7ca4692c8680f18a368674cfa8
3
+ size 2759900
tts_runner/voices/voice_preview_cocky male villain voice.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52c844d16cc2d459e9a9881f9fec6fd7580ac3fb6633de11d4bf6913107c9bd2
3
+ size 182693
tts_runner/voices/voice_preview_cocky male villain voice.mp3:Zone.Identifier ADDED
File without changes
tts_runner/voices/voice_preview_david castlemore - newsreader and educator.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0961e2fb1a2437c22b1ca8490965309fb5eec52d3f4f67b62e45fcf280d3b755
3
+ size 152973
tts_runner/voices/voice_preview_kelly - storytelling & motivational content.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d71ad08000aa9178bfad7c30a55d6b6607d51aa93c8d87a85f30015237ad64a
3
+ size 86771
tts_runner/voices/voice_preview_motivational coach - leader.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc3ae8151229a195be4472e042f4bd03a0e8b7ce33995dbbd1d6fccc28979f15
3
+ size 201038
tts_runner/voices/voice_preview_sevan bomar - black motivational speaker.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b36725add8170141c740632a4a870b5f2bd8523e9361a84ca49d7131c9e8b2b
3
+ size 114938
worker.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import time
3
+ import os
4
+ import subprocess
5
+ import json
6
+ import shlex
7
+ from datetime import datetime
8
+
9
# Working directory the STT command runs in; its output JSON is located
# relative to this directory.
CWD = "./"
# Command used to run speech-to-text; presumably an executable on PATH —
# TODO confirm (the name suggests a CLI wrapper, not a Python interpreter).
PYTHON_PATH = "stt-transcribe"
# Model identifier passed to the STT tool via --model.
STT_MODEL_NAME = "fasterwhispher"
POLL_INTERVAL = 3  # seconds between database polls when the queue is empty
13
+
14
def process_audio(file_id, filepath):
    """Run the external STT tool on an audio file and return its transcription.

    Args:
        file_id: Database id of the audio row (used only in error logging).
        filepath: Path to the audio file to transcribe.

    Returns:
        Tuple ``(caption, None)`` on success, or ``(None, error_message)``
        on any failure (subprocess error, missing/invalid output file, ...).
    """
    try:
        abs_path = os.path.abspath(filepath)
        print(f"🔄 Running STT on: {abs_path}")

        # Invoke the STT tool with an argument list instead of a shell
        # string: no shell injection surface, and no need for shlex quoting.
        # The former `cd {CWD} && ...` prefix is redundant because cwd=CWD
        # is passed directly to subprocess.run.
        subprocess.run(
            [PYTHON_PATH, "--input", abs_path, "--model", STT_MODEL_NAME],
            check=True,
            cwd=CWD,
            env={
                **os.environ,
                'PYTHONUNBUFFERED': '1',
                'CUDA_LAUNCH_BLOCKING': '1',
                'USE_CPU_IF_POSSIBLE': 'true'
            }
        )

        # The tool writes its result to a fixed JSON file under CWD.
        output_path = os.path.join(CWD, 'temp_dir', 'output_transcription.json')
        with open(output_path, 'r') as file:
            result = json.loads(file.read().strip())

        # Extract caption text, falling back through known key variants;
        # as a last resort stringify the whole payload.
        caption = result.get('text', '') or result.get('transcription', '') or str(result)

        return caption, None

    except Exception as e:
        # Boundary handler: the worker loop expects an (None, error) pair
        # rather than an exception, so it can mark the row 'failed'.
        print(f"❌ Error processing file {file_id}: {str(e)}")
        return None, str(e)
49
+
50
def update_status(file_id, status, caption=None, error=None):
    """Update the processing status of an audio_files row.

    For ``'completed'`` the caption and a processed_at timestamp are stored;
    for ``'failed'`` the error text is stored in the caption column prefixed
    with ``"Error: "``; any other status updates the status column only.

    Args:
        file_id: Primary key of the row in audio_files.
        status: New status value ('processing', 'completed', 'failed', ...).
        caption: Transcription text, used when status == 'completed'.
        error: Error message, used when status == 'failed'.
    """
    conn = sqlite3.connect('audio_captions.db')
    try:
        c = conn.cursor()
        if status in ('completed', 'failed'):
            # Both terminal states run the same UPDATE; only the text stored
            # in the caption column differs.
            text = caption if status == 'completed' else f"Error: {error}"
            c.execute('''UPDATE audio_files
                         SET status = ?, caption = ?, processed_at = ?
                         WHERE id = ?''',
                      (status, text, datetime.now().isoformat(), file_id))
        else:
            c.execute('UPDATE audio_files SET status = ? WHERE id = ?',
                      (status, file_id))
        conn.commit()
    finally:
        # Release the connection even if the UPDATE raises.
        conn.close()
70
+
71
def worker_loop():
    """Poll the database forever and transcribe queued audio files.

    Picks the oldest 'not_started' row, marks it 'processing', runs STT via
    process_audio, then records 'completed' (deleting the source audio) or
    'failed' (keeping the file for debugging). Sleeps POLL_INTERVAL seconds
    when the queue is empty or after an unexpected error.
    """
    print("🤖 STT Worker started. Monitoring for new audio files...")
    print("🗑️ Audio files will be deleted after successful processing\n")

    while True:
        try:
            # Fetch the oldest unprocessed file, if any.
            conn = sqlite3.connect('audio_captions.db')
            conn.row_factory = sqlite3.Row
            c = conn.cursor()
            c.execute('''SELECT * FROM audio_files
                         WHERE status = 'not_started'
                         ORDER BY created_at ASC
                         LIMIT 1''')
            row = c.fetchone()
            conn.close()

            if row:
                file_id = row['id']
                filepath = row['filepath']
                filename = row['filename']

                print(f"\n{'='*60}")
                # NOTE(review): original prints contained a literal
                # "(unknown)" placeholder while `filename` was read but
                # never used — log the actual filename instead.
                print(f"🎵 Processing: {filename}")
                print(f"📁 ID: {file_id}")
                print(f"{'='*60}")

                # Mark as in-progress so other observers see it is claimed.
                update_status(file_id, 'processing')

                caption, error = process_audio(file_id, filepath)

                if caption:
                    print(f"✅ Successfully processed: {filename}")
                    print(f"📄 Caption preview: {caption[:100]}...")
                    update_status(file_id, 'completed', caption=caption)

                    # Free disk space once the transcription is stored.
                    if os.path.exists(filepath):
                        os.remove(filepath)
                        print(f"🗑️ Deleted audio file: {filepath}")
                else:
                    print(f"❌ Failed to process: {filename}")
                    print(f"Error: {error}")
                    update_status(file_id, 'failed', error=error)
                    # Keep the audio file on failure (for debugging).
            else:
                # Queue is empty — wait before polling again.
                time.sleep(POLL_INTERVAL)

        except Exception as e:
            # Never let the worker die; log the error and retry after a pause.
            print(f"⚠️ Worker error: {str(e)}")
            time.sleep(POLL_INTERVAL)
126
+
127
if __name__ == '__main__':
    # Standalone entry point for testing/debugging; app.py must have
    # created the database first.
    if os.path.exists('audio_captions.db'):
        divider = "=" * 60
        for banner_line in (
            "\n" + divider,
            "🚀 Starting STT Worker (Standalone Mode)",
            divider,
            "⚠️ Note: Worker is now embedded in app.py",
            "⚠️ This standalone mode is for testing/debugging only",
            divider + "\n",
        ):
            print(banner_line)
        worker_loop()
    else:
        print("❌ Database not found. Please run app.py first to initialize.")