Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -179,9 +179,9 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
|
|
| 179 |
return None, "Error: Please upload a reference audio file (.wav or .mp3)."
|
| 180 |
|
| 181 |
try:
|
| 182 |
-
print(f"
|
| 183 |
print(f" Text: '{text_to_speak}'")
|
| 184 |
-
print(f" Audio
|
| 185 |
print(f" Exaggeration: {exaggeration}")
|
| 186 |
print(f" CFG/Pace: {cfg_pace}")
|
| 187 |
print(f" Random Seed: {random_seed}")
|
|
@@ -206,7 +206,7 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
|
|
| 206 |
except:
|
| 207 |
sample_rate = 24000
|
| 208 |
|
| 209 |
-
print(f"Audio generated successfully
|
| 210 |
|
| 211 |
if isinstance(output_wav_data, str):
|
| 212 |
return output_wav_data, "Success: Audio generated successfully!"
|
|
@@ -219,12 +219,11 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
|
|
| 219 |
return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
|
| 220 |
|
| 221 |
except Exception as e:
|
| 222 |
-
print(f"ERROR: Failed during audio generation
|
| 223 |
-
print("Detailed error trace for audio generation
|
| 224 |
traceback.print_exc()
|
| 225 |
return None, f"Error during audio generation: {str(e)}. Check logs for more details."
|
| 226 |
|
| 227 |
-
# Updated clone_voice_api function with detailed logging
|
| 228 |
def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
|
| 229 |
import requests
|
| 230 |
import tempfile
|
|
@@ -233,120 +232,60 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
|
|
| 233 |
|
| 234 |
temp_audio_path = None
|
| 235 |
try:
|
| 236 |
-
print(f"=== API CALL DEBUG ===")
|
| 237 |
-
print(f"Text: {text_to_speak}")
|
| 238 |
-
print(f"Audio URL type: {type(reference_audio_url)}")
|
| 239 |
-
print(f"Audio URL length: {len(str(reference_audio_url)) if reference_audio_url else 0}")
|
| 240 |
-
print(f"Audio URL preview: {str(reference_audio_url)[:100]}...")
|
| 241 |
-
print(f"Parameters: exag={exaggeration}, cfg={cfg_pace}, seed={random_seed}, temp={temperature}")
|
| 242 |
-
|
| 243 |
-
# Validate inputs
|
| 244 |
-
if not text_to_speak or text_to_speak.strip() == "":
|
| 245 |
-
return None, "Error: Please enter some text to speak."
|
| 246 |
-
|
| 247 |
-
if not reference_audio_url:
|
| 248 |
-
return None, "Error: Please provide reference audio."
|
| 249 |
-
|
| 250 |
-
print("Processing audio data...")
|
| 251 |
-
|
| 252 |
if reference_audio_url.startswith('data:audio'):
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
elif 'wav' in header:
|
| 265 |
-
ext = '.wav'
|
| 266 |
-
else:
|
| 267 |
-
ext = '.wav'
|
| 268 |
-
|
| 269 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
|
| 270 |
-
temp_file.write(audio_data)
|
| 271 |
-
temp_audio_path = temp_file.name
|
| 272 |
-
|
| 273 |
-
print(f"Created temporary audio file: {temp_audio_path}")
|
| 274 |
-
print(f"File exists: {os.path.exists(temp_audio_path)}")
|
| 275 |
-
print(f"File size: {os.path.getsize(temp_audio_path)} bytes")
|
| 276 |
-
|
| 277 |
-
except Exception as audio_error:
|
| 278 |
-
print(f"Audio processing error: {audio_error}")
|
| 279 |
-
return None, f"Error processing audio data: {str(audio_error)}"
|
| 280 |
-
|
| 281 |
elif reference_audio_url.startswith('http'):
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
temp_file.write(response.content)
|
| 294 |
-
temp_audio_path = temp_file.name
|
| 295 |
-
print(f"Downloaded audio to: {temp_audio_path}")
|
| 296 |
-
except Exception as download_error:
|
| 297 |
-
print(f"Download error: {download_error}")
|
| 298 |
-
return None, f"Error downloading audio: {str(download_error)}"
|
| 299 |
else:
|
| 300 |
-
print("Using direct file path...")
|
| 301 |
temp_audio_path = reference_audio_url
|
| 302 |
|
| 303 |
-
print(f"Calling clone_voice with:")
|
| 304 |
-
print(f" Text: {text_to_speak}")
|
| 305 |
-
print(f" Audio path: {temp_audio_path}")
|
| 306 |
-
print(f" Parameters: {exaggeration}, {cfg_pace}, {random_seed}, {temperature}")
|
| 307 |
-
|
| 308 |
-
# Call the main function
|
| 309 |
audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
|
| 310 |
-
|
| 311 |
-
print(f"clone_voice returned:")
|
| 312 |
-
print(f" Audio output type: {type(audio_output)}")
|
| 313 |
-
print(f" Status: {status}")
|
| 314 |
|
| 315 |
-
# Cleanup
|
| 316 |
if temp_audio_path and temp_audio_path != reference_audio_url:
|
| 317 |
try:
|
| 318 |
os.unlink(temp_audio_path)
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
print(f"Cleanup error: {cleanup_error}")
|
| 322 |
-
|
| 323 |
return audio_output, status
|
| 324 |
-
|
| 325 |
except Exception as e:
|
| 326 |
-
print(f"=== CRITICAL ERROR ===")
|
| 327 |
-
print(f"Error type: {type(e)}")
|
| 328 |
-
print(f"Error message: {str(e)}")
|
| 329 |
-
import traceback
|
| 330 |
-
traceback.print_exc()
|
| 331 |
-
|
| 332 |
-
# Cleanup on error
|
| 333 |
if temp_audio_path and temp_audio_path != reference_audio_url:
|
| 334 |
try:
|
| 335 |
os.unlink(temp_audio_path)
|
| 336 |
except:
|
| 337 |
pass
|
| 338 |
-
|
| 339 |
return None, f"API Error: {str(e)}"
|
| 340 |
|
| 341 |
def main():
|
| 342 |
print("Starting Advanced Gradio interface...")
|
| 343 |
-
|
| 344 |
-
|
|
|
|
| 345 |
gr.Markdown("# ποΈ Advanced Chatterbox Voice Cloning")
|
| 346 |
gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
|
| 347 |
-
|
| 348 |
with gr.Row():
|
| 349 |
with gr.Column(scale=2):
|
|
|
|
| 350 |
text_input = gr.Textbox(
|
| 351 |
label="Text to Speak",
|
| 352 |
placeholder="Enter the text you want the cloned voice to say...",
|
|
@@ -357,9 +296,10 @@ def main():
|
|
| 357 |
label="Reference Audio (Upload a short .wav or .mp3 clip)",
|
| 358 |
sources=["upload", "microphone"]
|
| 359 |
)
|
|
|
|
| 360 |
with gr.Accordion("π§ Advanced Settings", open=False):
|
| 361 |
with gr.Row():
|
| 362 |
-
|
| 363 |
minimum=0.25,
|
| 364 |
maximum=1.0,
|
| 365 |
value=0.6,
|
|
@@ -367,7 +307,7 @@ def main():
|
|
| 367 |
label="Exaggeration",
|
| 368 |
info="Controls voice characteristic emphasis"
|
| 369 |
)
|
| 370 |
-
|
| 371 |
minimum=0.2,
|
| 372 |
maximum=1.0,
|
| 373 |
value=0.3,
|
|
@@ -376,13 +316,13 @@ def main():
|
|
| 376 |
info="Classifier-free guidance weight"
|
| 377 |
)
|
| 378 |
with gr.Row():
|
| 379 |
-
|
| 380 |
value=0,
|
| 381 |
label="Random Seed",
|
| 382 |
info="Set to 0 for random results",
|
| 383 |
precision=0
|
| 384 |
)
|
| 385 |
-
|
| 386 |
minimum=0.05,
|
| 387 |
maximum=2.0,
|
| 388 |
value=0.6,
|
|
@@ -390,28 +330,14 @@ def main():
|
|
| 390 |
label="Temperature",
|
| 391 |
info="Controls randomness in generation"
|
| 392 |
)
|
|
|
|
| 393 |
generate_btn = gr.Button("π΅ Generate Voice Clone", variant="primary", size="lg")
|
| 394 |
-
|
| 395 |
with gr.Column(scale=1):
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
)
|
| 401 |
-
status_output = gr.Textbox(
|
| 402 |
-
label="Status",
|
| 403 |
-
interactive=False,
|
| 404 |
-
lines=2
|
| 405 |
-
)
|
| 406 |
-
|
| 407 |
-
# This is the key part - create the API endpoint properly
|
| 408 |
-
generate_btn.click(
|
| 409 |
-
fn=clone_voice_api, # Use the API-ready function
|
| 410 |
-
inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
|
| 411 |
-
outputs=[audio_output, status_output],
|
| 412 |
-
api_name="predict" # This creates /api/predict endpoint
|
| 413 |
-
)
|
| 414 |
-
|
| 415 |
with gr.Accordion("π Examples", open=False):
|
| 416 |
gr.Examples(
|
| 417 |
examples=[
|
|
@@ -419,14 +345,43 @@ def main():
|
|
| 419 |
["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
|
| 420 |
["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
|
| 421 |
],
|
| 422 |
-
inputs=[text_input, audio_input,
|
| 423 |
-
outputs=[audio_output, status_output],
|
| 424 |
-
fn=clone_voice_api,
|
| 425 |
-
cache_examples=False
|
| 426 |
)
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
server_name="0.0.0.0",
|
| 431 |
server_port=7860,
|
| 432 |
show_error=True,
|
|
|
|
| 179 |
return None, "Error: Please upload a reference audio file (.wav or .mp3)."
|
| 180 |
|
| 181 |
try:
|
| 182 |
+
print(f"Received request:")
|
| 183 |
print(f" Text: '{text_to_speak}'")
|
| 184 |
+
print(f" Audio: '{reference_audio_path}'")
|
| 185 |
print(f" Exaggeration: {exaggeration}")
|
| 186 |
print(f" CFG/Pace: {cfg_pace}")
|
| 187 |
print(f" Random Seed: {random_seed}")
|
|
|
|
| 206 |
except:
|
| 207 |
sample_rate = 24000
|
| 208 |
|
| 209 |
+
print(f"Audio generated successfully. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
|
| 210 |
|
| 211 |
if isinstance(output_wav_data, str):
|
| 212 |
return output_wav_data, "Success: Audio generated successfully!"
|
|
|
|
| 219 |
return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
|
| 220 |
|
| 221 |
except Exception as e:
|
| 222 |
+
print(f"ERROR: Failed during audio generation: {e}")
|
| 223 |
+
print("Detailed error trace for audio generation:")
|
| 224 |
traceback.print_exc()
|
| 225 |
return None, f"Error during audio generation: {str(e)}. Check logs for more details."
|
| 226 |
|
|
|
|
| 227 |
def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
|
| 228 |
import requests
|
| 229 |
import tempfile
|
|
|
|
| 232 |
|
| 233 |
temp_audio_path = None
|
| 234 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
if reference_audio_url.startswith('data:audio'):
|
| 236 |
+
header, encoded = reference_audio_url.split(',', 1)
|
| 237 |
+
audio_data = base64.b64decode(encoded)
|
| 238 |
+
if 'mp3' in header:
|
| 239 |
+
ext = '.mp3'
|
| 240 |
+
elif 'wav' in header:
|
| 241 |
+
ext = '.wav'
|
| 242 |
+
else:
|
| 243 |
+
ext = '.wav'
|
| 244 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
|
| 245 |
+
temp_file.write(audio_data)
|
| 246 |
+
temp_audio_path = temp_file.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
elif reference_audio_url.startswith('http'):
|
| 248 |
+
response = requests.get(reference_audio_url)
|
| 249 |
+
response.raise_for_status()
|
| 250 |
+
if reference_audio_url.endswith('.mp3'):
|
| 251 |
+
ext = '.mp3'
|
| 252 |
+
elif reference_audio_url.endswith('.wav'):
|
| 253 |
+
ext = '.wav'
|
| 254 |
+
else:
|
| 255 |
+
ext = '.wav'
|
| 256 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
|
| 257 |
+
temp_file.write(response.content)
|
| 258 |
+
temp_audio_path = temp_file.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
else:
|
|
|
|
| 260 |
temp_audio_path = reference_audio_url
|
| 261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
|
|
|
| 264 |
if temp_audio_path and temp_audio_path != reference_audio_url:
|
| 265 |
try:
|
| 266 |
os.unlink(temp_audio_path)
|
| 267 |
+
except:
|
| 268 |
+
pass
|
|
|
|
|
|
|
| 269 |
return audio_output, status
|
|
|
|
| 270 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
if temp_audio_path and temp_audio_path != reference_audio_url:
|
| 272 |
try:
|
| 273 |
os.unlink(temp_audio_path)
|
| 274 |
except:
|
| 275 |
pass
|
|
|
|
| 276 |
return None, f"API Error: {str(e)}"
|
| 277 |
|
| 278 |
def main():
|
| 279 |
print("Starting Advanced Gradio interface...")
|
| 280 |
+
|
| 281 |
+
# Create a Blocks interface with multiple functions
|
| 282 |
+
with gr.Blocks(title="ποΈ Advanced Chatterbox Voice Cloning") as demo:
|
| 283 |
gr.Markdown("# ποΈ Advanced Chatterbox Voice Cloning")
|
| 284 |
gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
|
| 285 |
+
|
| 286 |
with gr.Row():
|
| 287 |
with gr.Column(scale=2):
|
| 288 |
+
# Main interface inputs
|
| 289 |
text_input = gr.Textbox(
|
| 290 |
label="Text to Speak",
|
| 291 |
placeholder="Enter the text you want the cloned voice to say...",
|
|
|
|
| 296 |
label="Reference Audio (Upload a short .wav or .mp3 clip)",
|
| 297 |
sources=["upload", "microphone"]
|
| 298 |
)
|
| 299 |
+
|
| 300 |
with gr.Accordion("π§ Advanced Settings", open=False):
|
| 301 |
with gr.Row():
|
| 302 |
+
exaggeration_input = gr.Slider(
|
| 303 |
minimum=0.25,
|
| 304 |
maximum=1.0,
|
| 305 |
value=0.6,
|
|
|
|
| 307 |
label="Exaggeration",
|
| 308 |
info="Controls voice characteristic emphasis"
|
| 309 |
)
|
| 310 |
+
cfg_pace_input = gr.Slider(
|
| 311 |
minimum=0.2,
|
| 312 |
maximum=1.0,
|
| 313 |
value=0.3,
|
|
|
|
| 316 |
info="Classifier-free guidance weight"
|
| 317 |
)
|
| 318 |
with gr.Row():
|
| 319 |
+
seed_input = gr.Number(
|
| 320 |
value=0,
|
| 321 |
label="Random Seed",
|
| 322 |
info="Set to 0 for random results",
|
| 323 |
precision=0
|
| 324 |
)
|
| 325 |
+
temperature_input = gr.Slider(
|
| 326 |
minimum=0.05,
|
| 327 |
maximum=2.0,
|
| 328 |
value=0.6,
|
|
|
|
| 330 |
label="Temperature",
|
| 331 |
info="Controls randomness in generation"
|
| 332 |
)
|
| 333 |
+
|
| 334 |
generate_btn = gr.Button("π΅ Generate Voice Clone", variant="primary", size="lg")
|
| 335 |
+
|
| 336 |
with gr.Column(scale=1):
|
| 337 |
+
# Outputs
|
| 338 |
+
audio_output = gr.Audio(label="Generated Audio", type="numpy")
|
| 339 |
+
status_output = gr.Textbox(label="Status", lines=2)
|
| 340 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
with gr.Accordion("π Examples", open=False):
|
| 342 |
gr.Examples(
|
| 343 |
examples=[
|
|
|
|
| 345 |
["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
|
| 346 |
["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
|
| 347 |
],
|
| 348 |
+
inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input]
|
|
|
|
|
|
|
|
|
|
| 349 |
)
|
| 350 |
+
|
| 351 |
+
# Main interface function (for file uploads)
|
| 352 |
+
generate_btn.click(
|
| 353 |
+
fn=clone_voice_api,
|
| 354 |
+
inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input],
|
| 355 |
+
outputs=[audio_output, status_output],
|
| 356 |
+
api_name="predict"
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
+
# API function for base64 data (for external API calls)
|
| 360 |
+
def clone_voice_base64_api(text_to_speak, reference_audio_b64, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
|
| 361 |
+
"""API function that accepts base64 audio data directly."""
|
| 362 |
+
return clone_voice_api(text_to_speak, reference_audio_b64, exaggeration, cfg_pace, random_seed, temperature)
|
| 363 |
+
|
| 364 |
+
# Hidden inputs/outputs for the base64 API
|
| 365 |
+
with gr.Row(visible=False):
|
| 366 |
+
api_text_input = gr.Textbox()
|
| 367 |
+
api_audio_input = gr.Textbox() # This will receive base64 data URL
|
| 368 |
+
api_exaggeration_input = gr.Slider(minimum=0.25, maximum=1.0, value=0.6)
|
| 369 |
+
api_cfg_pace_input = gr.Slider(minimum=0.2, maximum=1.0, value=0.3)
|
| 370 |
+
api_seed_input = gr.Number(value=0, precision=0)
|
| 371 |
+
api_temperature_input = gr.Slider(minimum=0.05, maximum=2.0, value=0.6)
|
| 372 |
+
api_audio_output = gr.Audio(type="numpy")
|
| 373 |
+
api_status_output = gr.Textbox()
|
| 374 |
+
api_btn = gr.Button()
|
| 375 |
+
|
| 376 |
+
# API endpoint for base64 data
|
| 377 |
+
api_btn.click(
|
| 378 |
+
fn=clone_voice_base64_api,
|
| 379 |
+
inputs=[api_text_input, api_audio_input, api_exaggeration_input, api_cfg_pace_input, api_seed_input, api_temperature_input],
|
| 380 |
+
outputs=[api_audio_output, api_status_output],
|
| 381 |
+
api_name="clone_voice"
|
| 382 |
+
)
|
| 383 |
+
|
| 384 |
+
demo.launch(
|
| 385 |
server_name="0.0.0.0",
|
| 386 |
server_port=7860,
|
| 387 |
show_error=True,
|