Spaces:

ramimu
/

voice_cloning

Sleeping

App Files Community

ramimu committed on May 31

Commit

7be21d2

verified ·

1 Parent(s): 5961c78

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -126

app.py CHANGED Viewed

@@ -179,9 +179,9 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
     try:
-        print(f"clone_voice function called:")
         print(f"  Text: '{text_to_speak}'")
-        print(f"  Audio Path: '{reference_audio_path}'")
         print(f"  Exaggeration: {exaggeration}")
         print(f"  CFG/Pace: {cfg_pace}")
         print(f"  Random Seed: {random_seed}")
@@ -206,7 +206,7 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
         except:
             sample_rate = 24000
-        print(f"Audio generated successfully by clone_voice. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
         if isinstance(output_wav_data, str):
             return output_wav_data, "Success: Audio generated successfully!"
@@ -219,12 +219,11 @@ def clone_voice(text_to_speak, reference_audio_path, exaggeration=0.6, cfg_pace=
             return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
     except Exception as e:
-        print(f"ERROR: Failed during audio generation in clone_voice: {e}")
-        print("Detailed error trace for audio generation in clone_voice:")
         traceback.print_exc()
         return None, f"Error during audio generation: {str(e)}. Check logs for more details."
-# Updated clone_voice_api function with detailed logging
 def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
     import requests
     import tempfile
@@ -233,120 +232,60 @@ def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pa
     temp_audio_path = None
     try:
-        print(f"=== API CALL DEBUG ===")
-        print(f"Text: {text_to_speak}")
-        print(f"Audio URL type: {type(reference_audio_url)}")
-        print(f"Audio URL length: {len(str(reference_audio_url)) if reference_audio_url else 0}")
-        print(f"Audio URL preview: {str(reference_audio_url)[:100]}...")
-        print(f"Parameters: exag={exaggeration}, cfg={cfg_pace}, seed={random_seed}, temp={temperature}")
-        # Validate inputs
-        if not text_to_speak or text_to_speak.strip() == "":
-            return None, "Error: Please enter some text to speak."
-        if not reference_audio_url:
-            return None, "Error: Please provide reference audio."
-        print("Processing audio data...")
         if reference_audio_url.startswith('data:audio'):
-            print("Processing base64 audio data...")
-            try:
-                header, encoded = reference_audio_url.split(',', 1)
-                print(f"Header: {header}")
-                print(f"Encoded data length: {len(encoded)}")
-                audio_data = base64.b64decode(encoded)
-                print(f"Decoded audio data size: {len(audio_data)} bytes")
-                if 'mp3' in header:
-                    ext = '.mp3'
-                elif 'wav' in header:
-                    ext = '.wav'
-                else:
-                    ext = '.wav'
-                with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
-                    temp_file.write(audio_data)
-                    temp_audio_path = temp_file.name
-                print(f"Created temporary audio file: {temp_audio_path}")
-                print(f"File exists: {os.path.exists(temp_audio_path)}")
-                print(f"File size: {os.path.getsize(temp_audio_path)} bytes")
-            except Exception as audio_error:
-                print(f"Audio processing error: {audio_error}")
-                return None, f"Error processing audio data: {str(audio_error)}"
         elif reference_audio_url.startswith('http'):
-            print("Processing HTTP audio URL...")
-            try:
-                response = requests.get(reference_audio_url)
-                response.raise_for_status()
-                if reference_audio_url.endswith('.mp3'):
-                    ext = '.mp3'
-                elif reference_audio_url.endswith('.wav'):
-                    ext = '.wav'
-                else:
-                    ext = '.wav'
-                with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
-                    temp_file.write(response.content)
-                    temp_audio_path = temp_file.name
-                print(f"Downloaded audio to: {temp_audio_path}")
-            except Exception as download_error:
-                print(f"Download error: {download_error}")
-                return None, f"Error downloading audio: {str(download_error)}"
         else:
-            print("Using direct file path...")
             temp_audio_path = reference_audio_url
-        print(f"Calling clone_voice with:")
-        print(f"  Text: {text_to_speak}")
-        print(f"  Audio path: {temp_audio_path}")
-        print(f"  Parameters: {exaggeration}, {cfg_pace}, {random_seed}, {temperature}")
-        # Call the main function
         audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
-        print(f"clone_voice returned:")
-        print(f"  Audio output type: {type(audio_output)}")
-        print(f"  Status: {status}")
-        # Cleanup
         if temp_audio_path and temp_audio_path != reference_audio_url:
             try:
                 os.unlink(temp_audio_path)
-                print(f"Cleaned up temporary file: {temp_audio_path}")
-            except Exception as cleanup_error:
-                print(f"Cleanup error: {cleanup_error}")
         return audio_output, status
     except Exception as e:
-        print(f"=== CRITICAL ERROR ===")
-        print(f"Error type: {type(e)}")
-        print(f"Error message: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        # Cleanup on error
         if temp_audio_path and temp_audio_path != reference_audio_url:
             try:
                 os.unlink(temp_audio_path)
             except:
                 pass
         return None, f"API Error: {str(e)}"
 def main():
     print("Starting Advanced Gradio interface...")
-    with gr.Blocks(title="Advanced Chatterbox Voice Cloning", theme=gr.themes.Soft()) as iface:
         gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
         gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
         with gr.Row():
             with gr.Column(scale=2):
                 text_input = gr.Textbox(
                     label="Text to Speak",
                     placeholder="Enter the text you want the cloned voice to say...",
@@ -357,9 +296,10 @@ def main():
                     label="Reference Audio (Upload a short .wav or .mp3 clip)",
                     sources=["upload", "microphone"]
                 )
                 with gr.Accordion("🔧 Advanced Settings", open=False):
                     with gr.Row():
-                        exaggeration = gr.Slider(
                             minimum=0.25,
                             maximum=1.0,
                             value=0.6,
@@ -367,7 +307,7 @@ def main():
                             label="Exaggeration",
                             info="Controls voice characteristic emphasis"
                         )
-                        cfg_pace = gr.Slider(
                             minimum=0.2,
                             maximum=1.0,
                             value=0.3,
@@ -376,13 +316,13 @@ def main():
                             info="Classifier-free guidance weight"
                         )
                     with gr.Row():
-                        random_seed = gr.Number(
                             value=0,
                             label="Random Seed",
                             info="Set to 0 for random results",
                             precision=0
                         )
-                        temperature = gr.Slider(
                             minimum=0.05,
                             maximum=2.0,
                             value=0.6,
@@ -390,28 +330,14 @@ def main():
                             label="Temperature",
                             info="Controls randomness in generation"
                         )
                 generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
             with gr.Column(scale=1):
-                audio_output = gr.Audio(
-                    label="Generated Audio",
-                    type="numpy",
-                    interactive=False
-                )
-                status_output = gr.Textbox(
-                    label="Status",
-                    interactive=False,
-                    lines=2
-                )
-        # This is the key part - create the API endpoint properly
-        generate_btn.click(
-            fn=clone_voice_api,  # Use the API-ready function
-            inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
-            outputs=[audio_output, status_output],
-            api_name="predict"  # This creates /api/predict endpoint
-        )
         with gr.Accordion("📝 Examples", open=False):
             gr.Examples(
                 examples=[
@@ -419,14 +345,43 @@ def main():
                     ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
                     ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
                 ],
-                inputs=[text_input, audio_input, exaggeration, cfg_pace, random_seed, temperature],
-                outputs=[audio_output, status_output],
-                fn=clone_voice_api,
-                cache_examples=False
             )
-    # Launch the interface
-    iface.launch(
         server_name="0.0.0.0",
         server_port=7860,
         show_error=True,

         return None, "Error: Please upload a reference audio file (.wav or .mp3)."
     try:
+        print(f"Received request:")
         print(f"  Text: '{text_to_speak}'")
+        print(f"  Audio: '{reference_audio_path}'")
         print(f"  Exaggeration: {exaggeration}")
         print(f"  CFG/Pace: {cfg_pace}")
         print(f"  Random Seed: {random_seed}")
         except:
             sample_rate = 24000
+        print(f"Audio generated successfully. Output data type: {type(output_wav_data)}, Sample rate: {sample_rate}")
         if isinstance(output_wav_data, str):
             return output_wav_data, "Success: Audio generated successfully!"
             return (sample_rate, output_wav_data), "Success: Audio generated successfully!"
     except Exception as e:
+        print(f"ERROR: Failed during audio generation: {e}")
+        print("Detailed error trace for audio generation:")
         traceback.print_exc()
         return None, f"Error during audio generation: {str(e)}. Check logs for more details."
 def clone_voice_api(text_to_speak, reference_audio_url, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
     import requests
     import tempfile
     temp_audio_path = None
     try:
         if reference_audio_url.startswith('data:audio'):
+            header, encoded = reference_audio_url.split(',', 1)
+            audio_data = base64.b64decode(encoded)
+            if 'mp3' in header:
+                ext = '.mp3'
+            elif 'wav' in header:
+                ext = '.wav'
+            else:
+                ext = '.wav'
+            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
+                temp_file.write(audio_data)
+                temp_audio_path = temp_file.name
         elif reference_audio_url.startswith('http'):
+            response = requests.get(reference_audio_url)
+            response.raise_for_status()
+            if reference_audio_url.endswith('.mp3'):
+                ext = '.mp3'
+            elif reference_audio_url.endswith('.wav'):
+                ext = '.wav'
+            else:
+                ext = '.wav'
+            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
+                temp_file.write(response.content)
+                temp_audio_path = temp_file.name
         else:
             temp_audio_path = reference_audio_url
         audio_output, status = clone_voice(text_to_speak, temp_audio_path, exaggeration, cfg_pace, random_seed, temperature)
         if temp_audio_path and temp_audio_path != reference_audio_url:
             try:
                 os.unlink(temp_audio_path)
+            except:
+                pass
         return audio_output, status
     except Exception as e:
         if temp_audio_path and temp_audio_path != reference_audio_url:
             try:
                 os.unlink(temp_audio_path)
             except:
                 pass
         return None, f"API Error: {str(e)}"
 def main():
     print("Starting Advanced Gradio interface...")
+    # Create a Blocks interface with multiple functions
+    with gr.Blocks(title="🎙️ Advanced Chatterbox Voice Cloning") as demo:
         gr.Markdown("# 🎙️ Advanced Chatterbox Voice Cloning")
         gr.Markdown("Clone any voice using advanced AI technology with fine-tuned controls.")
         with gr.Row():
             with gr.Column(scale=2):
+                # Main interface inputs
                 text_input = gr.Textbox(
                     label="Text to Speak",
                     placeholder="Enter the text you want the cloned voice to say...",
                     label="Reference Audio (Upload a short .wav or .mp3 clip)",
                     sources=["upload", "microphone"]
                 )
                 with gr.Accordion("🔧 Advanced Settings", open=False):
                     with gr.Row():
+                        exaggeration_input = gr.Slider(
                             minimum=0.25,
                             maximum=1.0,
                             value=0.6,
                             label="Exaggeration",
                             info="Controls voice characteristic emphasis"
                         )
+                        cfg_pace_input = gr.Slider(
                             minimum=0.2,
                             maximum=1.0,
                             value=0.3,
                             info="Classifier-free guidance weight"
                         )
                     with gr.Row():
+                        seed_input = gr.Number(
                             value=0,
                             label="Random Seed",
                             info="Set to 0 for random results",
                             precision=0
                         )
+                        temperature_input = gr.Slider(
                             minimum=0.05,
                             maximum=2.0,
                             value=0.6,
                             label="Temperature",
                             info="Controls randomness in generation"
                         )
                 generate_btn = gr.Button("🎵 Generate Voice Clone", variant="primary", size="lg")
             with gr.Column(scale=1):
+                # Outputs
+                audio_output = gr.Audio(label="Generated Audio", type="numpy")
+                status_output = gr.Textbox(label="Status", lines=2)
         with gr.Accordion("📝 Examples", open=False):
             gr.Examples(
                 examples=[
                     ["The quick brown fox jumps over the lazy dog.", None, 0.7, 0.3, 42, 0.6],
                     ["Welcome to our AI voice cloning service. We hope you enjoy the experience!", None, 0.4, 0.7, 123, 1.0]
                 ],
+                inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input]
             )
+        # Main interface function (for file uploads)
+        generate_btn.click(
+            fn=clone_voice_api,
+            inputs=[text_input, audio_input, exaggeration_input, cfg_pace_input, seed_input, temperature_input],
+            outputs=[audio_output, status_output],
+            api_name="predict"
+        )
+        # API function for base64 data (for external API calls)
+        def clone_voice_base64_api(text_to_speak, reference_audio_b64, exaggeration=0.6, cfg_pace=0.3, random_seed=0, temperature=0.6):
+            """API function that accepts base64 audio data directly."""
+            return clone_voice_api(text_to_speak, reference_audio_b64, exaggeration, cfg_pace, random_seed, temperature)
+        # Hidden inputs/outputs for the base64 API
+        with gr.Row(visible=False):
+            api_text_input = gr.Textbox()
+            api_audio_input = gr.Textbox()  # This will receive base64 data URL
+            api_exaggeration_input = gr.Slider(minimum=0.25, maximum=1.0, value=0.6)
+            api_cfg_pace_input = gr.Slider(minimum=0.2, maximum=1.0, value=0.3)
+            api_seed_input = gr.Number(value=0, precision=0)
+            api_temperature_input = gr.Slider(minimum=0.05, maximum=2.0, value=0.6)
+            api_audio_output = gr.Audio(type="numpy")
+            api_status_output = gr.Textbox()
+            api_btn = gr.Button()
+        # API endpoint for base64 data
+        api_btn.click(
+            fn=clone_voice_base64_api,
+            inputs=[api_text_input, api_audio_input, api_exaggeration_input, api_cfg_pace_input, api_seed_input, api_temperature_input],
+            outputs=[api_audio_output, api_status_output],
+            api_name="clone_voice"
+        )
+    demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         show_error=True,