File size: 2,514 Bytes
e80214b
53ed5ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e80214b
 
53ed5ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e80214b
53ed5ad
 
 
 
 
e80214b
53ed5ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import gradio as gr
from transformers import pipeline
from TTS.api import TTS

# -----------------------------
# 1. Load Speech-to-Text (Whisper)
# -----------------------------

# Whisper processes at most 30 seconds of audio per forward pass.
# chunk_length_s=30 lets the pipeline split longer recordings into
# 30-second windows so that they are transcribed in full.
stt = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device="cpu",
    chunk_length_s=30,
)

# -----------------------------
# 2. Load Translation (M2M100)
# -----------------------------

# Translation pipeline backed by the facebook/m2m100_418M checkpoint.
translator = pipeline(task="translation", model="facebook/m2m100_418M")

# Display name -> language code. The names populate the target-language
# dropdown; the codes are handed to the translator and the TTS model.
# NOTE(review): confirm every code below is accepted by BOTH the M2M100
# tokenizer and XTTS v2 (e.g. "rw", "sw" and "zh" look suspect) - TODO verify.
languages = dict(
    (
        ("English", "en"),
        ("French", "fr"),
        ("Kinyarwanda", "rw"),
        ("Swahili", "sw"),
        ("German", "de"),
        ("Spanish", "es"),
        ("Portuguese", "pt"),
        ("Italian", "it"),
        ("Chinese (Mandarin)", "zh"),
        ("Japanese", "ja"),
        ("Korean", "ko"),
        ("Arabic", "ar"),
        ("Russian", "ru"),
    )
)

# -----------------------------
# 3. Load Text-to-Speech (XTTSv2)
# -----------------------------

# Multilingual XTTS v2 voice model, loaded once at import time and reused
# for every request.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")


# -----------------------------
# MAIN FUNCTION
# -----------------------------

# XTTS v2 names Mandarin "zh-cn", whereas the translation model uses "zh".
_TTS_LANGUAGE_OVERRIDES = {"zh": "zh-cn"}

# Language codes listed as supported on the XTTS v2 model card.
_XTTS_LANGUAGES = frozenset(
    {
        "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
        "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi",
    }
)


def process_audio(audio, target_lang, source_lang="English"):
    """Transcribe speech, translate it, and speak the translation.

    Args:
        audio: Path to the recorded or uploaded audio file.
        target_lang: Display name of the language to translate into; must be
            a key of ``languages``.
        source_lang: Display name of the language being spoken; must be a key
            of ``languages``. It is only used as the translator's source
            language -- the speech recognizer is still called without a
            language hint.

    Returns:
        A ``(transcript, translation, audio_path)`` tuple. ``audio_path`` is
        the path of a freshly written WAV file, or ``None`` when the target
        language is not one XTTS v2 can speak (the text results are still
        returned in that case).

    Raises:
        gr.Error: If no audio or language was supplied, if a language is not
            supported by the translation model, or if no speech was
            recognized.
    """
    import tempfile

    if not audio:
        raise gr.Error("Please record or upload some audio first.")
    if target_lang not in languages or source_lang not in languages:
        raise gr.Error("Please choose a supported language.")

    source_code = languages[source_lang]
    target_code = languages[target_lang]

    # Reject languages the translator does not know *before* running the
    # (slow) speech recognizer, and with a readable message instead of a
    # bare KeyError from the tokenizer.
    for name, code in ((source_lang, source_code), (target_lang, target_code)):
        try:
            translator.tokenizer.get_lang_id(code)
        except KeyError:
            raise gr.Error(
                f"{name} is not supported by the translation model."
            ) from None

    # Step 1: STT
    text = stt(audio)["text"]
    if not text.strip():
        raise gr.Error("No speech could be recognized in the audio.")

    # Step 2: Translate. The multilingual tokenizer needs both the source
    # and the target language; it derives the forced BOS token from tgt_lang.
    translation = translator(
        text,
        src_lang=source_code,
        tgt_lang=target_code,
    )[0]["translation_text"]

    # Step 3: TTS
    tts_code = _TTS_LANGUAGE_OVERRIDES.get(target_code, target_code)
    if tts_code not in _XTTS_LANGUAGES:
        # The voice model cannot speak this language; keep the text results.
        return text, translation, None

    # A unique file per call so concurrent requests do not overwrite each
    # other's output.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_audio_path = handle.name

    tts.tts_to_file(
        text=translation,
        file_path=output_audio_path,
        # XTTS needs a reference voice; reuse the speaker's own recording.
        speaker_wav=audio,
        language=tts_code,
    )

    return text, translation, output_audio_path


# -----------------------------
# 4. Gradio UI
# -----------------------------

# Build the page: audio + target language in, transcript + translation +
# synthesized speech out. The user-facing strings below had been corrupted by
# a wrong text encoding (mojibake) and are restored to the intended symbols.
with gr.Blocks(css="custom.css") as app:

    gr.Markdown("<h1>🌍 Multilingual Voice-to-Voice AI Translator</h1>")
    gr.Markdown("Record or upload audio → AI Converts speech → Translates → Speaks output voice.")

    with gr.Row():
        # type="filepath" makes the handler receive a path to the audio file.
        audio_input = gr.Audio(type="filepath", label="🎤 Upload or Record Audio")
        lang_input = gr.Dropdown(list(languages.keys()), label="🌐 Choose Target Language")

    with gr.Row():
        text_out = gr.Textbox(label="📝 Transcribed Text")
        translation_out = gr.Textbox(label="🌍 Translated Text")

    audio_out = gr.Audio(label="🔊 AI Generated Voice Output")

    submit = gr.Button("🚀 Translate & Convert")

    # The three outputs line up with the tuple returned by process_audio.
    submit.click(
        fn=process_audio,
        inputs=[audio_input, lang_input],
        outputs=[text_out, translation_out, audio_out]
    )

app.launch()