""" MoodSync Therapy AI ------------------- Production-grade Voice Emotion Detection for Therapy Single dominant emotion with strong confidence Prepared for Music-as-Therapy integration Author: You Use case: Mental health, therapy, wellness, funding programs """ import warnings warnings.filterwarnings("ignore") import numpy as np import librosa import gradio as gr from transformers import pipeline # ===================================================== # MODEL CONFIGURATION # ===================================================== MODEL_NAME = "superb/wav2vec2-base-superb-er" TARGET_SAMPLE_RATE = 16000 # Confidence & decision thresholds (therapy-grade) CHUNK_CONFIDENCE_THRESHOLD = 0.60 AGREEMENT_THRESHOLD = 0.65 FINAL_CONFIDENCE_THRESHOLD = 0.70 # Core emotions only (higher accuracy, clinically safer) ALLOWED_EMOTIONS = {"HAPPY", "SAD", "ANGRY", "NEUTRAL"} EMOJI_MAP = { "HAPPY": "😊", "SAD": "😢", "ANGRY": "😠", "NEUTRAL": "😐" } # ===================================================== # LOAD MODEL # ===================================================== print("Loading Therapy-Grade Emotion Model...") emotion_pipe = pipeline( task="audio-classification", model=MODEL_NAME, top_k=None ) print("Model loaded successfully.") # ===================================================== # AUDIO PROCESSING # ===================================================== def preprocess_audio(audio): """ Convert to mono, resample, normalize (RMS) """ sr, data = audio if data is None or len(data) == 0: return None if len(data.shape) > 1: data = np.mean(data, axis=1) if sr != TARGET_SAMPLE_RATE: data = librosa.resample(data, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE) data = data.astype(np.float32) rms = np.sqrt(np.mean(data ** 2)) if rms > 0: data = data / rms return data def chunk_audio(audio, chunk_sec=1.0): """ Split audio into 1-second chunks """ chunk_size = int(TARGET_SAMPLE_RATE * chunk_sec) return [ audio[i:i + chunk_size] for i in range(0, len(audio) - chunk_size, chunk_size) ] # ===================================================== # EMOTION INFERENCE # ===================================================== def predict_chunks(chunks): """ Predict emotion per chunk and keep confident predictions """ predictions = [] for chunk in chunks: preds = emotion_pipe({ "raw": chunk, "sampling_rate": TARGET_SAMPLE_RATE }) top = max(preds, key=lambda x: x["score"]) label = top["label"].upper() if label in ALLOWED_EMOTIONS and top["score"] >= CHUNK_CONFIDENCE_THRESHOLD: predictions.append(top) return predictions def decide_final_emotion(predictions): """ Voting + confidence aggregation """ if len(predictions) < 2: return None, 0.0, "Not enough confident speech segments." votes = {} confidence_sum = {} for p in predictions: label = p["label"].upper() votes[label] = votes.get(label, 0) + 1 confidence_sum[label] = confidence_sum.get(label, 0) + p["score"] total_votes = sum(votes.values()) best_emotion = max(votes, key=votes.get) agreement = votes[best_emotion] / total_votes avg_confidence = confidence_sum[best_emotion] / votes[best_emotion] if agreement >= AGREEMENT_THRESHOLD and avg_confidence >= FINAL_CONFIDENCE_THRESHOLD: return best_emotion, avg_confidence, "High confidence emotional state detected." return None, 0.0, "Emotion detected but confidence not strong enough for therapy use." 
# =====================================================
# PUBLIC ANALYSIS FUNCTION
# =====================================================

def analyze_voice(audio):
    """
    End-to-end therapy-grade emotion analysis
    """
    if audio is None:
        return "🎤 No audio", "0%", "Please record or upload speech."

    processed = preprocess_audio(audio)
    if processed is None:
        return "❌ Invalid audio", "0%", "Unreadable audio input."

    chunks = chunk_audio(processed)
    predictions = predict_chunks(chunks)

    emotion, confidence, message = decide_final_emotion(predictions)

    if emotion is None:
        return "😐 UNCERTAIN", "Low confidence", message

    display = f"{EMOJI_MAP.get(emotion)} {emotion}"
    return display, f"{confidence * 100:.1f}%", message

# =====================================================
# MUSIC-AS-THERAPY (PLACEHOLDER)
# =====================================================

def therapy_music_placeholder(emotion):
    """
    Placeholder for future music therapy engine
    """
    # The Detected Emotion textbox holds the display string (e.g. "😊 HAPPY"),
    # so strip the emoji prefix before matching.
    parts = (emotion or "").split()
    emotion = parts[-1].upper() if parts else ""

    if emotion in ["SAD", "ANGRY"]:
        return "🎧 Music therapy will recommend calming, uplifting tracks."
    if emotion == "HAPPY":
        return "🎧 Music therapy will sustain and reinforce positive mood."
    if emotion == "NEUTRAL":
        return "🎧 Music therapy will gently elevate mood."
    return "🎧 Music therapy unavailable."

# =====================================================
# GRADIO UI
# =====================================================

with gr.Blocks(title="MoodSync Therapy AI") as demo:
    gr.Markdown("# 🎤 MoodSync Therapy AI")
    gr.Markdown(
        "Clinical-grade voice emotion detection for therapy.\n\n"
        "• Detects **one dominant emotion**\n"
        "• High confidence only\n"
        "• Designed for Music-as-Therapy systems"
    )

    with gr.Row():
        with gr.Column(scale=6):
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Speak naturally for 3–10 seconds"
            )
            analyze_btn = gr.Button("Analyze Emotion", variant="primary")

        with gr.Column(scale=4):
            emotion_output = gr.Textbox(label="Detected Emotion", interactive=False)
            confidence_output = gr.Textbox(label="Confidence", interactive=False)
            explanation_output = gr.Textbox(label="Clinical Explanation", lines=3, interactive=False)

    therapy_btn = gr.Button("Music-as-Therapy (Preview)")
    therapy_output = gr.Textbox(label="Therapy Guidance", interactive=False)

    analyze_btn.click(
        fn=analyze_voice,
        inputs=audio_input,
        outputs=[emotion_output, confidence_output, explanation_output]
    )

    therapy_btn.click(
        fn=therapy_music_placeholder,
        inputs=emotion_output,
        outputs=therapy_output
    )

# =====================================================
# APP ENTRYPOINT
# =====================================================

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        debug=False
    )
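
# -----------------------------------------------------
# OPTIONAL LOCAL SMOKE TEST (sketch)
# -----------------------------------------------------
# A minimal offline check of analyze_voice, assuming Gradio's "numpy"
# audio format of (sample_rate, int16 ndarray). The 220 Hz tone and its
# amplitude are arbitrary and carry no real emotion, so the prediction
# itself is meaningless; this only verifies that preprocessing, chunking
# and inference run end to end. Call _smoke_test() manually from a REPL.
def _smoke_test():
    duration_sec = 4
    t = np.linspace(0, duration_sec, TARGET_SAMPLE_RATE * duration_sec, endpoint=False)
    tone = (0.1 * np.sin(2 * np.pi * 220.0 * t) * 32767).astype(np.int16)
    print(analyze_voice((TARGET_SAMPLE_RATE, tone)))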