| """ | |
| MoodSync Therapy AI | |
| ------------------- | |
| Production-grade Voice Emotion Detection for Therapy | |
| Single dominant emotion with strong confidence | |
| Prepared for Music-as-Therapy integration | |
| Author: You | |
| Use case: Mental health, therapy, wellness, funding programs | |
| """ | |
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import librosa
import gradio as gr
from transformers import pipeline
# =====================================================
# MODEL CONFIGURATION
# =====================================================
MODEL_NAME = "superb/wav2vec2-base-superb-er"
TARGET_SAMPLE_RATE = 16000

# Confidence & decision thresholds (therapy-grade)
CHUNK_CONFIDENCE_THRESHOLD = 0.60
AGREEMENT_THRESHOLD = 0.65
FINAL_CONFIDENCE_THRESHOLD = 0.70

# Core emotions only (higher accuracy, clinically safer)
ALLOWED_EMOTIONS = {"HAPPY", "SAD", "ANGRY", "NEUTRAL"}

# The SUPERB ER checkpoint emits abbreviated IEMOCAP labels ("neu", "hap",
# "ang", "sad"); map them onto the canonical names used throughout the app,
# otherwise the ALLOWED_EMOTIONS filter would reject every label except SAD.
LABEL_MAP = {"NEU": "NEUTRAL", "HAP": "HAPPY", "ANG": "ANGRY", "SAD": "SAD"}
EMOJI_MAP = {
    "HAPPY": "😊",
    "SAD": "😢",
    "ANGRY": "😠",
    "NEUTRAL": "😐"
}
# =====================================================
# LOAD MODEL
# =====================================================
print("Loading Therapy-Grade Emotion Model...")
emotion_pipe = pipeline(
    task="audio-classification",
    model=MODEL_NAME,
    top_k=None
)
print("Model loaded successfully.")
# =====================================================
# AUDIO PROCESSING
# =====================================================
def preprocess_audio(audio):
    """
    Convert to mono, resample to 16 kHz, normalize (RMS)
    """
    sr, data = audio
    if data is None or len(data) == 0:
        return None
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    # Gradio delivers integer PCM; librosa.resample requires float input,
    # so cast (and rescale integer samples to [-1, 1]) before resampling.
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)
    if sr != TARGET_SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    rms = np.sqrt(np.mean(data ** 2))
    if rms > 0:
        data = data / rms
    return data
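# Example (hypothetical input): preprocess_audio((44100, pcm)) where pcm is an
# int16 microphone array -> a mono, 16 kHz, RMS-normalized float32 array.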
def chunk_audio(audio, chunk_sec=1.0):
    """
    Split audio into consecutive 1-second chunks (partial trailing chunk dropped)
    """
    chunk_size = int(TARGET_SAMPLE_RATE * chunk_sec)
    # "+ 1" keeps the final full chunk: range() excludes its stop value, so
    # without it an input of exactly chunk_size samples yields no chunks at all.
    return [
        audio[i:i + chunk_size]
        for i in range(0, len(audio) - chunk_size + 1, chunk_size)
    ]
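# Example: 3.5 s at 16 kHz (56000 samples) -> chunks at offsets 0, 16000,
# 32000; the trailing 8000-sample remainder is discarded.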
# =====================================================
# EMOTION INFERENCE
# =====================================================
def predict_chunks(chunks):
    """
    Predict emotion per chunk and keep only confident predictions
    """
    predictions = []
    for chunk in chunks:
        preds = emotion_pipe({
            "raw": chunk,
            "sampling_rate": TARGET_SAMPLE_RATE
        })
        top = max(preds, key=lambda x: x["score"])
        # Normalize the checkpoint's abbreviated label before filtering.
        label = LABEL_MAP.get(top["label"].upper(), top["label"].upper())
        if label in ALLOWED_EMOTIONS and top["score"] >= CHUNK_CONFIDENCE_THRESHOLD:
            predictions.append({"label": label, "score": top["score"]})
    return predictions
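# With top_k=None the pipeline scores every class, e.g. (illustrative values):
# [{"label": "neu", "score": 0.62}, {"label": "hap", "score": 0.21}, ...]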
def decide_final_emotion(predictions):
    """
    Voting + confidence aggregation
    """
    if len(predictions) < 2:
        return None, 0.0, "Not enough confident speech segments."
    votes = {}
    confidence_sum = {}
    for p in predictions:
        label = p["label"]  # already normalized in predict_chunks
        votes[label] = votes.get(label, 0) + 1
        confidence_sum[label] = confidence_sum.get(label, 0) + p["score"]
    total_votes = sum(votes.values())
    best_emotion = max(votes, key=votes.get)
    agreement = votes[best_emotion] / total_votes
    avg_confidence = confidence_sum[best_emotion] / votes[best_emotion]
    if agreement >= AGREEMENT_THRESHOLD and avg_confidence >= FINAL_CONFIDENCE_THRESHOLD:
        return best_emotion, avg_confidence, "High confidence emotional state detected."
    return None, 0.0, "Emotion detected but confidence not strong enough for therapy use."
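# Worked example: 10 confident chunks with 7 voting HAPPY -> agreement 0.70
# (>= 0.65); if the mean HAPPY score is 0.78 (>= 0.70), the function returns
# ("HAPPY", 0.78, "High confidence emotional state detected.").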
# =====================================================
# PUBLIC ANALYSIS FUNCTION
# =====================================================
def analyze_voice(audio):
    """
    End-to-end therapy-grade emotion analysis
    """
    if audio is None:
        return "🎤 No audio", "0%", "Please record or upload speech."
    processed = preprocess_audio(audio)
    if processed is None:
        return "❌ Invalid audio", "0%", "Unreadable audio input."
    chunks = chunk_audio(processed)
    predictions = predict_chunks(chunks)
    emotion, confidence, message = decide_final_emotion(predictions)
    if emotion is None:
        return "🤔 UNCERTAIN", "Low confidence", message
    display = f"{EMOJI_MAP.get(emotion)} {emotion}"
    return display, f"{confidence * 100:.1f}%", message
# =====================================================
# MUSIC-AS-THERAPY (PLACEHOLDER)
# =====================================================
def therapy_music_placeholder(emotion_display):
    """
    Placeholder for the future music-therapy engine
    """
    # The emotion textbox holds a display string such as "😊 HAPPY", so
    # recover the bare label before matching (otherwise nothing matches).
    label = emotion_display.split()[-1] if emotion_display else ""
    if label in ["SAD", "ANGRY"]:
        return "🎧 Music therapy will recommend calming, uplifting tracks."
    if label == "HAPPY":
        return "🎧 Music therapy will sustain and reinforce positive mood."
    if label == "NEUTRAL":
        return "🎧 Music therapy will gently elevate mood."
    return "🎧 Music therapy unavailable."
# =====================================================
# GRADIO UI
# =====================================================
with gr.Blocks(title="MoodSync Therapy AI") as demo:
    gr.Markdown("# 🎤 MoodSync Therapy AI")
    gr.Markdown(
        "Clinical-grade voice emotion detection for therapy.\n\n"
        "• Detects **one dominant emotion**\n"
        "• High confidence only\n"
        "• Designed for Music-as-Therapy systems"
    )
    with gr.Row():
        with gr.Column(scale=6):
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Speak naturally for 3–10 seconds"
            )
            analyze_btn = gr.Button("Analyze Emotion", variant="primary")
        with gr.Column(scale=4):
            emotion_output = gr.Textbox(label="Detected Emotion", interactive=False)
            confidence_output = gr.Textbox(label="Confidence", interactive=False)
            explanation_output = gr.Textbox(label="Clinical Explanation", lines=3, interactive=False)
    therapy_btn = gr.Button("Music-as-Therapy (Preview)")
    therapy_output = gr.Textbox(label="Therapy Guidance", interactive=False)
    analyze_btn.click(
        fn=analyze_voice,
        inputs=audio_input,
        outputs=[emotion_output, confidence_output, explanation_output]
    )
    therapy_btn.click(
        fn=therapy_music_placeholder,
        inputs=emotion_output,
        outputs=therapy_output
    )
# =====================================================
# APP ENTRYPOINT
# =====================================================
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        debug=False
    )