# moodsync/app.py
"""
MoodSync Therapy AI
-------------------
Production-grade Voice Emotion Detection for Therapy
Single dominant emotion with strong confidence
Prepared for Music-as-Therapy integration
Author: You
Use case: Mental health, therapy, wellness, funding programs
"""
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import librosa
import gradio as gr
from transformers import pipeline
# =====================================================
# MODEL CONFIGURATION
# =====================================================
MODEL_NAME = "superb/wav2vec2-base-superb-er"
TARGET_SAMPLE_RATE = 16000
# Confidence & decision thresholds (therapy-grade)
CHUNK_CONFIDENCE_THRESHOLD = 0.60
AGREEMENT_THRESHOLD = 0.65
FINAL_CONFIDENCE_THRESHOLD = 0.70
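# How the thresholds interact: each 1-second chunk must clear
# CHUNK_CONFIDENCE_THRESHOLD to be counted at all; the winning emotion then
# needs at least AGREEMENT_THRESHOLD of the chunk votes and an average score
# of at least FINAL_CONFIDENCE_THRESHOLD before a result is reported.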
# Core emotions only (higher accuracy, clinically safer)
ALLOWED_EMOTIONS = {"HAPPY", "SAD", "ANGRY", "NEUTRAL"}
# The SUPERB ER checkpoint reports abbreviated class names ("neu", "hap",
# "ang", "sad"); map them onto the full emotion names used throughout the app.
LABEL_MAP = {
    "NEU": "NEUTRAL",
    "HAP": "HAPPY",
    "ANG": "ANGRY",
    "SAD": "SAD"
}
EMOJI_MAP = {
    "HAPPY": "😊",
    "SAD": "😢",
    "ANGRY": "😠",
    "NEUTRAL": "😐"
}
# =====================================================
# LOAD MODEL
# =====================================================
print("Loading Therapy-Grade Emotion Model...")
emotion_pipe = pipeline(
task="audio-classification",
model=MODEL_NAME,
top_k=None
)
print("Model loaded successfully.")
# =====================================================
# AUDIO PROCESSING
# =====================================================
def preprocess_audio(audio):
    """
    Convert to mono, resample to 16 kHz, normalize (RMS)
    """
    sr, data = audio
    if data is None or len(data) == 0:
        return None
    # Gradio delivers integer PCM; resampling and inference need float samples.
    data = np.asarray(data, dtype=np.float32)
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    if sr != TARGET_SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    rms = np.sqrt(np.mean(data ** 2))
    if rms > 0:
        data = data / rms
    return data
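# With type="numpy", Gradio passes audio as a (sample_rate, samples) tuple,
# typically an int16 array such as (48000, np.ndarray of shape (240000,)) for
# a five-second mono recording (illustrative shape). preprocess_audio reduces
# that to a float32 mono signal at 16 kHz with unit RMS.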
def chunk_audio(audio, chunk_sec=1.0):
    """
    Split audio into non-overlapping 1-second chunks (trailing partial chunk is dropped)
    """
    chunk_size = int(TARGET_SAMPLE_RATE * chunk_sec)
    return [
        audio[i:i + chunk_size]
        for i in range(0, len(audio) - chunk_size + 1, chunk_size)
    ]
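# Example: a 5.5-second clip at 16 kHz (88,000 samples) yields five 1-second
# chunks of 16,000 samples each; the trailing 0.5 s is discarded.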
# =====================================================
# EMOTION INFERENCE
# =====================================================
def predict_chunks(chunks):
    """
    Predict emotion per chunk and keep only confident predictions
    """
    predictions = []
    for chunk in chunks:
        preds = emotion_pipe({
            "raw": chunk,
            "sampling_rate": TARGET_SAMPLE_RATE
        })
        top = max(preds, key=lambda x: x["score"])
        # Normalize model labels (e.g. "hap") to the app's emotion names.
        label = LABEL_MAP.get(top["label"].upper(), top["label"].upper())
        if label in ALLOWED_EMOTIONS and top["score"] >= CHUNK_CONFIDENCE_THRESHOLD:
            predictions.append({"label": label, "score": top["score"]})
    return predictions
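# After filtering, `predictions` looks like
# [{"label": "HAPPY", "score": 0.83}, {"label": "HAPPY", "score": 0.77}, ...]
# (illustrative values); low-confidence or out-of-scope chunks are dropped.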
def decide_final_emotion(predictions):
"""
Voting + confidence aggregation
"""
if len(predictions) < 2:
return None, 0.0, "Not enough confident speech segments."
votes = {}
confidence_sum = {}
for p in predictions:
label = p["label"].upper()
votes[label] = votes.get(label, 0) + 1
confidence_sum[label] = confidence_sum.get(label, 0) + p["score"]
total_votes = sum(votes.values())
best_emotion = max(votes, key=votes.get)
agreement = votes[best_emotion] / total_votes
avg_confidence = confidence_sum[best_emotion] / votes[best_emotion]
if agreement >= AGREEMENT_THRESHOLD and avg_confidence >= FINAL_CONFIDENCE_THRESHOLD:
return best_emotion, avg_confidence, "High confidence emotional state detected."
return None, 0.0, "Emotion detected but confidence not strong enough for therapy use."
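# Worked example: 8 confident chunks, 6 of them voting HAPPY with scores
# averaging 0.82 -> agreement = 6 / 8 = 0.75 >= 0.65 and avg_confidence =
# 0.82 >= 0.70, so ("HAPPY", 0.82, ...) is returned (illustrative numbers).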
# =====================================================
# PUBLIC ANALYSIS FUNCTION
# =====================================================
def analyze_voice(audio):
"""
End-to-end therapy-grade emotion analysis
"""
if audio is None:
return "🎀 No audio", "0%", "Please record or upload speech."
processed = preprocess_audio(audio)
if processed is None:
return "❌ Invalid audio", "0%", "Unreadable audio input."
chunks = chunk_audio(processed)
predictions = predict_chunks(chunks)
emotion, confidence, message = decide_final_emotion(predictions)
if emotion is None:
return "😐 UNCERTAIN", "Low confidence", message
display = f"{EMOJI_MAP.get(emotion)} {emotion}"
return display, f"{confidence * 100:.1f}%", message
# =====================================================
# MUSIC-AS-THERAPY (PLACEHOLDER)
# =====================================================
def therapy_music_placeholder(emotion):
    """
    Placeholder for the future music therapy engine
    """
    # The Detected Emotion textbox holds strings such as "😊 HAPPY";
    # keep only the trailing label before matching.
    label = emotion.split()[-1].upper() if emotion else ""
    if label in ("SAD", "ANGRY"):
        return "🎧 Music therapy will recommend calming, uplifting tracks."
    if label == "HAPPY":
        return "🎧 Music therapy will sustain and reinforce positive mood."
    if label == "NEUTRAL":
        return "🎧 Music therapy will gently elevate mood."
    return "🎧 Music therapy unavailable."
# =====================================================
# GRADIO UI
# =====================================================
with gr.Blocks(title="MoodSync Therapy AI") as demo:
gr.Markdown("# 🎀 MoodSync Therapy AI")
    gr.Markdown(
        "Clinical-grade voice emotion detection for therapy.\n\n"
        "• Detects **one dominant emotion**\n"
        "• High confidence only\n"
        "• Designed for Music-as-Therapy systems"
    )
with gr.Row():
with gr.Column(scale=6):
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="numpy",
label="Speak naturally for 3–10 seconds"
)
analyze_btn = gr.Button("Analyze Emotion", variant="primary")
with gr.Column(scale=4):
emotion_output = gr.Textbox(label="Detected Emotion", interactive=False)
confidence_output = gr.Textbox(label="Confidence", interactive=False)
explanation_output = gr.Textbox(label="Clinical Explanation", lines=3, interactive=False)
therapy_btn = gr.Button("Music-as-Therapy (Preview)")
therapy_output = gr.Textbox(label="Therapy Guidance", interactive=False)
analyze_btn.click(
fn=analyze_voice,
inputs=audio_input,
outputs=[emotion_output, confidence_output, explanation_output]
)
therapy_btn.click(
fn=therapy_music_placeholder,
inputs=emotion_output,
outputs=therapy_output
)
# =====================================================
# APP ENTRYPOINT
# =====================================================
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
debug=False
)