# Source: ALSARA / servers / elevenlabs_server.py (upload "ALSARA app files (#1)", commit 3e435ad)
#!/usr/bin/env python3
"""
ElevenLabs MCP Server for Voice Capabilities
Provides text-to-speech and speech-to-text for ALS Research Agent
This server enables voice accessibility features crucial for ALS patients
who may have limited mobility but retain cognitive function.
"""
from mcp.server.fastmcp import FastMCP
import httpx
import logging
import os
import base64
import json
from typing import Optional, Dict, Any
from pathlib import Path
import sys
# Add parent directory to path for shared imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from shared import config
from shared.http_client import get_http_client
# Configure logging (stdout/stderr; MCP hosts typically capture stderr)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize MCP server under the name clients will see
mcp = FastMCP("elevenlabs-voice")

# ElevenLabs API configuration
# NOTE(review): the key is read once at import time, so the server must be
# restarted after changing the .env value.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1"

# Default voice settings optimized for clarity (important for ALS patients)
DEFAULT_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")  # Rachel voice (clear and calm)
DEFAULT_MODEL = "eleven_turbo_v2_5"  # Turbo v2.5 - Fastest model available (40% faster than v2)

# Voice settings for accessibility; sent as "voice_settings" with every TTS request
VOICE_SETTINGS = {
    "stability": 0.5,         # Balanced for speed and clarity (turbo model)
    "similarity_boost": 0.5,  # Balanced setting for faster processing
    "style": 0.0,             # Neutral style for clarity
    "use_speaker_boost": True  # Enhanced clarity
}
@mcp.tool()
async def text_to_speech(
    text: str,
    voice_id: Optional[str] = None,
    output_format: str = "mp3_44100_128",
    speed: float = 1.0
) -> str:
    """Convert text to speech optimized for ALS patients.

    Args:
        text: Text to convert to speech (research findings, paper summaries, etc.)
        voice_id: ElevenLabs voice ID (defaults to clear, calm voice)
        output_format: Audio format (mp3_44100_128, mp3_44100_192, pcm_16000, etc.)
        speed: Speech rate (0.5-2.0, default 1.0 - can be slower for clarity)

    Returns:
        JSON string with base64 encoded audio data and metadata.
    """
    try:
        if not ELEVENLABS_API_KEY:
            return json.dumps({
                "status": "error",
                "error": "ELEVENLABS_API_KEY not configured",
                "message": "Please set your ElevenLabs API key in .env file"
            }, indent=2)

        # Limit text length to avoid ElevenLabs API timeouts
        # Testing shows 2500 chars is safe, 5000 chars times out
        max_length = 2500
        if len(text) > max_length:
            logger.warning(f"Text truncated from {len(text)} to {max_length} characters to avoid timeout")
            # Try to truncate at a sentence boundary
            truncated = text[:max_length]
            last_period = truncated.rfind('.')
            last_newline = truncated.rfind('\n')
            # Use the latest sentence/paragraph boundary
            boundary = max(last_period, last_newline)
            if boundary > max_length - 500:  # If there's a boundary in the last 500 chars
                text = truncated[:boundary + 1]
            else:
                text = truncated + "..."

        voice_id = voice_id or DEFAULT_VOICE_ID

        # FIX: pass output_format to the API as a query parameter. Previously
        # the argument was accepted but never sent, so the API default format
        # was always returned regardless of what the caller asked for.
        url = f"{ELEVENLABS_API_BASE}/text-to-speech/{voice_id}?output_format={output_format}"
        headers = {
            "xi-api-key": ELEVENLABS_API_KEY,
            "Content-Type": "application/json"
        }

        # Adjust voice settings for speed
        adjusted_settings = VOICE_SETTINGS.copy()
        if speed < 1.0:
            # Slower speech - increase stability for clarity
            adjusted_settings["stability"] = min(1.0, adjusted_settings["stability"] + 0.1)

        payload = {
            "text": text,
            "model_id": DEFAULT_MODEL,
            "voice_settings": adjusted_settings
        }

        logger.info(f"Converting text to speech: {len(text)} characters")

        # Set timeout based on text length (with 2500 char limit, 45s should be enough)
        timeout = 45.0
        logger.info(f"Using timeout of {timeout} seconds")

        # Use shared HTTP client for connection pooling
        client = get_http_client(timeout=timeout)
        response = await client.post(url, json=payload, headers=headers)
        response.raise_for_status()

        # Get the audio data and encode to base64 for transmission
        audio_data = response.content
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')

        # FIX: estimate duration from word count. The old code divided the
        # *character* count by 150 words/min, inflating the estimate ~5x.
        word_count = len(text.split())

        # Return structured response
        result = {
            "status": "success",
            "audio_base64": audio_base64,
            "format": output_format,
            "duration_estimate": word_count / 150 * 60,  # Rough estimate: 150 words/min
            "text_length": len(text),
            "voice_id": voice_id,
            "message": "Audio generated successfully. Use the audio_base64 field to play the audio."
        }

        logger.info(f"Successfully generated {len(audio_data)} bytes of audio")
        return json.dumps(result, indent=2)

    except httpx.HTTPStatusError as e:
        logger.error(f"ElevenLabs API error: {e}")
        if e.response.status_code == 401:
            return json.dumps({
                "status": "error",
                "error": "Authentication failed",
                "message": "Check your ELEVENLABS_API_KEY"
            }, indent=2)
        elif e.response.status_code == 429:
            return json.dumps({
                "status": "error",
                "error": "Rate limit exceeded",
                "message": "Please wait before trying again"
            }, indent=2)
        else:
            return json.dumps({
                "status": "error",
                "error": f"API error: {e.response.status_code}",
                "message": str(e)
            }, indent=2)
    except Exception as e:
        logger.error(f"Unexpected error in text_to_speech: {e}")
        return json.dumps({
            "status": "error",
            "error": "Text-to-speech error",
            "message": str(e)
        }, indent=2)
@mcp.tool()
async def create_audio_summary(
    content: str,
    summary_type: str = "research",
    max_duration: int = 60
) -> str:
    """Produce a listenable audio summary of research content.

    The content is first reformatted for the requested audience and trimmed
    to roughly fit the target duration, then converted to speech.

    Args:
        content: Research content to summarize (paper abstract, findings, etc.)
        summary_type: One of "research", "clinical", or "patient-friendly"
        max_duration: Target duration in seconds (bounds the summary length)

    Returns:
        JSON string with both the text summary and base64-encoded audio.
    """
    try:
        # Budget words assuming ~150 spoken words per minute
        word_budget = int((max_duration / 60) * 150)

        # Reformat for the requested audience
        if summary_type == "patient-friendly":
            # Replace medical jargon for patients/families
            spoken_text = _simplify_medical_content(content, word_budget)
        elif summary_type == "clinical":
            # Keep only clinically relevant sentences
            spoken_text = _extract_clinical_relevance(content, word_budget)
        else:
            # Default: standard research summary
            spoken_text = _create_research_summary(content, word_budget)

        # Brief spoken intro so the listener has context
        final_text = "Here's your audio research summary: " + spoken_text

        # Slightly slower than normal speech for complex content
        tts_raw = await text_to_speech(text=final_text, speed=0.95)
        tts_data = json.loads(tts_raw)
        if tts_data.get("status") != "success":
            # Propagate the TTS error payload unchanged
            return tts_raw

        response = {
            "status": "success",
            "audio_base64": tts_data["audio_base64"],
            "text_summary": spoken_text,
            "summary_type": summary_type,
            "word_count": len(spoken_text.split()),
            "estimated_duration": tts_data["duration_estimate"],
            "format": tts_data["format"],
            "message": f"Audio summary created: {summary_type} format, ~{int(tts_data['duration_estimate'])} seconds"
        }
        return json.dumps(response, indent=2)

    except Exception as e:
        logger.error(f"Error creating audio summary: {e}")
        return json.dumps({
            "status": "error",
            "error": "Summary creation error",
            "message": str(e)
        }, indent=2)
@mcp.tool()
async def list_voices() -> str:
    """List available voices optimized for medical/research content.

    Returns voices suitable for clear pronunciation of medical terminology.

    Returns:
        JSON string with up to 5 recommended voices, up to 10 other voices,
        and the total voice count.
    """
    try:
        if not ELEVENLABS_API_KEY:
            return json.dumps({
                "status": "error",
                "error": "ELEVENLABS_API_KEY not configured",
                "message": "Please set your ElevenLabs API key in .env file"
            }, indent=2)

        url = f"{ELEVENLABS_API_BASE}/voices"
        headers = {"xi-api-key": ELEVENLABS_API_KEY}

        # Use shared HTTP client for connection pooling
        client = get_http_client(timeout=10.0)
        response = await client.get(url, headers=headers)
        response.raise_for_status()

        data = response.json()
        voices = data.get("voices", [])

        # FIX: classify each voice in a single pass. Previously a second loop
        # rebuilt a voice_id list comprehension per voice (O(n^2)) to find the
        # non-recommended ones.
        preferred_labels = {"clear", "professional", "narration"}
        recommended_voices = []
        other_voices = []
        for voice in voices:
            entry = {
                "voice_id": voice["voice_id"],
                "name": voice["name"],
                "preview_url": voice.get("preview_url"),
                "description": voice.get("description", "")
            }
            # Prefer clear, professional voices for medical content
            labels = voice.get("labels", {})
            if any(label in preferred_labels for label in labels.values()):
                entry["recommended_for"] = "medical_content"
                recommended_voices.append(entry)
            else:
                other_voices.append(entry)

        result = {
            "status": "success",
            "recommended_voices": recommended_voices[:5],  # Top 5 recommended
            "other_voices": other_voices[:10],  # Limit for clarity
            "total_voices": len(voices),
            "message": "Recommended voices are optimized for clear medical terminology pronunciation"
        }
        return json.dumps(result, indent=2)

    except Exception as e:
        logger.error(f"Error listing voices: {e}")
        return json.dumps({
            "status": "error",
            "error": "Failed to list voices",
            "message": str(e)
        }, indent=2)
@mcp.tool()
async def pronunciation_guide(
    medical_terms: list[str],
    include_audio: bool = True
) -> str:
    """Build a pronunciation guide for medical terminology.

    Helps ALS patients and caregivers learn complex terms encountered in
    research literature.

    Args:
        medical_terms: Terms to pronounce (only the first 10 are processed)
        include_audio: When True, attach spoken audio for each term

    Returns:
        JSON string with phonetic spellings and optional audio clips.
    """
    try:
        guide = []
        # Cap at 10 terms so a single call can't run arbitrarily long
        for term in medical_terms[:10]:
            phonetic = _get_phonetic_spelling(term)
            entry = {
                "term": term,
                "phonetic": phonetic
            }

            if include_audio:
                # Speak the term, its phonetic breakdown, then the term again,
                # at a reduced rate for clarity
                audio_json = await text_to_speech(
                    text=f"{term}. {phonetic}. {term}.",
                    speed=0.8
                )
                audio = json.loads(audio_json)
                if audio.get("status") == "success":
                    entry["audio_base64"] = audio["audio_base64"]

            guide.append(entry)

        return json.dumps({
            "status": "success",
            "pronunciations": guide,
            "message": f"Generated pronunciation guide for {len(guide)} terms"
        }, indent=2)

    except Exception as e:
        logger.error(f"Error creating pronunciation guide: {e}")
        return json.dumps({
            "status": "error",
            "error": "Pronunciation guide error",
            "message": str(e)
        }, indent=2)
# Helper functions for content processing
def _simplify_medical_content(content: str, target_words: int) -> str:
    """Simplify medical content for patient understanding.

    Strips references, replaces common medical jargon with plain-language
    phrases, and truncates to roughly target_words words.

    Args:
        content: Raw research text.
        target_words: Maximum number of words to keep.

    Returns:
        Simplified text with the first character uppercased.
    """
    # This would ideally use NLP, but for now, basic simplification
    # First, strip references for cleaner audio
    content = _strip_references(content)

    # Common medical term replacements
    replacements = {
        "amyotrophic lateral sclerosis": "ALS or Lou Gehrig's disease",
        "motor neurons": "nerve cells that control muscles",
        "neurodegeneration": "nerve cell damage",
        "pathogenesis": "disease development",
        "etiology": "cause",
        "prognosis": "expected outcome",
        "therapeutic": "treatment",
        "pharmacological": "drug-based",
        "intervention": "treatment",
        "mortality": "death rate",
        "morbidity": "illness rate"
    }

    # Lowercase so the replacement keys match regardless of original casing
    simplified = content.lower()
    for term, replacement in replacements.items():
        simplified = simplified.replace(term, replacement)

    # Truncate to target length
    words = simplified.split()
    if len(words) > target_words:
        simplified = " ".join(words[:target_words]) + "..."

    # FIX: str.capitalize() lowercased everything after the first character,
    # undoing replacements such as "ALS" and "Lou Gehrig's". Uppercase only
    # the first character instead (guarding against empty input).
    if simplified:
        simplified = simplified[0].upper() + simplified[1:]
    return simplified
def _extract_clinical_relevance(content: str, target_words: int) -> str:
    """Keep only the clinically relevant sentences from research content.

    A sentence is kept when it mentions treatment, outcomes, trials, or other
    practical clinical markers. The result is capped at target_words words.
    """
    # Strip references first for cleaner audio
    content = _strip_references(content)

    # Phrases that signal clinical (rather than basic-science) content
    clinical_markers = (
        "treatment", "therapy", "outcome", "survival", "progression",
        "clinical trial", "efficacy", "safety", "adverse", "benefit",
        "patient", "dose", "administration"
    )

    # Retain sentences containing at least one marker (case-insensitive)
    kept = [
        sentence
        for sentence in content.split(". ")
        if any(marker in sentence.lower() for marker in clinical_markers)
    ]
    result = ". ".join(kept)

    # Truncate to the word budget
    words = result.split()
    if len(words) > target_words:
        result = " ".join(words[:target_words]) + "..."
    return result
def _create_research_summary(content: str, target_words: int) -> str:
    """Produce a research-focused summary capped at target_words words.

    Currently strips references and truncates; could be enhanced with NLP.
    """
    cleaned = _strip_references(content)

    words = cleaned.split()
    if len(words) <= target_words:
        return cleaned
    # Over budget: keep the first target_words words and mark the cut
    return " ".join(words[:target_words]) + "..."
def _strip_references(content: str) -> str:
"""Remove references section and citations from content for audio reading."""
import re
# Extract only synthesis content if it's marked
synthesis_match = re.search(r'✅\s*SYNTHESIS:?\s*(.*?)(?=##?\s*References|##?\s*Bibliography|$)',
content, flags=re.DOTALL | re.IGNORECASE)
if synthesis_match:
content = synthesis_match.group(1)
# Remove References section (multiple possible formats)
patterns_to_remove = [
r'##?\s*References.*$', # ## References or # References to end
r'##?\s*Bibliography.*$', # Bibliography section
r'##?\s*Citations.*$', # Citations section
r'##?\s*Works Cited.*$', # Works Cited section
r'##?\s*Key References.*$', # Key References section
]
for pattern in patterns_to_remove:
content = re.sub(pattern, '', content, flags=re.DOTALL | re.IGNORECASE)
# Remove inline citations like [1], [2,3], [PMID: 12345678]
content = re.sub(r'\[[\d,\s]+\]', '', content) # [1], [2,3], etc.
content = re.sub(r'\[PMID:\s*\d+\]', '', content) # [PMID: 12345678]
content = re.sub(r'\[NCT\d+\]', '', content) # [NCT12345678]
# Remove URLs for cleaner audio
content = re.sub(r'https?://[^\s\)]+', '', content)
content = re.sub(r'www\.[^\s\)]+', '', content)
# Remove PMID/DOI/NCT references
content = re.sub(r'PMID:\s*\d+', '', content)
content = re.sub(r'DOI:\s*[^\s]+', '', content)
content = re.sub(r'NCT\d{8}', '', content)
# Remove markdown formatting that sounds awkward in audio
content = re.sub(r'\*\*(.*?)\*\*', r'\1', content) # Remove bold
content = re.sub(r'\*(.*?)\*', r'\1', content) # Remove italic
content = re.sub(r'`(.*?)`', r'\1', content) # Remove inline code
content = re.sub(r'#{1,6}\s*', '', content) # Remove headers
content = re.sub(r'^[-*+]\s+', '', content, flags=re.MULTILINE) # Remove bullet points
content = re.sub(r'^\d+\.\s+', '', content, flags=re.MULTILINE) # Remove numbered lists
# Replace markdown links with just the text
content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)
# Clean up extra whitespace
content = re.sub(r'\s+', ' ', content)
content = re.sub(r'\n{3,}', '\n\n', content)
return content.strip()
def _get_phonetic_spelling(term: str) -> str:
"""Generate phonetic spelling for medical terms."""
# Basic phonetic rules for medical terms
# This could be enhanced with a medical pronunciation dictionary
phonetic_map = {
"amyotrophic": "AM-ee-oh-TROH-fik",
"lateral": "LAT-er-al",
"sclerosis": "skleh-ROH-sis",
"tdp-43": "T-D-P forty-three",
"riluzole": "RIL-you-zole",
"edaravone": "ed-AR-a-vone",
"tofersen": "TOE-fer-sen",
"neurofilament": "NUR-oh-FIL-a-ment",
"astrocyte": "AS-tro-site",
"oligodendrocyte": "oh-li-go-DEN-dro-site"
}
term_lower = term.lower()
if term_lower in phonetic_map:
return phonetic_map[term_lower]
# Basic syllable breakdown for unknown terms
# This is very simplified and could be improved
syllables = []
current = ""
for char in term:
if char in "aeiouAEIOU" and current:
syllables.append(current + char)
current = ""
else:
current += char
if current:
syllables.append(current)
return "-".join(syllables).upper()
if __name__ == "__main__":
    # Warn (but don't exit) when the API key is missing: the server still
    # starts, and each tool returns a structured error explaining the problem.
    if not ELEVENLABS_API_KEY:
        logger.warning("ELEVENLABS_API_KEY not set in environment")
        logger.warning("Voice features will be limited without API key")
        logger.info("Get your API key at: https://elevenlabs.io")
    # Run the MCP server over stdio (standard transport for local MCP clients)
    mcp.run(transport="stdio")