#!/usr/bin/env python3
"""
ElevenLabs MCP Server for Voice Capabilities
Provides text-to-speech tools (speech synthesis, audio summaries, voice listing
and pronunciation guides) for ALS Research Agent

This server enables voice accessibility features crucial for ALS patients
who may have limited mobility but retain cognitive function.
"""

from mcp.server.fastmcp import FastMCP
import httpx
import logging
import os
import base64
import json
from typing import Optional, Dict, Any
from pathlib import Path
import sys

# Add parent directory to path for shared imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from shared import config
from shared.http_client import get_http_client

# Configure logging. basicConfig() installs a stderr handler by default, which
# leaves stdout free for the stdio transport used when this file is run.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize MCP server; the @mcp.tool() decorators below attach tools to it
mcp = FastMCP("elevenlabs-voice")

# ElevenLabs API configuration
# The key is read once at import time. When it is unset, the tools that call
# the API return an error payload instead of sending a request.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1"

# Default voice settings optimized for clarity (important for ALS patients)
# The voice can be overridden through the ELEVENLABS_VOICE_ID environment variable.
DEFAULT_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")  # Rachel voice (clear and calm)
DEFAULT_MODEL = "eleven_turbo_v2_5"  # Turbo v2.5 - chosen for speed (author's note: ~40% faster than v2; not verified here)

# Voice settings for accessibility
# text_to_speech copies this dict for every request, so these act as defaults.
VOICE_SETTINGS = {
    "stability": 0.5,  # Balanced for speed and clarity (turbo model)
    "similarity_boost": 0.5,  # Balanced setting for faster processing
    "style": 0.0,  # Neutral style for clarity
    "use_speaker_boost": True  # Enhanced clarity
}


@mcp.tool()
async def text_to_speech(
    text: str,
    voice_id: Optional[str] = None,
    output_format: str = "mp3_44100_128",
    speed: float = 1.0
) -> str:
    """Convert text to speech optimized for ALS patients.

    Text longer than 2500 characters is truncated before it is sent
    (preferably at a sentence or paragraph boundary) to avoid API timeouts.

    Args:
        text: Text to convert to speech (research findings, paper summaries, etc.)
        voice_id: ElevenLabs voice ID (defaults to clear, calm voice)
        output_format: Audio format requested from the API
            (mp3_44100_128, mp3_44100_192, pcm_16000, etc.)
        speed: Speech rate, 1.0 being normal. Values are clamped to 0.7-1.2,
            the range ElevenLabs accepts for ``voice_settings.speed``.
            Rates below 1.0 also raise stability slightly for clarity.

    Returns:
        JSON string. On success it contains ``status``, ``audio_base64``,
        ``format``, ``duration_estimate`` (seconds), ``text_length``,
        ``voice_id`` and ``message``. On failure it contains ``status``
        set to "error" plus ``error`` and ``message``.
    """
    # Local import: the module header does not import urllib.parse
    from urllib.parse import quote, urlencode

    try:
        if not ELEVENLABS_API_KEY:
            return json.dumps({
                "status": "error",
                "error": "ELEVENLABS_API_KEY not configured",
                "message": "Please set your ElevenLabs API key in .env file"
            }, indent=2)

        # Reject empty input locally instead of spending an API call on it
        if not text or not text.strip():
            return json.dumps({
                "status": "error",
                "error": "Empty text",
                "message": "No text was provided to convert to speech"
            }, indent=2)

        # Limit text length to avoid ElevenLabs API timeouts
        # Testing shows 2500 chars is safe, 5000 chars times out
        max_length = 2500
        if len(text) > max_length:
            logger.warning(f"Text truncated from {len(text)} to {max_length} characters to avoid timeout")
            # Try to truncate at a sentence boundary
            truncated = text[:max_length]
            last_period = truncated.rfind('.')
            last_newline = truncated.rfind('\n')
            # Use the latest sentence/paragraph boundary
            boundary = max(last_period, last_newline)
            if boundary > max_length - 500:  # If there's a boundary in the last 500 chars
                text = truncated[:boundary + 1]
            else:
                text = truncated + "..."

        voice_id = voice_id or DEFAULT_VOICE_ID

        # Prepare the request. The voice id is percent-encoded because it comes
        # from the tool caller, and the requested format travels as a query
        # parameter so the returned audio matches the "format" field below.
        query = urlencode({"output_format": output_format})
        url = f"{ELEVENLABS_API_BASE}/text-to-speech/{quote(voice_id, safe='')}?{query}"

        headers = {
            "xi-api-key": ELEVENLABS_API_KEY,
            "Content-Type": "application/json"
        }

        # Keep the rate inside the range the API accepts
        effective_speed = min(1.2, max(0.7, float(speed)))

        # Adjust voice settings for speed
        adjusted_settings = VOICE_SETTINGS.copy()
        if effective_speed < 1.0:
            # Slower speech - increase stability for clarity
            adjusted_settings["stability"] = min(1.0, adjusted_settings["stability"] + 0.1)
        if effective_speed != 1.0:
            # Only sent when non-default so the default request is unchanged
            adjusted_settings["speed"] = effective_speed

        payload = {
            "text": text,
            "model_id": DEFAULT_MODEL,
            "voice_settings": adjusted_settings
        }

        logger.info(f"Converting text to speech: {len(text)} characters")

        # Set timeout based on text length (with 2500 char limit, 45s should be enough)
        timeout = 45.0
        logger.info(f"Using timeout of {timeout} seconds")

        # Use shared HTTP client for connection pooling
        client = get_http_client(timeout=timeout)
        response = await client.post(url, json=payload, headers=headers)
        response.raise_for_status()

        # Get the audio data
        audio_data = response.content

        # Encode to base64 for transmission
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')

        # Rough estimate: 150 words/min at normal speed, scaled by the rate.
        # Counts words (not characters) so the unit matches the rate.
        word_count = len(text.split())
        duration_estimate = round(word_count / 150 * 60 / effective_speed, 1)

        # Return structured response
        result = {
            "status": "success",
            "audio_base64": audio_base64,
            "format": output_format,
            "duration_estimate": duration_estimate,
            "text_length": len(text),
            "voice_id": voice_id,
            "message": "Audio generated successfully. Use the audio_base64 field to play the audio."
        }

        logger.info(f"Successfully generated {len(audio_data)} bytes of audio")
        return json.dumps(result, indent=2)

    except httpx.HTTPStatusError as e:
        logger.error(f"ElevenLabs API error: {e}")
        if e.response.status_code == 401:
            return json.dumps({
                "status": "error",
                "error": "Authentication failed",
                "message": "Check your ELEVENLABS_API_KEY"
            }, indent=2)
        elif e.response.status_code == 429:
            return json.dumps({
                "status": "error",
                "error": "Rate limit exceeded",
                "message": "Please wait before trying again"
            }, indent=2)
        else:
            return json.dumps({
                "status": "error",
                "error": f"API error: {e.response.status_code}",
                "message": str(e)
            }, indent=2)

    except httpx.TimeoutException as e:
        # Timeout exceptions often stringify to "", so give an explicit message
        logger.error(f"ElevenLabs API timeout: {e!r}")
        return json.dumps({
            "status": "error",
            "error": "Request timed out",
            "message": "ElevenLabs did not respond in time. Try again or use shorter text"
        }, indent=2)

    except Exception as e:
        logger.error(f"Unexpected error in text_to_speech: {e}")
        return json.dumps({
            "status": "error",
            "error": "Text-to-speech error",
            "message": str(e)
        }, indent=2)


@mcp.tool()
async def create_audio_summary(
    content: str,
    summary_type: str = "research",
    max_duration: int = 60
) -> str:
    """Turn research content into a short spoken summary.

    The content is condensed with one of the text helpers, prefixed with a
    short spoken introduction, and passed to ``text_to_speech``.

    Args:
        content: Research content to summarize (paper abstract, findings, etc.)
        summary_type: "research", "clinical" or "patient-friendly"; any other
            value is handled like "research"
        max_duration: Target duration in seconds (affects summary length)

    Returns:
        JSON string with the summary text and its audio, or the error payload
        produced by ``text_to_speech`` / this function on failure
    """
    try:
        # Word budget at an assumed speaking rate of 150 words per minute
        word_budget = int((max_duration / 60) * 150)

        # Pick the condensing strategy; unknown types use the research summary
        if summary_type == "clinical":
            summary_text = _extract_clinical_relevance(content, word_budget)
        elif summary_type == "patient-friendly":
            summary_text = _simplify_medical_content(content, word_budget)
        else:
            summary_text = _create_research_summary(content, word_budget)

        # Speak the summary with a short lead-in, slightly slower than normal
        speech_json = await text_to_speech(
            text="Here's your audio research summary: " + summary_text,
            speed=0.95
        )
        speech = json.loads(speech_json)

        # Hand TTS failures straight back to the caller
        if speech.get("status") != "success":
            return speech_json

        audio = speech["audio_base64"]
        duration = speech["duration_estimate"]
        audio_format = speech["format"]

        return json.dumps({
            "status": "success",
            "audio_base64": audio,
            "text_summary": summary_text,
            "summary_type": summary_type,
            "word_count": len(summary_text.split()),
            "estimated_duration": duration,
            "format": audio_format,
            "message": f"Audio summary created: {summary_type} format, ~{int(duration)} seconds"
        }, indent=2)

    except Exception as e:
        logger.error(f"Error creating audio summary: {e}")
        failure = {
            "status": "error",
            "error": "Summary creation error",
            "message": str(e)
        }
        return json.dumps(failure, indent=2)


@mcp.tool()
async def list_voices() -> str:
    """List available voices optimized for medical/research content.

    Voices carrying a "clear", "professional" or "narration" label value are
    reported as recommended; the rest are reported separately.

    Returns:
        JSON string. On success it contains up to 5 ``recommended_voices``,
        up to 10 ``other_voices``, ``total_voices`` and ``message``. On
        failure it contains ``status`` set to "error" plus ``error`` and
        ``message``.
    """
    try:
        if not ELEVENLABS_API_KEY:
            return json.dumps({
                "status": "error",
                "error": "ELEVENLABS_API_KEY not configured",
                "message": "Please set your ElevenLabs API key in .env file"
            }, indent=2)

        url = f"{ELEVENLABS_API_BASE}/voices"
        headers = {"xi-api-key": ELEVENLABS_API_KEY}

        # Use shared HTTP client for connection pooling
        client = get_http_client(timeout=10.0)
        response = await client.get(url, headers=headers)
        response.raise_for_status()

        data = response.json()
        # "or []" also covers an explicit null in the payload
        voices = data.get("voices") or []

        preferred_labels = ("clear", "professional", "narration")

        def _is_recommended(voice: Dict[str, Any]) -> bool:
            # A voice may carry no labels at all (missing key or null)
            labels = voice.get("labels") or {}
            return any(value in preferred_labels for value in labels.values())

        # Prefer clear, professional voices
        recommended_voices = [
            {
                "voice_id": voice["voice_id"],
                "name": voice["name"],
                "preview_url": voice.get("preview_url"),
                "description": voice.get("description", ""),
                "recommended_for": "medical_content"
            }
            for voice in voices
            if _is_recommended(voice)
        ]

        # Build the id set once rather than rebuilding a list per voice
        recommended_ids = {entry["voice_id"] for entry in recommended_voices}

        # Add all other voices
        other_voices = [
            {
                "voice_id": voice["voice_id"],
                "name": voice["name"],
                "preview_url": voice.get("preview_url"),
                "description": voice.get("description", "")
            }
            for voice in voices
            if voice["voice_id"] not in recommended_ids
        ]

        result = {
            "status": "success",
            "recommended_voices": recommended_voices[:5],  # Top 5 recommended
            "other_voices": other_voices[:10],  # Limit for clarity
            "total_voices": len(voices),
            "message": "Recommended voices are optimized for clear medical terminology pronunciation"
        }

        return json.dumps(result, indent=2)

    except httpx.HTTPStatusError as e:
        # Same status-code mapping as text_to_speech
        logger.error(f"ElevenLabs API error: {e}")
        if e.response.status_code == 401:
            return json.dumps({
                "status": "error",
                "error": "Authentication failed",
                "message": "Check your ELEVENLABS_API_KEY"
            }, indent=2)
        elif e.response.status_code == 429:
            return json.dumps({
                "status": "error",
                "error": "Rate limit exceeded",
                "message": "Please wait before trying again"
            }, indent=2)
        else:
            return json.dumps({
                "status": "error",
                "error": f"API error: {e.response.status_code}",
                "message": str(e)
            }, indent=2)

    except Exception as e:
        logger.error(f"Error listing voices: {e}")
        return json.dumps({
            "status": "error",
            "error": "Failed to list voices",
            "message": str(e)
        }, indent=2)


@mcp.tool()
async def pronunciation_guide(
    medical_terms: list[str],
    include_audio: bool = True
) -> str:
    """Generate pronunciation guide for medical terms.

    Critical for ALS patients/caregivers learning about complex terminology.

    Args:
        medical_terms: List of medical terms to pronounce. Blank entries are
            ignored and only the first 10 remaining terms are processed.
        include_audio: Whether to include audio pronunciation

    Returns:
        JSON string with one entry per term containing ``term`` and
        ``phonetic``. With ``include_audio`` each entry also has
        ``audio_base64`` on success, or ``audio_error`` explaining why the
        audio is missing.
    """
    try:
        results = []

        # Drop blank/non-string entries so they are not sent to TTS
        terms = [
            term.strip()
            for term in (medical_terms or [])
            if isinstance(term, str) and term.strip()
        ]

        for term in terms[:10]:  # Limit to prevent long processing
            # Create phonetic breakdown
            phonetic = _get_phonetic_spelling(term)

            # Create pronunciation text
            pronunciation_text = f"{term}. {phonetic}. {term}."

            result_entry = {
                "term": term,
                "phonetic": phonetic
            }

            if include_audio:
                # Generate audio
                tts_result = await text_to_speech(
                    text=pronunciation_text,
                    speed=0.8  # Slower for clarity
                )

                tts_data = json.loads(tts_result)
                if tts_data.get("status") == "success":
                    result_entry["audio_base64"] = tts_data["audio_base64"]
                else:
                    # Surface the failure instead of silently omitting audio
                    result_entry["audio_error"] = (
                        tts_data.get("message")
                        or tts_data.get("error")
                        or "Audio generation failed"
                    )

            results.append(result_entry)

        return json.dumps({
            "status": "success",
            "pronunciations": results,
            "message": f"Generated pronunciation guide for {len(results)} terms"
        }, indent=2)

    except Exception as e:
        logger.error(f"Error creating pronunciation guide: {e}")
        return json.dumps({
            "status": "error",
            "error": "Pronunciation guide error",
            "message": str(e)
        }, indent=2)


# Helper functions for content processing

def _simplify_medical_content(content: str, target_words: int) -> str:
    """Simplify medical content for patient understanding.

    References are stripped, a fixed set of medical terms is replaced with
    plain-language equivalents (matched case-insensitively), and the result
    is truncated to ``target_words`` words. The rest of the text keeps its
    original casing so acronyms and proper nouns stay intact.

    Args:
        content: Text to simplify
        target_words: Maximum number of words to keep

    Returns:
        Simplified text starting with an uppercase letter; "..." is appended
        when the text was truncated
    """
    # This would ideally use NLP, but for now, basic simplification
    import re

    # First, strip references for cleaner audio
    content = _strip_references(content)

    # Common medical term replacements
    replacements = {
        "amyotrophic lateral sclerosis": "ALS or Lou Gehrig's disease",
        "motor neurons": "nerve cells that control muscles",
        "neurodegeneration": "nerve cell damage",
        "pathogenesis": "disease development",
        "etiology": "cause",
        "prognosis": "expected outcome",
        "therapeutic": "treatment",
        "pharmacological": "drug-based",
        "intervention": "treatment",
        "mortality": "death rate",
        "morbidity": "illness rate"
    }

    # One case-insensitive pass; longest terms first so multi-word terms win
    term_pattern = re.compile(
        "|".join(re.escape(term) for term in sorted(replacements, key=len, reverse=True)),
        flags=re.IGNORECASE
    )

    def _swap(match):
        found = match.group(0)
        plain = replacements[found.lower()]
        # Keep sentence-initial capitalization of the replaced term
        if found[:1].isupper():
            return plain[:1].upper() + plain[1:]
        return plain

    simplified = term_pattern.sub(_swap, content)

    # Truncate to target length
    words = simplified.split()
    if len(words) > target_words:
        words = words[:target_words]
        simplified = " ".join(words) + "..."

    # Uppercase only the first character; str.capitalize() would lowercase
    # everything else
    return simplified[:1].upper() + simplified[1:]


def _extract_clinical_relevance(content: str, target_words: int) -> str:
    """Keep only the sentences that mention clinically oriented keywords.

    The content is cleaned with ``_strip_references``, split on ". ", filtered
    by keyword, re-joined and finally cut to ``target_words`` words.
    """
    # Keywords signalling treatment, outcomes, and practical implications
    clinical_markers = (
        "treatment", "therapy", "outcome", "survival", "progression",
        "clinical trial", "efficacy", "safety", "adverse", "benefit",
        "patient", "dose", "administration"
    )

    # Strip references first for cleaner audio
    cleaned = _strip_references(content)

    # A sentence qualifies when any keyword occurs in it, ignoring case
    kept = [
        sentence
        for sentence in cleaned.split(". ")
        if any(marker in sentence.lower() for marker in clinical_markers)
    ]
    summary = ". ".join(kept)

    # Within budget: return unchanged
    tokens = summary.split()
    if len(tokens) <= target_words:
        return summary

    # Over budget: keep the leading words and mark the cut
    return " ".join(tokens[:target_words]) + "..."


def _create_research_summary(content: str, target_words: int) -> str:
    """Return the cleaned content cut down to at most ``target_words`` words.

    No real summarization happens here (could be enhanced with NLP): the text
    is passed through ``_strip_references`` and truncated, with "..." appended
    when words were dropped.
    """
    cleaned = _strip_references(content)

    tokens = cleaned.split()
    if len(tokens) <= target_words:
        # Already short enough
        return cleaned

    return " ".join(tokens[:target_words]) + "..."


def _strip_references(content: str) -> str:
    """Remove references section and citations from content for audio reading."""
    import re

    # Extract only synthesis content if it's marked
    synthesis_match = re.search(r'✅\s*SYNTHESIS:?\s*(.*?)(?=##?\s*References|##?\s*Bibliography|$)',
                               content, flags=re.DOTALL | re.IGNORECASE)
    if synthesis_match:
        content = synthesis_match.group(1)

    # Remove References section (multiple possible formats)
    patterns_to_remove = [
        r'##?\s*References.*$',  # ## References or # References to end
        r'##?\s*Bibliography.*$',  # Bibliography section
        r'##?\s*Citations.*$',  # Citations section
        r'##?\s*Works Cited.*$',  # Works Cited section
        r'##?\s*Key References.*$',  # Key References section
    ]

    for pattern in patterns_to_remove:
        content = re.sub(pattern, '', content, flags=re.DOTALL | re.IGNORECASE)

    # Remove inline citations like [1], [2,3], [PMID: 12345678]
    content = re.sub(r'\[[\d,\s]+\]', '', content)  # [1], [2,3], etc.
    content = re.sub(r'\[PMID:\s*\d+\]', '', content)  # [PMID: 12345678]
    content = re.sub(r'\[NCT\d+\]', '', content)  # [NCT12345678]

    # Remove URLs for cleaner audio
    content = re.sub(r'https?://[^\s\)]+', '', content)
    content = re.sub(r'www\.[^\s\)]+', '', content)

    # Remove PMID/DOI/NCT references
    content = re.sub(r'PMID:\s*\d+', '', content)
    content = re.sub(r'DOI:\s*[^\s]+', '', content)
    content = re.sub(r'NCT\d{8}', '', content)

    # Remove markdown formatting that sounds awkward in audio
    content = re.sub(r'\*\*(.*?)\*\*', r'\1', content)  # Remove bold
    content = re.sub(r'\*(.*?)\*', r'\1', content)  # Remove italic
    content = re.sub(r'`(.*?)`', r'\1', content)  # Remove inline code
    content = re.sub(r'#{1,6}\s*', '', content)  # Remove headers
    content = re.sub(r'^[-*+]\s+', '', content, flags=re.MULTILINE)  # Remove bullet points
    content = re.sub(r'^\d+\.\s+', '', content, flags=re.MULTILINE)  # Remove numbered lists

    # Replace markdown links with just the text
    content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)

    # Clean up extra whitespace
    content = re.sub(r'\s+', ' ', content)
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content.strip()


def _get_phonetic_spelling(term: str) -> str:
    """Generate phonetic spelling for medical terms."""
    # Basic phonetic rules for medical terms
    # This could be enhanced with a medical pronunciation dictionary

    phonetic_map = {
        "amyotrophic": "AM-ee-oh-TROH-fik",
        "lateral": "LAT-er-al",
        "sclerosis": "skleh-ROH-sis",
        "tdp-43": "T-D-P forty-three",
        "riluzole": "RIL-you-zole",
        "edaravone": "ed-AR-a-vone",
        "tofersen": "TOE-fer-sen",
        "neurofilament": "NUR-oh-FIL-a-ment",
        "astrocyte": "AS-tro-site",
        "oligodendrocyte": "oh-li-go-DEN-dro-site"
    }

    term_lower = term.lower()
    if term_lower in phonetic_map:
        return phonetic_map[term_lower]

    # Basic syllable breakdown for unknown terms
    # This is very simplified and could be improved
    syllables = []
    current = ""
    for char in term:
        if char in "aeiouAEIOU" and current:
            syllables.append(current + char)
            current = ""
        else:
            current += char
    if current:
        syllables.append(current)

    return "-".join(syllables).upper()


if __name__ == "__main__":
    # A missing key does not stop startup; it is only reported here
    if not ELEVENLABS_API_KEY:
        for notice in (
            "ELEVENLABS_API_KEY not set in environment",
            "Voice features will be limited without API key",
        ):
            logger.warning(notice)
        logger.info("Get your API key at: https://elevenlabs.io")

    # Start serving MCP requests over the stdio transport
    mcp.run(transport="stdio")