"""Judge prompts for evidence assessment."""

from src.config.domain import ResearchDomain, get_domain_config
from src.utils.models import Evidence


def get_system_prompt(domain: ResearchDomain | str | None = None) -> str:
    """Get the system prompt for the judge agent."""
    config = get_domain_config(domain)
    return f"""{config.judge_system_prompt}

Your task is to SCORE evidence from biomedical literature. You do NOT decide whether to
continue searching or synthesize - that decision is made by the orchestration system
based on your scores.

## Your Role: Scoring Only

You provide objective scores. The system decides next steps based on explicit thresholds.
This separation prevents bias in the decision-making process.

## Scoring Criteria

1. **Mechanism Score (0-10)**: How well does the evidence explain the biological mechanism?
   - 0-3: No clear mechanism, speculative
   - 4-6: Some mechanistic insight, but gaps exist
   - 7-10: Clear, well-supported mechanism of action

2. **Clinical Evidence Score (0-10)**: Strength of clinical/preclinical support?
   - 0-3: No clinical data, only theoretical
   - 4-6: Preclinical or early clinical data
   - 7-10: Strong clinical evidence (trials, meta-analyses)

3. **Drug Candidates**: List SPECIFIC drug names mentioned in the evidence
   - Only include drugs explicitly mentioned
   - Do NOT hallucinate or infer drug names
   - Include drug class if specific names aren't available (e.g., "SSRI antidepressants")

4. **Key Findings**: Extract 3-5 key findings from the evidence
   - Focus on findings relevant to the research question
   - Include mechanism insights and clinical outcomes

5. **Confidence (0.0-1.0)**: Your confidence in the scores
   - Based on evidence quality and relevance
   - Lower if evidence is tangential or low-quality

## Output Format

Return valid JSON with these fields:
- details.mechanism_score (int 0-10)
- details.mechanism_reasoning (string)
- details.clinical_evidence_score (int 0-10)
- details.clinical_reasoning (string)
- details.drug_candidates (list of strings)
- details.key_findings (list of strings)
- sufficient (boolean) - TRUE if scores suggest enough evidence
- confidence (float 0-1)
- recommendation ("continue" or "synthesize") - Your suggestion (system may override)
- next_search_queries (list) - If continuing, suggest FOCUSED queries
- reasoning (string)

## CRITICAL: Search Query Rules

When suggesting next_search_queries:
- STAY FOCUSED on the original research question
- Do NOT drift to tangential topics
- If question is about "female libido", do NOT suggest "bone health" or "muscle mass"
- Refine existing terms, don't explore random medical associations
"""


def get_scoring_prompt(domain: ResearchDomain | str | None = None) -> str:
    """Get the scoring instructions for the judge."""
    config = get_domain_config(domain)
    return config.judge_scoring_prompt


# Keep SYSTEM_PROMPT for backwards compatibility
SYSTEM_PROMPT = get_system_prompt()

MAX_EVIDENCE_FOR_JUDGE = 30  # Keep under token limits


async def select_evidence_for_judge(
    evidence: list[Evidence],
    query: str,
    max_items: int = MAX_EVIDENCE_FOR_JUDGE,
) -> list[Evidence]:
    """
    Select diverse, relevant evidence for judge evaluation.

    Implements RAG best practices:
    - Diversity selection over recency-only
    - Lost-in-the-middle mitigation
    - Relevance re-ranking
    """
    if len(evidence) <= max_items:
        return evidence

    try:
        from src.utils.text_utils import select_diverse_evidence

        # Use embedding-based diversity selection
        return await select_diverse_evidence(evidence, n=max_items, query=query)
    except ImportError:
        # Fallback: mix of recent + early (lost-in-the-middle mitigation)
        early = evidence[: max_items // 3]  # First third
        recent = evidence[-(max_items * 2 // 3) :]  # Last two-thirds
        return early + recent


def format_user_prompt(
    question: str,
    evidence: list[Evidence],
    iteration: int = 0,
    max_iterations: int = 10,
    total_evidence_count: int | None = None,
    domain: ResearchDomain | str | None = None,
) -> str:
    """
    Format user prompt with selected evidence and iteration context.

    NOTE: Evidence should be pre-selected using select_evidence_for_judge().
    This function assumes evidence is already capped.
    """
    total_count = total_evidence_count or len(evidence)
    max_content_len = 1500
    scoring_prompt = get_scoring_prompt(domain)

    def format_single_evidence(i: int, e: Evidence) -> str:
        content = e.content
        if len(content) > max_content_len:
            content = content[:max_content_len] + "..."
        return (
            f"### Evidence {i + 1}\n"
            f"**Source**: {e.citation.source.upper()} - {e.citation.title}\n"
            f"**URL**: {e.citation.url}\n"
            f"**Content**:\n{content}"
        )

    evidence_text = "\n\n".join([format_single_evidence(i, e) for i, e in enumerate(evidence)])

    # Lost-in-the-middle mitigation: put critical context at START and END
    return f"""## Research Question (IMPORTANT - stay focused on this)
{question}

## Search Progress
- **Iteration**: {iteration}/{max_iterations}
- **Total evidence collected**: {total_count} sources
- **Evidence shown below**: {len(evidence)} diverse sources (selected for relevance)

## Available Evidence

{evidence_text}

## Your Task

{scoring_prompt}
DO NOT decide "synthesize" vs "continue" - that decision is made by the system.

## REMINDER: Original Question (stay focused)
{question}
"""


def format_empty_evidence_prompt(question: str) -> str:
    """
    Format prompt when no evidence was found.

    Args:
        question: The user's research question

    Returns:
        Formatted prompt string
    """
    return f"""## Research Question
{question}

## Available Evidence

No evidence was found from the search.

## Your Task

Since no evidence was found, recommend search queries that might yield better results.
Set sufficient=False and recommendation=\"continue\".
Suggest 3-5 specific search queries.
"""