# DeepBoner/src/prompts/judge.py
"""Judge prompts for evidence assessment."""
from src.config.domain import ResearchDomain, get_domain_config
from src.utils.models import Evidence
def get_system_prompt(domain: ResearchDomain | str | None = None) -> str:
"""Get the system prompt for the judge agent."""
config = get_domain_config(domain)
return f"""{config.judge_system_prompt}
Your task is to SCORE evidence from biomedical literature. You do NOT decide whether to
continue searching or synthesize - that decision is made by the orchestration system
based on your scores.
## Your Role: Scoring Only
You provide objective scores. The system decides next steps based on explicit thresholds.
This separation prevents bias in the decision-making process.
## Scoring Criteria
1. **Mechanism Score (0-10)**: How well does the evidence explain the biological mechanism?
- 0-3: No clear mechanism, speculative
- 4-6: Some mechanistic insight, but gaps exist
- 7-10: Clear, well-supported mechanism of action
2. **Clinical Evidence Score (0-10)**: Strength of clinical/preclinical support?
- 0-3: No clinical data, only theoretical
- 4-6: Preclinical or early clinical data
- 7-10: Strong clinical evidence (trials, meta-analyses)
3. **Drug Candidates**: List SPECIFIC drug names mentioned in the evidence
- Only include drugs explicitly mentioned
- Do NOT hallucinate or infer drug names
- Include drug class if specific names aren't available (e.g., "SSRI antidepressants")
4. **Key Findings**: Extract 3-5 key findings from the evidence
- Focus on findings relevant to the research question
- Include mechanism insights and clinical outcomes
5. **Confidence (0.0-1.0)**: Your confidence in the scores
- Based on evidence quality and relevance
- Lower if evidence is tangential or low-quality
## Output Format
Return valid JSON with these fields:
- details.mechanism_score (int 0-10)
- details.mechanism_reasoning (string)
- details.clinical_evidence_score (int 0-10)
- details.clinical_reasoning (string)
- details.drug_candidates (list of strings)
- details.key_findings (list of strings)
- sufficient (boolean) - TRUE if scores suggest enough evidence
- confidence (float 0-1)
- recommendation ("continue" or "synthesize") - Your suggestion (system may override)
- next_search_queries (list) - If continuing, suggest FOCUSED queries
- reasoning (string)
## CRITICAL: Search Query Rules
When suggesting next_search_queries:
- STAY FOCUSED on the original research question
- Do NOT drift to tangential topics
- If question is about "female libido", do NOT suggest "bone health" or "muscle mass"
- Refine existing terms, don't explore random medical associations
"""
def get_scoring_prompt(domain: ResearchDomain | str | None = None) -> str:
"""Get the scoring instructions for the judge."""
config = get_domain_config(domain)
return config.judge_scoring_prompt
# Keep SYSTEM_PROMPT for backwards compatibility
SYSTEM_PROMPT = get_system_prompt()
MAX_EVIDENCE_FOR_JUDGE = 30  # Cap evidence items so the judge prompt stays within model token limits
async def select_evidence_for_judge(
evidence: list[Evidence],
query: str,
max_items: int = MAX_EVIDENCE_FOR_JUDGE,
) -> list[Evidence]:
"""
Select diverse, relevant evidence for judge evaluation.
Implements RAG best practices:
- Diversity selection over recency-only
- Lost-in-the-middle mitigation
- Relevance re-ranking
"""
if len(evidence) <= max_items:
return evidence
try:
from src.utils.text_utils import select_diverse_evidence
# Use embedding-based diversity selection
return await select_diverse_evidence(evidence, n=max_items, query=query)
except ImportError:
        # Fallback: keep a mix of early and recent items (lost-in-the-middle mitigation)
early = evidence[: max_items // 3] # First third
recent = evidence[-(max_items * 2 // 3) :] # Last two-thirds
return early + recent
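
# Worked example of the ImportError fallback above (illustrative): with the
# default max_items=30 and 100 collected Evidence items, `early` keeps items
# 1-10 and `recent` keeps items 81-100, so the judge still sees the earliest
# and latest evidence and only the middle of the list is dropped.
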
def format_user_prompt(
question: str,
evidence: list[Evidence],
iteration: int = 0,
max_iterations: int = 10,
total_evidence_count: int | None = None,
domain: ResearchDomain | str | None = None,
) -> str:
"""
Format user prompt with selected evidence and iteration context.
NOTE: Evidence should be pre-selected using select_evidence_for_judge().
This function assumes evidence is already capped.
"""
    total_count = total_evidence_count if total_evidence_count is not None else len(evidence)
max_content_len = 1500
scoring_prompt = get_scoring_prompt(domain)
def format_single_evidence(i: int, e: Evidence) -> str:
content = e.content
if len(content) > max_content_len:
content = content[:max_content_len] + "..."
return (
f"### Evidence {i + 1}\n"
f"**Source**: {e.citation.source.upper()} - {e.citation.title}\n"
f"**URL**: {e.citation.url}\n"
f"**Content**:\n{content}"
)
evidence_text = "\n\n".join([format_single_evidence(i, e) for i, e in enumerate(evidence)])
# Lost-in-the-middle mitigation: put critical context at START and END
return f"""## Research Question (IMPORTANT - stay focused on this)
{question}
## Search Progress
- **Iteration**: {iteration}/{max_iterations}
- **Total evidence collected**: {total_count} sources
- **Evidence shown below**: {len(evidence)} diverse sources (selected for relevance)
## Available Evidence
{evidence_text}
## Your Task
{scoring_prompt}
DO NOT decide "synthesize" vs "continue" - that decision is made by the system.
## REMINDER: Original Question (stay focused)
{question}
"""
def format_empty_evidence_prompt(question: str) -> str:
"""
Format prompt when no evidence was found.
Args:
question: The user's research question
Returns:
Formatted prompt string
"""
return f"""## Research Question
{question}
## Available Evidence
No evidence was found from the search.
## Your Task
Since no evidence was found, recommend search queries that might yield better results.
Set sufficient=False and recommendation="continue".
Suggest 3-5 specific search queries.
"""