# DeepBoner/src/prompts/judge.py
"""Judge prompts for evidence assessment."""
from src.config.domain import ResearchDomain, get_domain_config
from src.utils.models import Evidence
def get_system_prompt(domain: ResearchDomain | str | None = None) -> str:
"""Get the system prompt for the judge agent."""
config = get_domain_config(domain)
return f"""{config.judge_system_prompt}
Your task is to SCORE evidence from biomedical literature. You do NOT decide whether to
continue searching or synthesize - that decision is made by the orchestration system
based on your scores.
## Your Role: Scoring Only
You provide objective scores. The system decides next steps based on explicit thresholds.
This separation prevents bias in the decision-making process.
## Scoring Criteria
1. **Mechanism Score (0-10)**: How well does the evidence explain the biological mechanism?
- 0-3: No clear mechanism, speculative
- 4-6: Some mechanistic insight, but gaps exist
- 7-10: Clear, well-supported mechanism of action
2. **Clinical Evidence Score (0-10)**: Strength of clinical/preclinical support?
- 0-3: No clinical data, only theoretical
- 4-6: Preclinical or early clinical data
- 7-10: Strong clinical evidence (trials, meta-analyses)
3. **Drug Candidates**: List SPECIFIC drug names mentioned in the evidence
- Only include drugs explicitly mentioned
- Do NOT hallucinate or infer drug names
- Include drug class if specific names aren't available (e.g., "SSRI antidepressants")
4. **Key Findings**: Extract 3-5 key findings from the evidence
- Focus on findings relevant to the research question
- Include mechanism insights and clinical outcomes
5. **Confidence (0.0-1.0)**: Your confidence in the scores
- Based on evidence quality and relevance
- Lower if evidence is tangential or low-quality
## Output Format
Return valid JSON with these fields:
- details.mechanism_score (int 0-10)
- details.mechanism_reasoning (string)
- details.clinical_evidence_score (int 0-10)
- details.clinical_reasoning (string)
- details.drug_candidates (list of strings)
- details.key_findings (list of strings)
- sufficient (boolean) - TRUE if scores suggest enough evidence
- confidence (float 0-1)
- recommendation ("continue" or "synthesize") - Your suggestion (system may override)
- next_search_queries (list) - If continuing, suggest FOCUSED queries
- reasoning (string)
## CRITICAL: Search Query Rules
When suggesting next_search_queries:
- STAY FOCUSED on the original research question
- Do NOT drift to tangential topics
- If question is about "female libido", do NOT suggest "bone health" or "muscle mass"
- Refine existing terms, don't explore random medical associations
"""
def get_scoring_prompt(domain: ResearchDomain | str | None = None) -> str:
"""Get the scoring instructions for the judge."""
config = get_domain_config(domain)
return config.judge_scoring_prompt
# Keep SYSTEM_PROMPT for backwards compatibility
SYSTEM_PROMPT = get_system_prompt()
MAX_EVIDENCE_FOR_JUDGE = 30  # Cap evidence items so the judge prompt stays within model token limits
async def select_evidence_for_judge(
evidence: list[Evidence],
query: str,
max_items: int = MAX_EVIDENCE_FOR_JUDGE,
) -> list[Evidence]:
"""
Select diverse, relevant evidence for judge evaluation.
Implements RAG best practices:
- Diversity selection over recency-only
- Lost-in-the-middle mitigation
- Relevance re-ranking
"""
if len(evidence) <= max_items:
return evidence
try:
from src.utils.text_utils import select_diverse_evidence
# Use embedding-based diversity selection
return await select_diverse_evidence(evidence, n=max_items, query=query)
except ImportError:
        # Fallback: keep a mix of early and recent items (lost-in-the-middle mitigation)
early = evidence[: max_items // 3] # First third
recent = evidence[-(max_items * 2 // 3) :] # Last two-thirds
return early + recent
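
# Worked example of the ImportError fallback above (illustrative): with the
# default max_items=30 and 100 collected Evidence items, `early` keeps items
# 1-10 and `recent` keeps items 81-100, so the judge still sees the earliest
# and latest evidence and only the middle of the list is dropped.
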
def format_user_prompt(
question: str,
evidence: list[Evidence],
iteration: int = 0,
max_iterations: int = 10,
total_evidence_count: int | None = None,
domain: ResearchDomain | str | None = None,
) -> str:
"""
Format user prompt with selected evidence and iteration context.
NOTE: Evidence should be pre-selected using select_evidence_for_judge().
This function assumes evidence is already capped.
"""
    total_count = total_evidence_count if total_evidence_count is not None else len(evidence)
max_content_len = 1500
scoring_prompt = get_scoring_prompt(domain)
def format_single_evidence(i: int, e: Evidence) -> str:
content = e.content
if len(content) > max_content_len:
content = content[:max_content_len] + "..."
return (
f"### Evidence {i + 1}\n"
f"**Source**: {e.citation.source.upper()} - {e.citation.title}\n"
f"**URL**: {e.citation.url}\n"
f"**Content**:\n{content}"
)
evidence_text = "\n\n".join([format_single_evidence(i, e) for i, e in enumerate(evidence)])
# Lost-in-the-middle mitigation: put critical context at START and END
return f"""## Research Question (IMPORTANT - stay focused on this)
{question}
## Search Progress
- **Iteration**: {iteration}/{max_iterations}
- **Total evidence collected**: {total_count} sources
- **Evidence shown below**: {len(evidence)} diverse sources (selected for relevance)
## Available Evidence
{evidence_text}
## Your Task
{scoring_prompt}
DO NOT decide "synthesize" vs "continue" - that decision is made by the system.
## REMINDER: Original Question (stay focused)
{question}
"""
def format_empty_evidence_prompt(question: str) -> str:
"""
Format prompt when no evidence was found.
Args:
question: The user's research question
Returns:
Formatted prompt string
"""
return f"""## Research Question
{question}
## Available Evidence
No evidence was found from the search.
## Your Task
Since no evidence was found, recommend search queries that might yield better results.
Set sufficient=False and recommendation="continue".
Suggest 3-5 specific search queries.
"""