| """Judge prompts for evidence assessment.""" | |
| from src.config.domain import ResearchDomain, get_domain_config | |
| from src.utils.models import Evidence | |
| def get_system_prompt(domain: ResearchDomain | str | None = None) -> str: | |
| """Get the system prompt for the judge agent.""" | |
| config = get_domain_config(domain) | |
| return f"""{config.judge_system_prompt} | |

Your task is to SCORE evidence from biomedical literature. You do NOT decide whether to
continue searching or synthesize - that decision is made by the orchestration system
based on your scores.

## Your Role: Scoring Only

You provide objective scores. The system decides next steps based on explicit thresholds.
This separation prevents bias in the decision-making process.

## Scoring Criteria

1. **Mechanism Score (0-10)**: How well does the evidence explain the biological mechanism?
   - 0-3: No clear mechanism, speculative
   - 4-6: Some mechanistic insight, but gaps exist
   - 7-10: Clear, well-supported mechanism of action

2. **Clinical Evidence Score (0-10)**: How strong is the clinical/preclinical support?
   - 0-3: No clinical data, only theoretical
   - 4-6: Preclinical or early clinical data
   - 7-10: Strong clinical evidence (trials, meta-analyses)

3. **Drug Candidates**: List SPECIFIC drug names mentioned in the evidence
   - Only include drugs explicitly mentioned
   - Do NOT hallucinate or infer drug names
   - Include drug class if specific names aren't available (e.g., "SSRI antidepressants")

4. **Key Findings**: Extract 3-5 key findings from the evidence
   - Focus on findings relevant to the research question
   - Include mechanism insights and clinical outcomes

5. **Confidence (0.0-1.0)**: Your confidence in the scores
   - Based on evidence quality and relevance
   - Lower if evidence is tangential or low-quality

## Output Format

Return valid JSON with these fields:

- details.mechanism_score (int 0-10)
- details.mechanism_reasoning (string)
- details.clinical_evidence_score (int 0-10)
- details.clinical_reasoning (string)
- details.drug_candidates (list of strings)
- details.key_findings (list of strings)
- sufficient (boolean) - TRUE if scores suggest enough evidence
- confidence (float 0-1)
- recommendation ("continue" or "synthesize") - Your suggestion (system may override)
- next_search_queries (list) - If continuing, suggest FOCUSED queries
- reasoning (string)
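
Example of a valid response (illustrative values only):

```json
{{
  "details": {{
    "mechanism_score": 7,
    "mechanism_reasoning": "Two sources describe a consistent receptor-level mechanism.",
    "clinical_evidence_score": 5,
    "clinical_reasoning": "Early-phase trial data only; no meta-analyses found.",
    "drug_candidates": ["flibanserin", "bremelanotide"],
    "key_findings": ["Finding 1...", "Finding 2...", "Finding 3..."]
  }},
  "sufficient": false,
  "confidence": 0.7,
  "recommendation": "continue",
  "next_search_queries": ["flibanserin randomized controlled trial efficacy"],
  "reasoning": "Mechanism is well supported but clinical evidence remains thin."
}}
```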

## CRITICAL: Search Query Rules

When suggesting next_search_queries:

- STAY FOCUSED on the original research question
- Do NOT drift to tangential topics
- If question is about "female libido", do NOT suggest "bone health" or "muscle mass"
- Refine existing terms, don't explore random medical associations
"""


def get_scoring_prompt(domain: ResearchDomain | str | None = None) -> str:
    """Get the scoring instructions for the judge."""
    config = get_domain_config(domain)
    return config.judge_scoring_prompt


# Keep SYSTEM_PROMPT for backwards compatibility
SYSTEM_PROMPT = get_system_prompt()

MAX_EVIDENCE_FOR_JUDGE = 30  # Keep under token limits


async def select_evidence_for_judge(
    evidence: list[Evidence],
    query: str,
    max_items: int = MAX_EVIDENCE_FOR_JUDGE,
) -> list[Evidence]:
    """
    Select diverse, relevant evidence for judge evaluation.

    Implements RAG best practices:
    - Diversity selection over recency-only
    - Lost-in-the-middle mitigation
    - Relevance re-ranking
    """
    if len(evidence) <= max_items:
        return evidence

    try:
        from src.utils.text_utils import select_diverse_evidence

        # Use embedding-based diversity selection
        return await select_diverse_evidence(evidence, n=max_items, query=query)
    except ImportError:
        # Fallback: mix of recent + early (lost-in-the-middle mitigation)
        early = evidence[: max_items // 3]  # First third
        recent = evidence[-(max_items * 2 // 3) :]  # Last two-thirds
        return early + recent
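
# Worked example of the fallback split above (illustrative numbers): with the
# default max_items=30 and 100 collected items, `early` keeps items 0-9
# (30 // 3 = 10) and `recent` keeps items 80-99 (30 * 2 // 3 = 20), dropping
# the middle so the judge still sees both the earliest and latest sources.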


def format_user_prompt(
    question: str,
    evidence: list[Evidence],
    iteration: int = 0,
    max_iterations: int = 10,
    total_evidence_count: int | None = None,
    domain: ResearchDomain | str | None = None,
) -> str:
    """
    Format user prompt with selected evidence and iteration context.

    NOTE: Evidence should be pre-selected using select_evidence_for_judge().
    This function assumes evidence is already capped.
    """
    total_count = total_evidence_count or len(evidence)
    max_content_len = 1500
    scoring_prompt = get_scoring_prompt(domain)

    def format_single_evidence(i: int, e: Evidence) -> str:
        content = e.content
        if len(content) > max_content_len:
            content = content[:max_content_len] + "..."
        return (
            f"### Evidence {i + 1}\n"
            f"**Source**: {e.citation.source.upper()} - {e.citation.title}\n"
            f"**URL**: {e.citation.url}\n"
            f"**Content**:\n{content}"
        )

    evidence_text = "\n\n".join([format_single_evidence(i, e) for i, e in enumerate(evidence)])

    # Lost-in-the-middle mitigation: put critical context at START and END
    return f"""## Research Question (IMPORTANT - stay focused on this)

{question}

## Search Progress

- **Iteration**: {iteration}/{max_iterations}
- **Total evidence collected**: {total_count} sources
- **Evidence shown below**: {len(evidence)} diverse sources (selected for relevance)

## Available Evidence

{evidence_text}

## Your Task

{scoring_prompt}

DO NOT decide "synthesize" vs "continue" - that decision is made by the system.

## REMINDER: Original Question (stay focused)

{question}
"""


def format_empty_evidence_prompt(question: str) -> str:
    """
    Format prompt when no evidence was found.

    Args:
        question: The user's research question

    Returns:
        Formatted prompt string
    """
    return f"""## Research Question

{question}

## Available Evidence

No evidence was found from the search.

## Your Task

Since no evidence was found, recommend search queries that might yield better results.
Set sufficient=False and recommendation="continue".
Suggest 3-5 specific search queries.
"""