"""Data models for the Search feature."""
from datetime import UTC, datetime
from typing import Any, ClassVar, Literal
from pydantic import BaseModel, Field
# Centralized source type - add new sources here (e.g., new databases)
SourceName = Literal["pubmed", "clinicaltrials", "europepmc", "preprint", "openalex", "web"]
class Citation(BaseModel):
"""A citation to a source document."""
source: SourceName = Field(description="Source the citation came from")
title: str = Field(min_length=1, max_length=500)
url: str = Field(description="URL to the source")
date: str = Field(description="Publication date (YYYY-MM-DD or 'Unknown')")
authors: list[str] = Field(default_factory=list)
MAX_AUTHORS_IN_CITATION: ClassVar[int] = 3
@property
def formatted(self) -> str:
"""Format as a citation string."""
author_str = ", ".join(self.authors[: self.MAX_AUTHORS_IN_CITATION])
if len(self.authors) > self.MAX_AUTHORS_IN_CITATION:
author_str += " et al."
return f"{author_str} ({self.date}). {self.title}. {self.source.upper()}"
class Evidence(BaseModel):
"""A piece of evidence retrieved from search."""
content: str = Field(min_length=1, description="The actual text content")
citation: Citation
relevance: float = Field(default=0.0, ge=0.0, le=1.0, description="Relevance score 0-1")
metadata: dict[str, Any] = Field(
default_factory=dict,
description="Additional metadata (e.g., cited_by_count, concepts, is_open_access)",
)
model_config = {"frozen": True}
class SearchResult(BaseModel):
"""Result of a search operation."""
query: str
evidence: list[Evidence]
sources_searched: list[SourceName]
total_found: int
errors: list[str] = Field(default_factory=list)
class AssessmentDetails(BaseModel):
"""Detailed assessment of evidence quality."""
mechanism_score: int = Field(
...,
ge=0,
le=10,
description="How well does the evidence explain the mechanism? 0-10",
)
mechanism_reasoning: str = Field(
..., min_length=10, description="Explanation of mechanism score"
)
clinical_evidence_score: int = Field(
...,
ge=0,
le=10,
description="Strength of clinical/preclinical evidence. 0-10",
)
clinical_reasoning: str = Field(
..., min_length=10, description="Explanation of clinical evidence score"
)
drug_candidates: list[str] = Field(
default_factory=list, description="List of specific drug candidates mentioned"
)
key_findings: list[str] = Field(
default_factory=list, description="Key findings from the evidence"
)
class JudgeAssessment(BaseModel):
"""Complete assessment from the Judge."""
details: AssessmentDetails
sufficient: bool = Field(..., description="Is evidence sufficient to provide a recommendation?")
confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence in the assessment (0-1)")
recommendation: Literal["continue", "synthesize"] = Field(
...,
description="continue = need more evidence, synthesize = ready to answer",
)
next_search_queries: list[str] = Field(
default_factory=list, description="If continue, what queries to search next"
)
reasoning: str = Field(
..., min_length=20, description="Overall reasoning for the recommendation"
)
class AgentEvent(BaseModel):
"""Event emitted by the orchestrator for UI streaming."""
type: Literal[
"started",
"thinking", # Multi-agent reasoning in progress (before first event)
"searching",
"search_complete",
"judging",
"judge_complete",
"looping",
"synthesizing",
"complete",
"error",
"streaming",
"hypothesizing",
"analyzing", # NEW for Phase 13
"analysis_complete", # NEW for Phase 13
"progress", # NEW for SPEC_01
]
message: str
data: Any = None
timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC))
iteration: int = 0
def to_markdown(self) -> str:
"""Format event as markdown for chat display."""
icons = {
"started": "πŸš€",
"thinking": "⏳", # Hourglass for thinking/waiting
"searching": "πŸ”",
"search_complete": "πŸ“š",
"judging": "🧠",
"judge_complete": "βœ…",
"looping": "πŸ”„",
"synthesizing": "πŸ“",
"complete": "πŸŽ‰",
"error": "❌",
"streaming": "πŸ“‘",
"hypothesizing": "πŸ”¬", # NEW
"analyzing": "πŸ“Š", # NEW
"analysis_complete": "πŸ“ˆ", # NEW
"progress": "⏱️", # NEW
}
icon = icons.get(self.type, "β€’")
return f"{icon} **{self.type.upper()}**: {self.message}"
class MechanismHypothesis(BaseModel):
"""A scientific hypothesis about drug mechanism."""
drug: str = Field(description="The drug being studied")
target: str = Field(description="Molecular target (e.g., AMPK, mTOR)")
pathway: str = Field(description="Biological pathway affected")
effect: str = Field(description="Downstream effect on disease")
confidence: float = Field(ge=0, le=1, description="Confidence in hypothesis")
supporting_evidence: list[str] = Field(
default_factory=list, description="PMIDs or URLs supporting this hypothesis"
)
contradicting_evidence: list[str] = Field(
default_factory=list, description="PMIDs or URLs contradicting this hypothesis"
)
search_suggestions: list[str] = Field(
default_factory=list, description="Suggested searches to test this hypothesis"
)
def to_search_queries(self) -> list[str]:
"""Generate search queries to test this hypothesis."""
return [
f"{self.drug} {self.target}",
f"{self.target} {self.pathway}",
f"{self.pathway} {self.effect}",
*self.search_suggestions,
]
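# Illustrative sketch (values invented): to_search_queries expands a hypothesis into
# drug-target, target-pathway, and pathway-effect query strings, plus any explicit
# suggestions.
#
#     hyp = MechanismHypothesis(
#         drug="metformin",
#         target="AMPK",
#         pathway="mTOR signaling",
#         effect="reduced cellular senescence",
#         confidence=0.6,
#         search_suggestions=["metformin senescence clinical trial"],
#     )
#     hyp.to_search_queries()
#     # -> ["metformin AMPK", "AMPK mTOR signaling",
#     #     "mTOR signaling reduced cellular senescence",
#     #     "metformin senescence clinical trial"]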
class HypothesisAssessment(BaseModel):
"""Assessment of evidence against hypotheses."""
hypotheses: list[MechanismHypothesis]
primary_hypothesis: MechanismHypothesis | None = Field(
default=None, description="Most promising hypothesis based on current evidence"
)
knowledge_gaps: list[str] = Field(description="What we don't know yet")
recommended_searches: list[str] = Field(description="Searches to fill knowledge gaps")
class ReportSection(BaseModel):
"""A section of the research report."""
title: str
content: str
# Reserved for future inline citation tracking within sections
citations: list[str] = Field(default_factory=list)
class ResearchReport(BaseModel):
"""Structured scientific report."""
title: str = Field(description="Report title")
executive_summary: str = Field(
description="One-paragraph summary for quick reading", min_length=100, max_length=1000
)
research_question: str = Field(description="Clear statement of what was investigated")
methodology: ReportSection = Field(description="How the research was conducted")
hypotheses_tested: list[dict[str, Any]] = Field(
description="Hypotheses with supporting/contradicting evidence counts"
)
mechanistic_findings: ReportSection = Field(description="Findings about drug mechanisms")
clinical_findings: ReportSection = Field(
description="Findings from clinical/preclinical studies"
)
drug_candidates: list[str] = Field(description="Identified drug candidates")
limitations: list[str] = Field(description="Study limitations")
conclusion: str = Field(description="Overall conclusion")
references: list[dict[str, str]] = Field(
default_factory=list,
description="Formatted references with title, authors, source, date, URL",
)
# Metadata
sources_searched: list[str] = Field(default_factory=list)
total_papers_reviewed: int = 0
search_iterations: int = 0
confidence_score: float = Field(ge=0, le=1)
def to_markdown(self) -> str:
"""Render report as markdown."""
sections = [
f"# {self.title}\n",
f"## Executive Summary\n{self.executive_summary}\n",
f"## Research Question\n{self.research_question}\n",
f"## Methodology\n{self.methodology.content}\n",
]
# Hypotheses
sections.append("## Hypotheses Tested\n")
if not self.hypotheses_tested:
sections.append("*No hypotheses tested yet.*\n")
for h in self.hypotheses_tested:
supported = h.get("supported", 0)
contradicted = h.get("contradicted", 0)
if supported == 0 and contradicted == 0:
status = "❓ Untested"
elif supported > contradicted:
status = "βœ… Supported"
else:
status = "⚠️ Mixed"
sections.append(
f"- **{h.get('mechanism', 'Unknown')}** ({status}): "
f"{supported} supporting, {contradicted} contradicting\n"
)
# Findings
sections.append(f"## Mechanistic Findings\n{self.mechanistic_findings.content}\n")
sections.append(f"## Clinical Findings\n{self.clinical_findings.content}\n")
# Drug candidates
sections.append("## Drug Candidates\n")
if self.drug_candidates:
for drug in self.drug_candidates:
sections.append(f"- **{drug}**\n")
else:
sections.append("*No drug candidates identified.*\n")
# Limitations
sections.append("## Limitations\n")
if self.limitations:
for lim in self.limitations:
sections.append(f"- {lim}\n")
else:
sections.append("*No limitations documented.*\n")
# Conclusion
sections.append(f"## Conclusion\n{self.conclusion}\n")
# References
sections.append("## References\n")
if self.references:
for i, ref in enumerate(self.references, 1):
sections.append(
f"{i}. {ref.get('authors', 'Unknown')}. "
f"*{ref.get('title', 'Untitled')}*. "
f"{ref.get('source', '')} ({ref.get('date', '')}). "
f"[Link]({ref.get('url', '#')})\n"
)
else:
sections.append("*No references available.*\n")
# Metadata footer
sections.append("\n---\n")
sections.append(
f"*Report generated from {self.total_papers_reviewed} papers "
f"across {self.search_iterations} search iterations. "
f"Confidence: {self.confidence_score:.0%}*"
)
return "\n".join(sections)
class OrchestratorConfig(BaseModel):
"""Configuration for the orchestrator."""
max_iterations: int = Field(default=10, ge=1, le=20)
max_results_per_tool: int = Field(default=10, ge=1, le=50)
search_timeout: float = Field(default=30.0, ge=5.0, le=120.0)
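# Illustrative sketch: the Field bounds reject out-of-range settings at construction
# time (assuming Pydantic v2 validation semantics).
#
#     OrchestratorConfig()                   # defaults: 10 iterations, 10 results/tool, 30s timeout
#     OrchestratorConfig(max_iterations=25)  # raises ValidationError (le=20)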