"""PubMed search tool using NCBI E-utilities."""
from typing import Any
import httpx
import structlog
import xmltodict
from tenacity import retry, stop_after_attempt, wait_exponential
from src.tools.query_utils import preprocess_query
from src.tools.rate_limiter import get_pubmed_limiter
from src.utils.config import settings
from src.utils.exceptions import RateLimitError, SearchError
from src.utils.models import Citation, Evidence
# Module-level structured logger; bound per-event with keyword context below.
logger = structlog.get_logger()
class PubMedTool:
    """Search tool for PubMed/NCBI built on the E-utilities API.

    Flow: ESearch (query -> PMIDs), then EFetch (PMIDs -> article XML),
    parsed into ``Evidence`` objects with ``Citation`` metadata.
    """

    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    HTTP_TOO_MANY_REQUESTS = 429

    def __init__(self, api_key: str | None = None) -> None:
        """Initialize the tool.

        Args:
            api_key: Optional NCBI API key; falls back to
                ``settings.ncbi_api_key`` when not given.
        """
        self.api_key = api_key or settings.ncbi_api_key
        # Ignore placeholder values from .env.example
        if self.api_key == "your-ncbi-key-here":
            self.api_key = None
        # Use shared rate limiter (keyed vs. unkeyed clients get different limits)
        self._limiter = get_pubmed_limiter(self.api_key)

    @property
    def name(self) -> str:
        """Identifier for this tool, used as the citation source tag."""
        return "pubmed"

    async def _rate_limit(self) -> None:
        """Enforce NCBI rate limiting before each request."""
        await self._limiter.acquire()

    def _build_params(self, **kwargs: Any) -> dict[str, Any]:
        """Build request params with JSON retmode and optional API key."""
        params = {**kwargs, "retmode": "json"}
        if self.api_key:
            params["api_key"] = self.api_key
        return params

    async def _get(
        self,
        client: httpx.AsyncClient,
        endpoint: str,
        params: dict[str, Any],
    ) -> httpx.Response:
        """GET an E-utilities endpoint, mapping HTTP errors to domain exceptions.

        Raises:
            RateLimitError: if NCBI responds with HTTP 429.
            SearchError: for any other HTTP error status.
        """
        try:
            resp = await client.get(f"{self.BASE_URL}/{endpoint}", params=params)
            resp.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == self.HTTP_TOO_MANY_REQUESTS:
                raise RateLimitError("PubMed rate limit exceeded") from e
            raise SearchError(f"PubMed search failed: {e}") from e
        return resp

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        reraise=True,
    )
    async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
        """
        Search PubMed and return evidence.

        1. ESearch: Get PMIDs matching query
        2. EFetch: Get abstracts for those PMIDs
        3. Parse and return Evidence objects

        Args:
            query: Free-text search query.
            max_results: Maximum number of PMIDs to request.

        Raises:
            RateLimitError: if NCBI rate limits either request.
            SearchError: on other HTTP failures or unparseable XML.
        """
        await self._rate_limit()
        # Preprocess query to remove noise and expand synonyms; fall back to
        # the raw query if preprocessing strips everything.
        clean_query = preprocess_query(query)
        final_query = clean_query or query

        async with httpx.AsyncClient(timeout=30.0) as client:
            # Step 1: Search for PMIDs
            search_resp = await self._get(
                client,
                "esearch.fcgi",
                self._build_params(
                    db="pubmed",
                    term=final_query,
                    retmax=max_results,
                    sort="relevance",
                ),
            )
            search_data = search_resp.json()
            pmids = search_data.get("esearchresult", {}).get("idlist", [])
            if not pmids:
                return []

            # Step 2: Fetch abstracts
            await self._rate_limit()
            fetch_params = self._build_params(
                db="pubmed",
                id=",".join(pmids),
                rettype="abstract",
            )
            # Use XML for fetch (more reliable parsing than the JSON form)
            fetch_params["retmode"] = "xml"
            fetch_resp = await self._get(client, "efetch.fcgi", fetch_params)

            # Step 3: Parse XML to Evidence
            return self._parse_pubmed_xml(fetch_resp.text)

    def _parse_pubmed_xml(self, xml_text: str) -> list[Evidence]:
        """Parse PubMed EFetch XML into Evidence objects.

        Malformed individual articles are skipped (logged at debug level)
        rather than failing the whole batch.

        Raises:
            SearchError: if the XML document itself cannot be parsed.
        """
        try:
            data = xmltodict.parse(xml_text)
        except Exception as e:
            raise SearchError(f"Failed to parse PubMed XML: {e}") from e
        articles = data.get("PubmedArticleSet", {}).get("PubmedArticle", [])
        # Handle single article (xmltodict returns dict instead of list)
        if isinstance(articles, dict):
            articles = [articles]
        evidence_list = []
        for article in articles:
            try:
                evidence = self._article_to_evidence(article)
                if evidence:
                    evidence_list.append(evidence)
            except (KeyError, AttributeError, TypeError) as e:
                logger.debug("Skipping malformed article", error=str(e))
                continue
        return evidence_list

    def _article_to_evidence(self, article: dict[str, Any]) -> Evidence | None:
        """Convert a single PubMed article dict to Evidence.

        Returns None when the article lacks a title or abstract, since such
        records carry no usable evidence content.
        """
        medline = article.get("MedlineCitation", {})
        article_data = medline.get("Article", {})

        # Extract PMID (xmltodict wraps attributed elements in a dict)
        pmid = medline.get("PMID", {})
        if isinstance(pmid, dict):
            pmid = pmid.get("#text", "")

        # Extract title
        title = article_data.get("ArticleTitle", "")
        if isinstance(title, dict):
            title = title.get("#text", str(title))

        # Extract abstract; AbstractText may be a string, a dict, or a list
        # of labeled sections (Background/Methods/...) — join sections.
        abstract_data = article_data.get("Abstract", {}).get("AbstractText", "")
        if isinstance(abstract_data, list):
            abstract = " ".join(
                item.get("#text", str(item)) if isinstance(item, dict) else str(item)
                for item in abstract_data
            )
        elif isinstance(abstract_data, dict):
            abstract = abstract_data.get("#text", str(abstract_data))
        else:
            abstract = str(abstract_data)
        if not abstract or not title:
            return None

        # Extract date (Month/Day may be absent; PubMed may also use month
        # names here — format is best-effort, not strict ISO)
        pub_date = article_data.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
        year = pub_date.get("Year", "Unknown")
        month = pub_date.get("Month", "01")
        day = pub_date.get("Day", "01")
        date_str = f"{year}-{month}-{day}" if year != "Unknown" else "Unknown"

        # Extract authors
        author_list = article_data.get("AuthorList", {}).get("Author", [])
        if isinstance(author_list, dict):
            author_list = [author_list]
        authors = []
        for author in author_list[:5]:  # Limit to 5 authors
            last = author.get("LastName", "")
            first = author.get("ForeName", "")
            if last:
                authors.append(f"{last} {first}".strip())

        # Truncation rationale: LLM context limits + cost optimization
        # - Abstract: 2000 chars (~500 tokens) captures key findings
        # - Title: 500 chars covers even verbose journal titles
        return Evidence(
            content=abstract[:2000],
            citation=Citation(
                source="pubmed",
                title=title[:500],
                url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                date=date_str,
                authors=authors,
            ),
        )