# biorxiv_server_fixed.py
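"""MCP server exposing bioRxiv/medRxiv preprint search for ALS research.

Provides two tools over the public bioRxiv API (https://api.biorxiv.org):
search_preprints (date-range query with synonym expansion and client-side
keyword filtering) and get_preprint_details (lookup by DOI).
"""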
from mcp.server.fastmcp import FastMCP
import httpx
import logging
from datetime import datetime, timedelta
import sys
from pathlib import Path
import re
# Add parent directory to path for shared imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from shared import (
    config,
    RateLimiter,
    format_authors,
    ErrorFormatter,
    truncate_text
)
from shared.http_client import get_http_client, CustomHTTPClient
# Configure logging with DEBUG for detailed troubleshooting
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
mcp = FastMCP("biorxiv-server")
# Rate limiting using shared utility
rate_limiter = RateLimiter(config.rate_limits.biorxiv_delay)
def preprocess_query(query: str) -> tuple[list[str], list[str]]:
    """Preprocess query into search terms and handle synonyms.

    Returns:
        tuple of (primary_terms, all_search_terms)
    """
    # Convert to lowercase for matching
    query_lower = query.lower()
    # Common ALS-related synonyms and variations
    synonyms = {
        'als': ['amyotrophic lateral sclerosis', 'motor neuron disease', 'motor neurone disease', 'lou gehrig'],
        'amyotrophic lateral sclerosis': ['als', 'motor neuron disease'],
        'mnd': ['motor neuron disease', 'motor neurone disease', 'als'],
        'sod1': ['superoxide dismutase 1', 'cu/zn superoxide dismutase'],
        'tdp-43': ['tdp43', 'tardbp', 'tar dna binding protein'],
        'c9orf72': ['c9', 'chromosome 9 open reading frame 72'],
        'fus': ['fused in sarcoma', 'tls'],
    }
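    # Example of the expansion (with the fixed de-duplication below):
    #   preprocess_query("ALS TDP-43")
    #   -> primary = ['als', 'tdp-43']
    #   -> all     = ['als', 'amyotrophic lateral sclerosis', 'motor neuron disease',
    #                 'motor neurone disease', 'lou gehrig', 'tdp-43', 'tdp43',
    #                 'tardbp', 'tar dna binding protein']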
    # Split query into individual terms (handle multiple spaces and special chars)
    # Keep hyphenated words together (like TDP-43)
    terms = re.split(r'\s+', query_lower.strip())
    # Build comprehensive search term list
    all_terms = []
    primary_terms = []
    for term in terms:
        # Skip very short terms unless they're known abbreviations
        if len(term) < 3 and term not in ['als', 'mnd', 'fus', 'c9']:
            continue
        primary_terms.append(term)
        all_terms.append(term)
        # Add synonyms if they exist
        if term in synonyms:
            all_terms.extend(synonyms[term])
    # Remove duplicates while preserving order.
    # Use a separate `seen` set per list: reusing one set would leave
    # primary_terms empty, since every primary term already appears in all_terms.
    seen = set()
    all_terms = [t for t in all_terms if not (t in seen or seen.add(t))]
    seen = set()
    primary_terms = [t for t in primary_terms if not (t in seen or seen.add(t))]
    return primary_terms, all_terms
def matches_query(paper: dict, primary_terms: list[str], all_terms: list[str], require_all: bool = False) -> bool:
    """Check if a paper matches the search query.

    Args:
        paper: Paper dictionary from the bioRxiv API
        primary_terms: Main search terms from the user query
        all_terms: All search terms including synonyms
        require_all: If True, require ALL primary terms. If False, require ANY term.

    Returns:
        True if the paper matches the search criteria
    """
    # Get searchable text
    title = paper.get("title", "").lower()
    abstract = paper.get("abstract", "").lower()
    searchable_text = f" {title} {abstract} "  # Add spaces for boundary matching
    # DEBUG: log the paper being checked
    paper_doi = paper.get("doi", "unknown")
    logger.debug(f"Checking paper: {title[:60]}... (DOI: {paper_doi})")
    if not searchable_text.strip():
        logger.debug("  Rejected: no title/abstract")
        return False
    # Short terms like "ALS" need word-boundary matching to avoid
    # false hits inside longer words
    has_any_match = False
    matched_term = None
    for term in all_terms:
        if len(term) <= 3:
            # Check for a word-boundary match
            pattern = r'\b' + re.escape(term) + r'\b'
            if re.search(pattern, searchable_text, re.IGNORECASE):
                has_any_match = True
                matched_term = term
                break
        else:
            # Longer terms can be matched as plain substrings
            if term.lower() in searchable_text:
                has_any_match = True
                matched_term = term
                break
    if not has_any_match:
        logger.debug(f"  Rejected: no term match. Terms searched: {all_terms[:3]}...")
        return False
    logger.debug(f"  Matched on term: '{matched_term}'")
    # If any single match is enough, we're done
    if not require_all:
        return True
    # For require_all, check that every primary term is present,
    # using word boundaries to avoid partial matches
    for term in primary_terms:
        # Match the term as a whole word or as part of a hyphenated word
        # (handles cases like "TDP-43" or "SOD1")
        pattern = r'\b' + re.escape(term) + r'(?:\b|[-])'
        if not re.search(pattern, searchable_text, re.IGNORECASE):
            return False
    return True
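# Example of the word-boundary rule for short terms:
#   matches_query({"title": "ALS biomarkers", "abstract": ""}, ["als"], ["als"])   -> True
#   matches_query({"title": "Neural signals", "abstract": ""}, ["als"], ["als"])   -> False
# because r'\bals\b' does not match inside "signals".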
@mcp.tool()
async def search_preprints(
    query: str,
    server: str = "both",
    max_results: int = 10,
    days_back: int = 365
) -> str:
    """Search bioRxiv and medRxiv for ALS preprints. Returns recent preprints before peer review.

    Args:
        query: Search query (e.g., 'ALS TDP-43')
        server: Which server to search - one of: biorxiv, medrxiv, both (default: both)
        max_results: Maximum number of results (default: 10)
        days_back: Number of days to look back (default: 365 - about 1 year)
    """
    try:
        logger.info(f"Searching bioRxiv/medRxiv for: '{query}'")
        logger.info(f"  Parameters: server={server}, max_results={max_results}, days_back={days_back}")
        # Preprocess query for better matching
        primary_terms, all_terms = preprocess_query(query)
        logger.info(f"Search terms: primary={primary_terms}, all={all_terms}")
        # Calculate date range
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        # Format dates for the API (YYYY-MM-DD)
        start_date_str = start_date.strftime("%Y-%m-%d")
        end_date_str = end_date.strftime("%Y-%m-%d")
        logger.info(f"Date range: {start_date_str} to {end_date_str}")
        # bioRxiv/medRxiv API endpoint
        base_url = "https://api.biorxiv.org/details"
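        # The details endpoint is queried as
        #   {base_url}/{server}/{start_date}/{end_date}/{cursor}
        # and returns JSON with a "collection" list (up to 100 records per
        # page) plus a "messages" list carrying paging metadata.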
        all_results = []
        servers_to_search = []
        if server in ["biorxiv", "both"]:
            servers_to_search.append("biorxiv")
        if server in ["medrxiv", "both"]:
            servers_to_search.append("medrxiv")
        # Use a custom HTTP client with an explicit timeout for bioRxiv;
        # don't use the shared client as it may have conflicting timeout settings
        async with CustomHTTPClient(timeout=15.0) as client:
            for srv in servers_to_search:
                try:
                    cursor = 0
                    found_in_server = []
                    max_iterations = 1  # Only check the first page (100 papers) for a much faster response
                    iteration = 0
                    while iteration < max_iterations:
                        # Rate limiting
                        await rate_limiter.wait()
                        # Search by date range, with a cursor for pagination
                        url = f"{base_url}/{srv}/{start_date_str}/{end_date_str}/{cursor}"
                        logger.info(f"Querying {srv} API (page {iteration+1}, cursor={cursor})")
                        logger.info(f"  URL: {url}")
                        response = await client.get(url)
                        response.raise_for_status()
                        data = response.json()
                        # Extract the collection
                        collection = data.get("collection", [])
                        if not collection:
                            logger.info(f"No more results from {srv}")
                            break
                        logger.info(f"Fetched {len(collection)} papers from API")
                        # Show the first few papers for debugging
                        if iteration == 0 and collection:
                            logger.info("  Sample papers from API:")
                            for i, paper in enumerate(collection[:3]):
                                logger.info(f"    {i+1}. {paper.get('title', 'No title')[:60]}...")
                        # Filter papers using the improved matching,
                        # starting with lenient matching (ANY term)
                        logger.debug(f"Starting to filter {len(collection)} papers...")
                        filtered = [
                            paper for paper in collection
                            if matches_query(paper, primary_terms, all_terms, require_all=False)
                        ]
logger.info(f"β
Filtered results: {len(filtered)}/{len(collection)} papers matched")
if len(filtered) > 0:
logger.info(" Matched papers:")
for i, paper in enumerate(filtered[:3]):
logger.info(f" {i+1}. {paper.get('title', 'No title')[:60]}...")
found_in_server.extend(filtered)
logger.info(f"π Running total for {srv}: {len(found_in_server)} papers")
# Check if we have enough results
if len(found_in_server) >= max_results:
logger.info(f"Reached max_results limit ({max_results})")
break
# Continue searching if we haven't found enough
if len(found_in_server) < 5 and iteration < max_iterations - 1:
# Keep searching for more results
pass
elif len(found_in_server) > 0 and iteration >= 3:
# Found some results after reasonable search
logger.info(f"Found {len(found_in_server)} results after {iteration+1} pages")
break
# Check for more pages
messages = data.get("messages", [])
# The API returns "cursor" in messages for next page
has_more = False
for msg in messages:
if "cursor=" in str(msg):
try:
cursor_str = str(msg).split("cursor=")[1].split()[0]
next_cursor = int(cursor_str)
if next_cursor > cursor:
cursor = next_cursor
has_more = True
break
except:
pass
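                        # NOTE: in the API responses observed, "messages" entries
                        # are dicts (e.g. {'status': 'ok', 'cursor': 0, 'count': 100,
                        # 'total': ...}), so the "cursor=" string check above rarely
                        # fires; the fallback below, which advances the cursor by
                        # the page size, does the real work.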
                        # Alternative: increment by the collection size
                        if not has_more:
                            if len(collection) >= 100:
                                cursor += len(collection)
                            else:
                                # Less than a full page means we've reached the end
                                break
                        iteration += 1
                    all_results.extend(found_in_server[:max_results])
                    logger.info(f"Total results from {srv}: {len(found_in_server)} papers found")
                except httpx.HTTPStatusError as e:
                    logger.warning(f"Error searching {srv}: {e}")
                    continue
                except Exception as e:
                    logger.warning(f"Unexpected error searching {srv}: {e}")
                    continue
        # If nothing matched even with lenient matching, return a helpful message
        if not all_results:
            logger.warning(f"No preprints found for query: {query}")
            # Offer suggestions for improving the search
            suggestions = []
            if len(primary_terms) > 3:
                suggestions.append("Try using fewer search terms")
            if not any(term in ['als', 'amyotrophic lateral sclerosis', 'motor neuron'] for term in all_terms):
                suggestions.append("Add 'ALS' or 'motor neuron disease' to your search")
            if days_back < 365:
                suggestions.append(f"Expand the time range beyond {days_back} days")
            suggestion_text = ""
            if suggestions:
                suggestion_text = "\n\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)
            return f"No preprints found for query: '{query}' in the last {days_back} days{suggestion_text}"
        # Sort by date (most recent first)
        all_results.sort(key=lambda x: x.get("date", ""), reverse=True)
        # Limit results
        all_results = all_results[:max_results]
logger.info(f"π― FINAL RESULTS: Returning {len(all_results)} preprints for '{query}'")
if all_results:
logger.info(" Top results:")
for i, paper in enumerate(all_results[:3], 1):
logger.info(f" {i}. {paper.get('title', 'No title')[:60]}...")
logger.info(f" DOI: {paper.get('doi', 'unknown')}, Date: {paper.get('date', 'unknown')}")
# Format results
result = f"Found {len(all_results)} preprints for query: '{query}'\n\n"
for i, paper in enumerate(all_results, 1):
title = paper.get("title", "No title")
doi = paper.get("doi", "Unknown")
date = paper.get("date", "Unknown")
authors = paper.get("authors", "Unknown authors")
authors_str = format_authors(authors, max_authors=3)
abstract = paper.get("abstract", "No abstract available")
category = paper.get("category", "")
server_name = "bioRxiv" if "biorxiv" in doi else "medRxiv"
result += f"{i}. **{title}**\n"
result += f" DOI: {doi} | {server_name} | Posted: {date}\n"
result += f" Authors: {authors_str}\n"
if category:
result += f" Category: {category}\n"
result += f" Abstract: {truncate_text(abstract, max_chars=300, suffix='')}\n"
result += f" URL: https://doi.org/{doi}\n\n"
logger.info(f"Successfully retrieved {len(all_results)} preprints")
return result
    except httpx.TimeoutException:
        logger.error("bioRxiv/medRxiv API request timed out")
        return "Error: bioRxiv/medRxiv API request timed out. Please try again."
    except httpx.HTTPStatusError as e:
        logger.error(f"bioRxiv/medRxiv API error: {e}")
        return f"Error: bioRxiv/medRxiv API returned status code {e.response.status_code}"
    except Exception as e:
        logger.error(f"Unexpected error in search_preprints: {e}")
        return f"Error searching preprints: {str(e)}"
@mcp.tool()
async def get_preprint_details(doi: str) -> str:
    """Get full details for a specific bioRxiv/medRxiv preprint by DOI.

    Args:
        doi: The DOI of the preprint (e.g., '10.1101/2024.01.01.123456')
    """
    try:
        logger.info(f"Getting details for DOI: {doi}")
        # Ensure the DOI carries the shared 10.1101 prefix
        if not doi.startswith("10.1101/"):
            doi = f"10.1101/{doi}"
        # The DOI alone doesn't identify the server: bioRxiv and medRxiv DOIs
        # share the 10.1101/YYYY.MM.DD.NNNNNN format, so we may have to try
        # both servers below
        # Use the shared HTTP client for connection pooling
        client = get_http_client(timeout=30.0)
        # Try the DOI endpoint
        url = f"https://api.biorxiv.org/details/{doi}"
        response = await client.get(url)
        if response.status_code == 404:
            # Try both servers in turn
            for server in ["biorxiv", "medrxiv"]:
                url = f"https://api.biorxiv.org/details/{server}/{doi}"
                response = await client.get(url)
                if response.status_code == 200:
                    break
            else:
                return f"Preprint with DOI {doi} not found"
        response.raise_for_status()
        data = response.json()
        collection = data.get("collection", [])
        if not collection:
            return f"No details found for DOI: {doi}"
        # Take the first (and normally only) record
        paper = collection[0]
        title = paper.get("title", "No title")
        date = paper.get("date", "Unknown")
        authors = paper.get("authors", "Unknown authors")
        abstract = paper.get("abstract", "No abstract available")
        category = paper.get("category", "")
        server_name = paper.get("server", "Unknown")
        result = f"**{title}**\n\n"
        result += f"**DOI:** {doi}\n"
        result += f"**Server:** {server_name}\n"
        result += f"**Posted:** {date}\n"
        if category:
            result += f"**Category:** {category}\n"
        result += f"**Authors:** {authors}\n\n"
        result += f"**Abstract:**\n{abstract}\n\n"
        result += f"**Full Text URL:** https://doi.org/{doi}\n"
        return result
    except httpx.HTTPStatusError as e:
        logger.error(f"Error fetching preprint details: {e}")
        return f"Error fetching preprint details: HTTP {e.response.status_code}"
    except Exception as e:
        logger.error(f"Unexpected error getting preprint details: {e}")
        return f"Error getting preprint details: {str(e)}"
if __name__ == "__main__":
    mcp.run(transport="stdio")