# biorxiv_server_fixed.py
from mcp.server.fastmcp import FastMCP
import httpx
import logging
from datetime import datetime, timedelta
import sys
from pathlib import Path
import re

# Add parent directory to path for shared imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from shared import (
    config,
    RateLimiter,
    format_authors,
    ErrorFormatter,
    truncate_text
)
from shared.http_client import get_http_client, CustomHTTPClient

# Configure logging with DEBUG for detailed troubleshooting
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

mcp = FastMCP("biorxiv-server")

# Rate limiting using shared utility
rate_limiter = RateLimiter(config.rate_limits.biorxiv_delay)


def preprocess_query(query: str) -> tuple[list[str], list[str]]:
    """Preprocess query into search terms and handle synonyms.

    Returns:
        tuple of (primary_terms, all_search_terms)
    """
    # Convert to lowercase for matching
    query_lower = query.lower()

    # Common ALS-related synonyms and variations
    synonyms = {
        'als': ['amyotrophic lateral sclerosis', 'motor neuron disease',
                'motor neurone disease', 'lou gehrig'],
        'amyotrophic lateral sclerosis': ['als', 'motor neuron disease'],
        'mnd': ['motor neuron disease', 'motor neurone disease', 'als'],
        'sod1': ['superoxide dismutase 1', 'cu/zn superoxide dismutase'],
        'tdp-43': ['tdp43', 'tardbp', 'tar dna binding protein'],
        'c9orf72': ['c9', 'chromosome 9 open reading frame 72'],
        'fus': ['fused in sarcoma', 'tls'],
    }

    # Split query into individual terms (handle multiple spaces and special chars)
    # Keep hyphenated words together (like TDP-43)
    terms = re.split(r'\s+', query_lower.strip())

    # Build comprehensive search term list
    all_terms = []
    primary_terms = []

    for term in terms:
        # Skip very short terms unless they're known abbreviations
        if len(term) < 3 and term not in ['als', 'mnd', 'fus', 'c9']:
            continue

        primary_terms.append(term)
        all_terms.append(term)

        # Add synonyms if they exist
        if term in synonyms:
            all_terms.extend(synonyms[term])

    # Remove duplicates while preserving order.
    # Use a fresh "seen" set for each list: every primary term also appears in
    # all_terms, so reusing a single set would leave primary_terms empty.
    seen = set()
    all_terms = [t for t in all_terms if not (t in seen or seen.add(t))]
    seen = set()
    primary_terms = [t for t in primary_terms if not (t in seen or seen.add(t))]

    return primary_terms, all_terms
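
# Illustrative example of preprocess_query, given the synonym table above:
#   preprocess_query("ALS TDP-43")
#   -> primary_terms = ['als', 'tdp-43']
#   -> all_terms     = ['als', 'amyotrophic lateral sclerosis', 'motor neuron disease',
#                       'motor neurone disease', 'lou gehrig', 'tdp-43', 'tdp43',
#                       'tardbp', 'tar dna binding protein']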


def matches_query(paper: dict, primary_terms: list[str], all_terms: list[str],
                  require_all: bool = False) -> bool:
    """Check if a paper matches the search query.

    Args:
        paper: Paper dictionary from bioRxiv API
        primary_terms: Main search terms from user query
        all_terms: All search terms including synonyms
        require_all: If True, require ALL primary terms. If False, require ANY term.

    Returns:
        True if paper matches search criteria
    """
    # Get searchable text
    title = paper.get("title", "").lower()
    abstract = paper.get("abstract", "").lower()
    searchable_text = f" {title} {abstract} "  # Add spaces for boundary matching

    # DEBUG: Log paper being checked
    paper_doi = paper.get("doi", "unknown")
    logger.debug(f"🔍 Checking paper: {title[:60]}... (DOI: {paper_doi})")

    if not searchable_text.strip():
        logger.debug("  ❌ Rejected: No title/abstract")
        return False

    # For ALS specifically, need to be careful about word boundaries
    has_any_match = False
    matched_term = None

    for term in all_terms:
        # For short terms like "ALS", require word boundaries
        if len(term) <= 3:
            # Check for word boundary match
            pattern = r'\b' + re.escape(term) + r'\b'
            if re.search(pattern, searchable_text, re.IGNORECASE):
                has_any_match = True
                matched_term = term
                break
        else:
            # For longer terms, can be more lenient
            if term.lower() in searchable_text:
                has_any_match = True
                matched_term = term
                break

    if not has_any_match:
        logger.debug(f"  ❌ Rejected: No term match. Terms searched: {all_terms[:3]}...")
        return False

    logger.debug(f"  ✅ Matched on term: '{matched_term}'")

    # If we only need any match, we're done
    if not require_all:
        return True

    # For require_all, check that all primary terms are present
    # Allow for word boundaries to avoid partial matches
    for term in primary_terms:
        # Create pattern that matches the term as a whole word or part of hyphenated word
        # This handles cases like "TDP-43" or "SOD1"
        pattern = r'\b' + re.escape(term) + r'(?:\b|[-])'
        if not re.search(pattern, searchable_text, re.IGNORECASE):
            return False

    return True
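
# Illustration of the matching rules above (assumed behaviour, based on this code,
# not on real API output):
#   matches_query({"title": "TDP-43 aggregation in ALS", "abstract": "..."},
#                 ['als', 'tdp-43'], ['als', 'tdp-43'])  -> True
#   matches_query({"title": "Signals in cardiac tissue", "abstract": "..."},
#                 ['als'], ['als'])  -> False
# Short terms (<= 3 chars) such as 'als' must match as whole words, so the "als"
# inside "Signals" is rejected, while longer terms match as plain substrings of
# the combined title + abstract text.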


@mcp.tool()
async def search_preprints(
    query: str,
    server: str = "both",
    max_results: int = 10,
    days_back: int = 365
) -> str:
    """Search bioRxiv and medRxiv for ALS preprints. Returns recent preprints before peer review.

    Args:
        query: Search query (e.g., 'ALS TDP-43')
        server: Which server to search - one of: biorxiv, medrxiv, both (default: both)
        max_results: Maximum number of results (default: 10)
        days_back: Number of days to look back (default: 365 - about 1 year)
    """
    try:
        logger.info(f"🔎 Searching bioRxiv/medRxiv for: '{query}'")
        logger.info(f"  Parameters: server={server}, max_results={max_results}, days_back={days_back}")

        # Preprocess query for better matching
        primary_terms, all_terms = preprocess_query(query)
        logger.info(f"📝 Search terms: primary={primary_terms}, all={all_terms}")

        # Calculate date range
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)

        # Format dates for API (YYYY-MM-DD)
        start_date_str = start_date.strftime("%Y-%m-%d")
        end_date_str = end_date.strftime("%Y-%m-%d")
        logger.info(f"📅 Date range: {start_date_str} to {end_date_str}")

        # bioRxiv/medRxiv API endpoint
        base_url = "https://api.biorxiv.org/details"

        all_results = []
        servers_to_search = []
        if server in ["biorxiv", "both"]:
            servers_to_search.append("biorxiv")
        if server in ["medrxiv", "both"]:
            servers_to_search.append("medrxiv")

        # Use a custom HTTP client with proper timeout for bioRxiv
        # Don't use shared client as it may have conflicting timeout settings
        async with CustomHTTPClient(timeout=15.0) as client:
            for srv in servers_to_search:
                try:
                    cursor = 0
                    found_in_server = []
                    max_iterations = 1  # Only check first page (100 papers) for much faster response
                    iteration = 0

                    while iteration < max_iterations:
                        # Rate limiting
                        await rate_limiter.wait()

                        # Search by date range with cursor for pagination
                        url = f"{base_url}/{srv}/{start_date_str}/{end_date_str}/{cursor}"
                        logger.info(f"🌐 Querying {srv} API (page {iteration+1}, cursor={cursor})")
                        logger.info(f"  URL: {url}")

                        response = await client.get(url)
                        response.raise_for_status()
                        data = response.json()

                        # Extract collection
                        collection = data.get("collection", [])
                        if not collection:
                            logger.info(f"📭 No more results from {srv}")
                            break

                        logger.info(f"📦 Fetched {len(collection)} papers from API")

                        # Show first few papers for debugging
                        if iteration == 0 and collection:
                            logger.info("  Sample papers from API:")
                            for i, paper in enumerate(collection[:3]):
                                logger.info(f"    {i+1}. {paper.get('title', 'No title')[:60]}...")

                        # Filter papers using improved matching
                        # Start with lenient matching (ANY term)
                        logger.debug(f"🔍 Starting to filter {len(collection)} papers...")
                        filtered = [
                            paper for paper in collection
                            if matches_query(paper, primary_terms, all_terms, require_all=False)
                        ]

                        logger.info(f"✅ Filtered results: {len(filtered)}/{len(collection)} papers matched")
                        if len(filtered) > 0:
                            logger.info("  Matched papers:")
                            for i, paper in enumerate(filtered[:3]):
                                logger.info(f"    {i+1}. {paper.get('title', 'No title')[:60]}...")

                        found_in_server.extend(filtered)
                        logger.info(f"📊 Running total for {srv}: {len(found_in_server)} papers")

                        # Check if we have enough results
                        if len(found_in_server) >= max_results:
                            logger.info(f"Reached max_results limit ({max_results})")
                            break

                        # Continue searching if we haven't found enough
                        if len(found_in_server) < 5 and iteration < max_iterations - 1:
                            # Keep searching for more results
                            pass
                        elif len(found_in_server) > 0 and iteration >= 3:
                            # Found some results after reasonable search
                            logger.info(f"Found {len(found_in_server)} results after {iteration+1} pages")
                            break

                        # Check for more pages
                        messages = data.get("messages", [])
                        # The API returns "cursor" in messages for next page
                        has_more = False
                        for msg in messages:
                            if "cursor=" in str(msg):
                                try:
                                    cursor_str = str(msg).split("cursor=")[1].split()[0]
                                    next_cursor = int(cursor_str)
                                    if next_cursor > cursor:
                                        cursor = next_cursor
                                        has_more = True
                                        break
                                except (ValueError, IndexError):
                                    pass

                        # Alternative: increment by collection size
                        if not has_more:
                            if len(collection) >= 100:
                                cursor += len(collection)
                            else:
                                # Less than full page means we've reached the end
                                break

                        iteration += 1

                    all_results.extend(found_in_server[:max_results])
                    logger.info(f"🏁 Total results from {srv}: {len(found_in_server)} papers found")

                except httpx.HTTPStatusError as e:
                    logger.warning(f"Error searching {srv}: {e}")
                    continue
                except Exception as e:
                    logger.warning(f"Unexpected error searching {srv}: {e}")
                    continue

        # If no results with lenient matching, provide helpful message
        if not all_results:
            logger.warning(f"⚠️ No preprints found for query: {query}")

            # Provide suggestions for improving search
            suggestions = []
            if len(primary_terms) > 3:
                suggestions.append("Try using fewer search terms")
            if not any(term in ['als', 'amyotrophic lateral sclerosis', 'motor neuron'] for term in all_terms):
                suggestions.append("Add 'ALS' or 'motor neuron disease' to your search")
            if days_back < 365:
                suggestions.append(f"Expand the time range beyond {days_back} days")

            suggestion_text = ""
            if suggestions:
                suggestion_text = "\n\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)

            return f"No preprints found for query: '{query}' in the last {days_back} days{suggestion_text}"

        # Sort by date (most recent first)
        all_results.sort(key=lambda x: x.get("date", ""), reverse=True)

        # Limit results
        all_results = all_results[:max_results]
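
        # Sketch of the formatted text built below (illustrative values only;
        # author formatting depends on the shared format_authors helper):
        #   Found 2 preprints for query: 'ALS TDP-43'
        #
        #   1. **Example preprint title**
        #      DOI: 10.1101/XXXX.XX.XX.XXXXXX | bioRxiv | Posted: 2024-01-01
        #      Authors: First Author, Second Author, Third Author
        #      Category: neuroscience
        #      Abstract: first ~300 characters of the abstract
        #      URL: https://doi.org/10.1101/XXXX.XX.XX.XXXXXX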

        logger.info(f"🎯 FINAL RESULTS: Returning {len(all_results)} preprints for '{query}'")
        if all_results:
            logger.info("  Top results:")
            for i, paper in enumerate(all_results[:3], 1):
                logger.info(f"    {i}. {paper.get('title', 'No title')[:60]}...")
                logger.info(f"       DOI: {paper.get('doi', 'unknown')}, Date: {paper.get('date', 'unknown')}")

        # Format results
        result = f"Found {len(all_results)} preprints for query: '{query}'\n\n"

        for i, paper in enumerate(all_results, 1):
            title = paper.get("title", "No title")
            doi = paper.get("doi", "Unknown")
            date = paper.get("date", "Unknown")
            authors = paper.get("authors", "Unknown authors")
            authors_str = format_authors(authors, max_authors=3)
            abstract = paper.get("abstract", "No abstract available")
            category = paper.get("category", "")
            # The DOI string itself doesn't name the server, so use the API's
            # "server" field (the same field get_preprint_details relies on below)
            server_name = paper.get("server", "bioRxiv/medRxiv")

            result += f"{i}. **{title}**\n"
            result += f"   DOI: {doi} | {server_name} | Posted: {date}\n"
            result += f"   Authors: {authors_str}\n"
            if category:
                result += f"   Category: {category}\n"
            result += f"   Abstract: {truncate_text(abstract, max_chars=300, suffix='')}\n"
            result += f"   URL: https://doi.org/{doi}\n\n"

        logger.info(f"Successfully retrieved {len(all_results)} preprints")
        return result

    except httpx.TimeoutException:
        logger.error("bioRxiv/medRxiv API request timed out")
        return "Error: bioRxiv/medRxiv API request timed out. Please try again."
    except httpx.HTTPStatusError as e:
        logger.error(f"bioRxiv/medRxiv API error: {e}")
        return f"Error: bioRxiv/medRxiv API returned status code {e.response.status_code}"
    except Exception as e:
        logger.error(f"Unexpected error in search_preprints: {e}")
        return f"Error searching preprints: {str(e)}"


@mcp.tool()
async def get_preprint_details(doi: str) -> str:
    """Get full details for a specific bioRxiv/medRxiv preprint by DOI.

    Args:
        doi: The DOI of the preprint (e.g., '10.1101/2024.01.01.123456')
    """
    try:
        logger.info(f"Getting details for DOI: {doi}")

        # Ensure DOI is properly formatted
        if not doi.startswith("10.1101/"):
            doi = f"10.1101/{doi}"

        # Determine server from DOI
        # bioRxiv DOIs typically have format: 10.1101/YYYY.MM.DD.NNNNNN
        # medRxiv DOIs are similar but the content determines the server

        # Use shared HTTP client for connection pooling
        client = get_http_client(timeout=30.0)

        # Try the DOI endpoint
        url = f"https://api.biorxiv.org/details/{doi}"
        response = await client.get(url)

        if response.status_code == 404:
            # Try with both servers
            for server in ["biorxiv", "medrxiv"]:
                url = f"https://api.biorxiv.org/details/{server}/{doi}"
                response = await client.get(url)
                if response.status_code == 200:
                    break
            else:
                return f"Preprint with DOI {doi} not found"

        response.raise_for_status()
        data = response.json()

        collection = data.get("collection", [])
        if not collection:
            return f"No details found for DOI: {doi}"

        # Get the first (and should be only) paper
        paper = collection[0]

        title = paper.get("title", "No title")
        date = paper.get("date", "Unknown")
        authors = paper.get("authors", "Unknown authors")
        abstract = paper.get("abstract", "No abstract available")
        category = paper.get("category", "")
        server_name = paper.get("server", "Unknown")

        result = f"**{title}**\n\n"
        result += f"**DOI:** {doi}\n"
        result += f"**Server:** {server_name}\n"
        result += f"**Posted:** {date}\n"
        if category:
            result += f"**Category:** {category}\n"
        result += f"**Authors:** {authors}\n\n"
        result += f"**Abstract:**\n{abstract}\n\n"
        result += f"**Full Text URL:** https://doi.org/{doi}\n"

        return result

    except httpx.HTTPStatusError as e:
        logger.error(f"Error fetching preprint details: {e}")
        return f"Error fetching preprint details: HTTP {e.response.status_code}"
    except Exception as e:
        logger.error(f"Unexpected error getting preprint details: {e}")
        return f"Error getting preprint details: {str(e)}"
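

# Launch note (assumed setup): this module is started directly and speaks MCP over
# stdio, so an MCP client would typically register it with something like
#   "biorxiv": {"command": "python", "args": ["biorxiv_server_fixed.py"]}
# in its server configuration. The "biorxiv" key and path are illustrative.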
{str(e)}" if __name__ == "__main__": mcp.run(transport="stdio")