# biorxiv_server_fixed.py
from mcp.server.fastmcp import FastMCP
import httpx
import logging
from datetime import datetime, timedelta
import sys
from pathlib import Path
import re

# Add parent directory to path for shared imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from shared import (
    config,
    RateLimiter,
    format_authors,
    ErrorFormatter,
    truncate_text
)
from shared.http_client import get_http_client, CustomHTTPClient

# Configure logging with DEBUG for detailed troubleshooting
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

mcp = FastMCP("biorxiv-server")

# Rate limiting using shared utility
rate_limiter = RateLimiter(config.rate_limits.biorxiv_delay)


def preprocess_query(query: str) -> tuple[list[str], list[str]]:
    """Preprocess query into search terms and handle synonyms.

    Returns:
        tuple of (primary_terms, all_search_terms)
    """
    # Convert to lowercase for matching
    query_lower = query.lower()

    # Common ALS-related synonyms and variations
    synonyms = {
        'als': ['amyotrophic lateral sclerosis', 'motor neuron disease', 'motor neurone disease', 'lou gehrig'],
        'amyotrophic lateral sclerosis': ['als', 'motor neuron disease'],
        'mnd': ['motor neuron disease', 'motor neurone disease', 'als'],
        'sod1': ['superoxide dismutase 1', 'cu/zn superoxide dismutase'],
        'tdp-43': ['tdp43', 'tardbp', 'tar dna binding protein'],
        'c9orf72': ['c9', 'chromosome 9 open reading frame 72'],
        'fus': ['fused in sarcoma', 'tls'],
    }

    # Split query into individual terms (handle multiple spaces and special chars)
    # Keep hyphenated words together (like TDP-43)
    terms = re.split(r'\s+', query_lower.strip())

    # Build comprehensive search term list
    all_terms = []
    primary_terms = []

    for term in terms:
        # Skip very short terms unless they're known abbreviations
        if len(term) < 3 and term not in ['als', 'mnd', 'fus', 'c9']:
            continue

        primary_terms.append(term)
        all_terms.append(term)

        # Add synonyms if they exist
        if term in synonyms:
            all_terms.extend(synonyms[term])

    # Remove duplicates while preserving order; use separate "seen" sets so the
    # primary_terms pass is not emptied by terms already recorded for all_terms
    seen_all = set()
    all_terms = [t for t in all_terms if not (t in seen_all or seen_all.add(t))]
    seen_primary = set()
    primary_terms = [t for t in primary_terms if not (t in seen_primary or seen_primary.add(t))]

    return primary_terms, all_terms
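
# Illustrative example (expected output, given the synonym table above):
#   preprocess_query("ALS TDP-43")
#   -> primary_terms: ['als', 'tdp-43']
#   -> all_terms:     ['als', 'amyotrophic lateral sclerosis', 'motor neuron disease',
#                      'motor neurone disease', 'lou gehrig', 'tdp-43', 'tdp43',
#                      'tardbp', 'tar dna binding protein']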


def matches_query(paper: dict, primary_terms: list[str], all_terms: list[str], require_all: bool = False) -> bool:
    """Check if a paper matches the search query.

    Args:
        paper: Paper dictionary from bioRxiv API
        primary_terms: Main search terms from user query
        all_terms: All search terms including synonyms
        require_all: If True, require ALL primary terms. If False, require ANY term.

    Returns:
        True if paper matches search criteria
    """
    # Get searchable text
    title = paper.get("title", "").lower()
    abstract = paper.get("abstract", "").lower()
    searchable_text = f" {title} {abstract} "  # Add spaces for boundary matching

    # DEBUG: Log paper being checked
    paper_doi = paper.get("doi", "unknown")
    logger.debug(f"🔍 Checking paper: {title[:60]}... (DOI: {paper_doi})")

    if not searchable_text.strip():
        logger.debug(f"  ❌ Rejected: No title/abstract")
        return False

    # For ALS specifically, need to be careful about word boundaries
    has_any_match = False
    matched_term = None
    for term in all_terms:
        # For short terms like "ALS", require word boundaries
        if len(term) <= 3:
            # Check for word boundary match
            pattern = r'\b' + re.escape(term) + r'\b'
            if re.search(pattern, searchable_text, re.IGNORECASE):
                has_any_match = True
                matched_term = term
                break
        else:
            # For longer terms, can be more lenient
            if term.lower() in searchable_text:
                has_any_match = True
                matched_term = term
                break

    if not has_any_match:
        logger.debug(f"  ❌ Rejected: No term match. Terms searched: {all_terms[:3]}...")
        return False

    logger.debug(f"  ✅ Matched on term: '{matched_term}'")

    # If we only need any match, we're done
    if not require_all:
        return True

    # For require_all, check that all primary terms are present
    # Allow for word boundaries to avoid partial matches
    for term in primary_terms:
        # Create pattern that matches the term as a whole word or part of hyphenated word
        # This handles cases like "TDP-43" or "SOD1"
        pattern = r'\b' + re.escape(term) + r'(?:\b|[-])'
        if not re.search(pattern, searchable_text, re.IGNORECASE):
            return False

    return True
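
# Illustrative behaviour of the matching rules above: the short term 'als'
# matches "ALS progression slows ..." (standalone word) but not "cellular
# signals" or "clinical trials", where 'als' only occurs inside a longer word;
# terms longer than three characters (e.g. 'tdp-43') match as plain substrings.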


@mcp.tool()
async def search_preprints(
    query: str,
    server: str = "both",
    max_results: int = 10,
    days_back: int = 365
) -> str:
    """Search bioRxiv and medRxiv for ALS preprints. Returns recent preprints before peer review.

    Args:
        query: Search query (e.g., 'ALS TDP-43')
        server: Which server to search - one of: biorxiv, medrxiv, both (default: both)
        max_results: Maximum number of results (default: 10)
        days_back: Number of days to look back (default: 365, i.e. one year)
    """
    try:
        logger.info(f"🔎 Searching bioRxiv/medRxiv for: '{query}'")
        logger.info(f"   Parameters: server={server}, max_results={max_results}, days_back={days_back}")

        # Preprocess query for better matching
        primary_terms, all_terms = preprocess_query(query)
        logger.info(f"📝 Search terms: primary={primary_terms}, all={all_terms}")

        # Calculate date range
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)

        # Format dates for API (YYYY-MM-DD)
        start_date_str = start_date.strftime("%Y-%m-%d")
        end_date_str = end_date.strftime("%Y-%m-%d")
        logger.info(f"📅 Date range: {start_date_str} to {end_date_str}")

        # bioRxiv/medRxiv API endpoint
        base_url = "https://api.biorxiv.org/details"
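        # Requests built in the loop below take the form:
        #   {base_url}/{server}/{start_date}/{end_date}/{cursor}
        # e.g. https://api.biorxiv.org/details/biorxiv/2024-01-01/2024-12-31/0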

        all_results = []
        servers_to_search = []

        if server in ["biorxiv", "both"]:
            servers_to_search.append("biorxiv")
        if server in ["medrxiv", "both"]:
            servers_to_search.append("medrxiv")

        # Use a custom HTTP client with proper timeout for bioRxiv
        # Don't use shared client as it may have conflicting timeout settings
        async with CustomHTTPClient(timeout=15.0) as client:
            for srv in servers_to_search:
                try:
                    cursor = 0
                    found_in_server = []
                    max_iterations = 1  # Only check first page (100 papers) for much faster response
                    iteration = 0

                    while iteration < max_iterations:
                        # Rate limiting
                        await rate_limiter.wait()

                        # Search by date range with cursor for pagination
                        url = f"{base_url}/{srv}/{start_date_str}/{end_date_str}/{cursor}"

                        logger.info(f"🌐 Querying {srv} API (page {iteration+1}, cursor={cursor})")
                        logger.info(f"   URL: {url}")
                        response = await client.get(url)
                        response.raise_for_status()
                        data = response.json()

                        # Extract collection
                        collection = data.get("collection", [])

                        if not collection:
                            logger.info(f"📭 No more results from {srv}")
                            break

                        logger.info(f"📦 Fetched {len(collection)} papers from API")

                        # Show first few papers for debugging
                        if iteration == 0 and collection:
                            logger.info("   Sample papers from API:")
                            for i, paper in enumerate(collection[:3]):
                                logger.info(f"   {i+1}. {paper.get('title', 'No title')[:60]}...")

                        # Filter papers using improved matching
                        # Start with lenient matching (ANY term)
                        logger.debug(f"🔍 Starting to filter {len(collection)} papers...")
                        filtered = [
                            paper for paper in collection
                            if matches_query(paper, primary_terms, all_terms, require_all=False)
                        ]

                        logger.info(f"✅ Filtered results: {len(filtered)}/{len(collection)} papers matched")

                        if len(filtered) > 0:
                            logger.info("   Matched papers:")
                            for i, paper in enumerate(filtered[:3]):
                                logger.info(f"   {i+1}. {paper.get('title', 'No title')[:60]}...")

                        found_in_server.extend(filtered)
                        logger.info(f"📊 Running total for {srv}: {len(found_in_server)} papers")

                        # Check if we have enough results
                        if len(found_in_server) >= max_results:
                            logger.info(f"Reached max_results limit ({max_results})")
                            break

                        # Early-stop heuristics; these only take effect if max_iterations is raised above 1
                        if len(found_in_server) < 5 and iteration < max_iterations - 1:
                            # Keep searching for more results
                            pass
                        elif len(found_in_server) > 0 and iteration >= 3:
                            # Found some results after reasonable search
                            logger.info(f"Found {len(found_in_server)} results after {iteration+1} pages")
                            break

                        # Check for more pages
                        messages = data.get("messages", [])

                        # The API returns "cursor" in messages for next page
                        has_more = False
                        for msg in messages:
                            if "cursor=" in str(msg):
                                try:
                                    cursor_str = str(msg).split("cursor=")[1].split()[0]
                                    next_cursor = int(cursor_str)
                                    if next_cursor > cursor:
                                        cursor = next_cursor
                                        has_more = True
                                        break
                                except (ValueError, IndexError):
                                    # Malformed cursor hint; fall back to incrementing by page size below
                                    pass

                        # Alternative: increment by collection size
                        if not has_more:
                            if len(collection) >= 100:
                                cursor += len(collection)
                            else:
                                # Less than full page means we've reached the end
                                break

                        iteration += 1

                    all_results.extend(found_in_server[:max_results])
                    logger.info(f"🏁 Total results from {srv}: {len(found_in_server)} papers found")

                except httpx.HTTPStatusError as e:
                    logger.warning(f"Error searching {srv}: {e}")
                    continue
                except Exception as e:
                    logger.warning(f"Unexpected error searching {srv}: {e}")
                    continue

        # If no results with lenient matching, provide helpful message
        if not all_results:
            logger.warning(f"⚠️ No preprints found for query: {query}")

            # Provide suggestions for improving search
            suggestions = []
            if len(primary_terms) > 3:
                suggestions.append("Try using fewer search terms")
            if not any(term in ['als', 'amyotrophic lateral sclerosis', 'motor neuron'] for term in all_terms):
                suggestions.append("Add 'ALS' or 'motor neuron disease' to your search")
            if days_back < 365:
                suggestions.append(f"Expand the time range beyond {days_back} days")

            suggestion_text = ""
            if suggestions:
                suggestion_text = "\n\nSuggestions:\n" + "\n".join(f"- {s}" for s in suggestions)

            return f"No preprints found for query: '{query}' in the last {days_back} days{suggestion_text}"

        # Sort by date (most recent first)
        all_results.sort(key=lambda x: x.get("date", ""), reverse=True)

        # Limit results
        all_results = all_results[:max_results]

        logger.info(f"🎯 FINAL RESULTS: Returning {len(all_results)} preprints for '{query}'")
        if all_results:
            logger.info("   Top results:")
            for i, paper in enumerate(all_results[:3], 1):
                logger.info(f"   {i}. {paper.get('title', 'No title')[:60]}...")
                logger.info(f"      DOI: {paper.get('doi', 'unknown')}, Date: {paper.get('date', 'unknown')}")

        # Format results
        result = f"Found {len(all_results)} preprints for query: '{query}'\n\n"

        for i, paper in enumerate(all_results, 1):
            title = paper.get("title", "No title")
            doi = paper.get("doi", "Unknown")
            date = paper.get("date", "Unknown")
            authors = paper.get("authors", "Unknown authors")
            authors_str = format_authors(authors, max_authors=3)

            abstract = paper.get("abstract", "No abstract available")
            category = paper.get("category", "")
            # The DOI prefix alone does not identify the server; use the server field reported by the API
            server_name = paper.get("server", "Unknown")

            result += f"{i}. **{title}**\n"
            result += f"   DOI: {doi} | {server_name} | Posted: {date}\n"
            result += f"   Authors: {authors_str}\n"
            if category:
                result += f"   Category: {category}\n"
            result += f"   Abstract: {truncate_text(abstract, max_chars=300, suffix='')}\n"
            result += f"   URL: https://doi.org/{doi}\n\n"

        logger.info(f"Successfully retrieved {len(all_results)} preprints")
        return result

    except httpx.TimeoutException:
        logger.error("bioRxiv/medRxiv API request timed out")
        return "Error: bioRxiv/medRxiv API request timed out. Please try again."
    except httpx.HTTPStatusError as e:
        logger.error(f"bioRxiv/medRxiv API error: {e}")
        return f"Error: bioRxiv/medRxiv API returned status code {e.response.status_code}"
    except Exception as e:
        logger.error(f"Unexpected error in search_preprints: {e}")
        return f"Error searching preprints: {str(e)}"


@mcp.tool()
async def get_preprint_details(doi: str) -> str:
    """Get full details for a specific bioRxiv/medRxiv preprint by DOI.

    Args:
        doi: The DOI of the preprint (e.g., '10.1101/2024.01.01.123456')
    """
    try:
        logger.info(f"Getting details for DOI: {doi}")

        # Ensure DOI is properly formatted
        if not doi.startswith("10.1101/"):
            doi = f"10.1101/{doi}"

        # Both bioRxiv and medRxiv DOIs use the 10.1101 prefix, so the DOI alone
        # does not identify the hosting server; query the details endpoint for
        # both servers and keep whichever one actually returns the record.

        # Use shared HTTP client for connection pooling
        client = get_http_client(timeout=30.0)

        data = None
        for server in ["biorxiv", "medrxiv"]:
            url = f"https://api.biorxiv.org/details/{server}/{doi}"
            response = await client.get(url)
            if response.status_code == 404:
                continue
            response.raise_for_status()
            candidate = response.json()
            if candidate.get("collection"):
                data = candidate
                break

        if data is None:
            return f"Preprint with DOI {doi} not found"

        collection = data.get("collection", [])
        if not collection:
            return f"No details found for DOI: {doi}"

        # Get the first (and should be only) paper
        paper = collection[0]

        title = paper.get("title", "No title")
        date = paper.get("date", "Unknown")
        authors = paper.get("authors", "Unknown authors")
        abstract = paper.get("abstract", "No abstract available")
        category = paper.get("category", "")
        server_name = paper.get("server", "Unknown")

        result = f"**{title}**\n\n"
        result += f"**DOI:** {doi}\n"
        result += f"**Server:** {server_name}\n"
        result += f"**Posted:** {date}\n"
        if category:
            result += f"**Category:** {category}\n"
        result += f"**Authors:** {authors}\n\n"
        result += f"**Abstract:**\n{abstract}\n\n"
        result += f"**Full Text URL:** https://doi.org/{doi}\n"

        return result

    except httpx.HTTPStatusError as e:
        logger.error(f"Error fetching preprint details: {e}")
        return f"Error fetching preprint details: HTTP {e.response.status_code}"
    except Exception as e:
        logger.error(f"Unexpected error getting preprint details: {e}")
        return f"Error getting preprint details: {str(e)}"


if __name__ == "__main__":
    mcp.run(transport="stdio")
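
# This server is normally launched by an MCP client over stdio; for a quick
# manual start (assuming the shared package is importable from the parent
# directory), run:
#   python biorxiv_server_fixed.py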