Spaces:
Running
Running
| """Query preprocessing utilities for biomedical search.""" | |
| import re | |
# Lowercase tokens dropped from natural-language questions before searching.
# strip_question_words() compares lowercased query words against this set,
# so every entry must itself be lowercase.
QUESTION_WORDS: set[str] = {
    # Interrogatives that open a question
    *"what which how why when where who whom".split(),
    # Auxiliary and modal verbs used to phrase questions
    *"is are was were do does did".split(),
    *"can could would should will shall may might".split(),
    # Filler verbs and adjectives common in conversational questions
    *"show promise help believe think suggest".split(),
    *"possible potential effective useful good".split(),
    # Articles
    *"the a an".split(),
}
# Medical synonym expansions (sexual health focus).
# Maps a search term to the alternative phrasings used when expanding a query.
# Keys are lowercase because expand_synonyms() looks them up in the
# lowercased query text.
SYNONYMS: dict[str, list[str]] = {
    "erectile dysfunction": ["ED", "impotence", "sexual dysfunction"],
    "low libido": [
        "hypoactive sexual desire disorder",
        "HSDD",
        "low sexual desire",
        "loss of libido",
    ],
    "menopause": ["postmenopausal", "climacteric", "perimenopause"],
    "testosterone": ["androgen", "testosterone therapy", "TRT"],
    "premature ejaculation": ["PE", "rapid ejaculation", "early ejaculation"],
    "pcos": ["polycystic ovary syndrome", "Stein-Leventhal syndrome"],
}
def strip_question_words(query: str) -> str:
    """
    Remove question words and filler terms from query.

    The query is lowercased and split on whitespace. A word is dropped when
    it appears in ``QUESTION_WORDS`` once sentence punctuation clinging to
    its edges is ignored, so ``"what,"`` and ``"help."`` are removed just
    like ``"what"`` and ``"help"``. Words that are kept retain their
    punctuation.

    Args:
        query: Raw query string

    Returns:
        Lowercased query with question words removed and the remaining
        words joined by single spaces
    """
    # Only sentence punctuation is ignored for the comparison. Quotes and
    # brackets are left alone on purpose: dropping a token such as '(the'
    # or '"the' would leave its partner unbalanced in a grouped/quoted query.
    edge_punctuation = ".,;:!?"
    kept = [
        word
        for word in query.lower().split()
        if word.strip(edge_punctuation) not in QUESTION_WORDS
    ]
    return " ".join(kept)
def expand_synonyms(query: str) -> str:
    """
    Expand medical terms to include synonyms.

    The query is lowercased, then every whole-word occurrence of a
    ``SYNONYMS`` key is replaced by a parenthesised OR group containing the
    matched term followed by its synonyms, each wrapped in double quotes.
    Terms are matched on word boundaries, so a key embedded in a longer
    word (e.g. "menopause" inside "perimenopause") is left untouched.

    Args:
        query: Search query (e.g., "testosterone libido")

    Returns:
        Lowercased query with synonym expansions in OR groups, e.g.
        '("testosterone" OR "androgen" OR "testosterone therapy" OR "TRT") libido'
    """
    lowered = query.lower()

    # Index by lowercased key because the text being searched is lowercase.
    # Empty keys are skipped: they would match at every position.
    expansions_by_term = {
        term.lower(): expansions for term, expansions in SYNONYMS.items() if term
    }
    if not expansions_by_term:
        return lowered

    # Longest alternative first so a multi-word term is preferred over a
    # shorter term starting at the same position.
    alternatives = sorted(expansions_by_term, key=len, reverse=True)
    # (?<!\w) / (?!\w) require the match not to be glued to other word
    # characters, which prevents rewriting the middle of a longer word.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(map(re.escape, alternatives)) + r")(?!\w)"
    )

    def _or_group(match: "re.Match[str]") -> str:
        """Build the quoted OR group for one matched term."""
        term = match.group(0)
        # Keep the term the user typed so the expansion widens the search
        # instead of swapping the term for its synonyms.
        variants = [term]
        seen = {term}
        for synonym in expansions_by_term[term]:
            folded = synonym.lower()
            if folded not in seen:
                seen.add(folded)
                variants.append(synonym)
        quoted = " OR ".join(f'"{variant}"' for variant in variants)
        return f"({quoted})"

    # A single substitution pass: text inserted by one expansion is never
    # rescanned and expanded again by another term.
    return pattern.sub(_or_group, lowered)
def preprocess_query(raw_query: str) -> str:
    """
    Run the complete query clean-up pipeline used for PubMed searches.

    Steps, in order:
        1. Drop question marks, trim the ends and collapse whitespace runs
        2. Remove question and filler words (``strip_question_words``)
        3. Expand known medical terms (``expand_synonyms``)

    Args:
        raw_query: Natural language query from user

    Returns:
        The processed query, or an empty string when the input is empty
        or contains only whitespace
    """
    # Guard clause: nothing to process for empty / whitespace-only input.
    if not (raw_query and raw_query.strip()):
        return ""

    without_marks = raw_query.replace("?", "").strip()
    normalized = re.sub(r"\s+", " ", without_marks)

    keywords = strip_question_words(normalized)
    expanded = expand_synonyms(keywords)
    return expanded.strip()