Spaces:
Running
Running
File size: 3,386 Bytes
e502f0d fa696e8 e502f0d fa696e8 e502f0d fa696e8 e502f0d fa696e8 e502f0d fa696e8 e502f0d fa696e8 e502f0d fa696e8 e502f0d fa696e8 e502f0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
"""Query preprocessing utilities for biomedical search."""
import re
# Tokens dropped from natural-language questions before searching.
# strip_question_words compares lowercased tokens against this set, so every
# entry must be a single lowercase word.
QUESTION_WORDS: set[str] = {
    # Interrogatives
    "what", "which", "how", "why", "when", "where", "who", "whom",
    # Auxiliary and modal verbs that open questions
    "is", "are", "was", "were", "do", "does", "did",
    "can", "could", "would", "should", "will", "shall", "may", "might",
    # Conversational filler words
    "show", "promise", "help", "believe", "think", "suggest",
    "possible", "potential", "effective", "useful", "good",
    # Articles
    "the", "a", "an",
}
# Sexual-health search terms mapped to the alternative names used to broaden
# a query. Keys are lowercase because expand_synonyms looks them up in a
# lowercased query; values keep their conventional casing (e.g. acronyms).
SYNONYMS: dict[str, list[str]] = {
    "erectile dysfunction": ["ED", "impotence", "sexual dysfunction"],
    "low libido": [
        "hypoactive sexual desire disorder", "HSDD",
        "low sexual desire", "loss of libido",
    ],
    "menopause": ["postmenopausal", "climacteric", "perimenopause"],
    "testosterone": ["androgen", "testosterone therapy", "TRT"],
    "premature ejaculation": ["PE", "rapid ejaculation", "early ejaculation"],
    "pcos": ["polycystic ovary syndrome", "Stein-Leventhal syndrome"],
}
def strip_question_words(query: str) -> str:
    """
    Remove question words and filler terms from query.

    The query is lowercased and split on whitespace, and the surviving
    tokens are re-joined with single spaces. A token is also dropped when it
    is a question word with sentence punctuation attached (e.g. "what," or
    "why?"), which an exact set lookup would miss. Punctuation on tokens
    that are kept is left untouched.

    Args:
        query: Raw query string

    Returns:
        Lowercased query with question words removed
    """
    # Only sentence punctuation is trimmed for the comparison; quotes and
    # parentheses are left alone so deliberately quoted words are preserved.
    edge_punctuation = ".,;:!?"
    kept = [
        token
        for token in query.lower().split()
        if token.strip(edge_punctuation) not in QUESTION_WORDS
    ]
    return " ".join(kept)
def expand_synonyms(query: str) -> str:
    """
    Expand medical terms to include synonyms.

    Each known term found in the lowercased query is replaced by a
    parenthesised OR group holding the term itself followed by its synonyms,
    e.g. ``pcos`` becomes
    ``("pcos" OR "polycystic ovary syndrome" OR "Stein-Leventhal syndrome")``.

    Matching is a single pass over whole words, so:
    - a term embedded in a longer word (``perimenopause``) is left alone;
    - text inserted by one expansion is never re-expanded by another.

    Args:
        query: Search query (e.g., "testosterone libido")

    Returns:
        Lowercased query with synonym expansions in OR groups
    """
    result = query.lower()

    # Longest terms first so a longer term wins over a shorter one it contains.
    terms = sorted((term for term in SYNONYMS if term), key=len, reverse=True)
    if not terms:
        return result

    def or_group(term: str) -> str:
        # Keep the matched term first so the expansion only widens the
        # search; skip case-insensitive duplicates among the synonyms.
        seen: set[str] = set()
        phrases: list[str] = []
        for phrase in (term, *SYNONYMS[term]):
            key = phrase.lower()
            if key not in seen:
                seen.add(key)
                phrases.append(f'"{phrase}"')
        return "(" + " OR ".join(phrases) + ")"

    # The lookarounds require a non-word character (or string edge) on both
    # sides, giving whole-word matching even for multi-word terms.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(term) for term in terms) + r")(?!\w)"
    )
    return pattern.sub(lambda match: or_group(match.group(0)), result)
def preprocess_query(raw_query: str) -> str:
    """
    Turn a natural-language question into a PubMed search string.

    Steps, in order:
    1. Remove every "?" and trim/collapse whitespace
    2. Remove question words (strip_question_words)
    3. Expand medical synonyms (expand_synonyms)

    Args:
        raw_query: Natural language query from user

    Returns:
        The processed query, or "" when the input is empty or only whitespace
    """
    # Guard: nothing to process for falsy or whitespace-only input.
    if not (raw_query and raw_query.strip()):
        return ""

    without_marks = raw_query.replace("?", "").strip()
    collapsed = re.sub(r"\s+", " ", without_marks)

    return expand_synonyms(strip_question_words(collapsed)).strip()
|