File size: 3,386 Bytes
e502f0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa696e8
e502f0d
fa696e8
 
 
 
e502f0d
fa696e8
 
 
 
 
e502f0d
fa696e8
 
 
 
e502f0d
fa696e8
 
 
 
e502f0d
fa696e8
 
 
 
e502f0d
fa696e8
 
 
e502f0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa696e8
e502f0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""Query preprocessing utilities for biomedical search."""

import re

# Lowercase tokens that carry no search value in a natural-language question.
# The groups below are only for readability; membership is flat, so every
# entry is treated the same way by whatever consults this set.
QUESTION_WORDS: set[str] = (
    # Interrogatives that open a question
    {"what", "which", "how", "why", "when", "where", "who", "whom"}
    # Auxiliary and modal verbs
    | {"is", "are", "was", "were", "do", "does", "did"}
    | {"can", "could", "would", "should", "will", "shall", "may", "might"}
    # Conversational filler
    | {"show", "promise", "help", "believe", "think", "suggest"}
    | {"possible", "potential", "effective", "useful", "good"}
    # Articles
    | {"the", "a", "an"}
)

# Synonym table for sexual-health search terms.
# Key: lowercase term to look for in a query.
# Value: alternative names/abbreviations for that term, in the order they
# should appear when rendered.
SYNONYMS: dict[str, list[str]] = {
    "erectile dysfunction": ["ED", "impotence", "sexual dysfunction"],
    "low libido": [
        "hypoactive sexual desire disorder",
        "HSDD",
        "low sexual desire",
        "loss of libido",
    ],
    "menopause": ["postmenopausal", "climacteric", "perimenopause"],
    "testosterone": ["androgen", "testosterone therapy", "TRT"],
    "premature ejaculation": ["PE", "rapid ejaculation", "early ejaculation"],
    "pcos": ["polycystic ovary syndrome", "Stein-Leventhal syndrome"],
}


def strip_question_words(
    query: str,
    question_words: "set[str] | None" = None,
) -> str:
    """
    Remove question words and filler terms from query.

    The query is lowercased and split on whitespace. A token is dropped when
    it is a question word once sentence punctuation attached to either end
    (``. , ; : ! ?``) is ignored, so "effective," is filtered just like
    "effective". Tokens that are kept are returned unchanged, including any
    punctuation they carry.

    Args:
        query: Raw query string
        question_words: Optional set of lowercase words to remove instead of
            the module-level QUESTION_WORDS

    Returns:
        Lowercased query with question words removed and the remaining
        tokens joined by single spaces ("" if nothing remains)
    """
    blocked = QUESTION_WORDS if question_words is None else question_words
    # Only sentence punctuation is ignored; brackets, quotes and hyphens are
    # left alone because they can be meaningful in a search expression.
    edge_punctuation = ".,;:!?"

    kept = [
        token
        for token in query.lower().split()
        if token.strip(edge_punctuation) not in blocked
    ]
    return " ".join(kept)


def expand_synonyms(
    query: str,
    synonyms: "dict[str, list[str]] | None" = None,
) -> str:
    """
    Expand medical terms to include synonyms.

    The query is lowercased, then every whole-word occurrence of a known term
    is replaced by a parenthesised OR group holding the term itself followed
    by its synonyms, each in double quotes, e.g.
    ``pcos`` -> ``("pcos" OR "polycystic ovary syndrome" OR ...)``.

    Matching is done in a single pass, so text inserted by one expansion is
    never expanded again, and a term is not matched inside a longer word
    ("menopause" does not match within "perimenopause").

    Args:
        query: Search query (e.g., "testosterone libido")
        synonyms: Optional mapping of term -> list of synonyms to use instead
            of the module-level SYNONYMS table

    Returns:
        Lowercased query with synonym expansions in OR groups
    """
    table = SYNONYMS if synonyms is None else synonyms
    lowered = query.lower()

    # Normalise keys (case and inner whitespace) so they compare equal to
    # what is matched in the lowercased query.
    groups = {
        " ".join(term.lower().split()): expansions
        for term, expansions in table.items()
    }
    groups.pop("", None)
    if not groups:
        # An empty alternation would match the empty string everywhere.
        return lowered

    # Longest term first so a multi-word term wins over a shorter term that
    # starts at the same position. Words of a term may be separated by any
    # run of whitespace.
    alternatives = "|".join(
        r"\s+".join(re.escape(word) for word in term.split())
        for term in sorted(groups, key=len, reverse=True)
    )
    # Lookarounds instead of \b so terms that start or end with a non-word
    # character still get a proper boundary check.
    pattern = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")

    def _or_group(match: re.Match[str]) -> str:
        term = " ".join(match.group(0).split())
        # Keep the original term so the expanded query still searches for it;
        # skip synonyms that only repeat an entry in a different case.
        variants = [term]
        seen = {term}
        for expansion in groups[term]:
            folded = expansion.lower()
            if folded not in seen:
                seen.add(folded)
                variants.append(expansion)
        return "(" + " OR ".join(f'"{variant}"' for variant in variants) + ")"

    return pattern.sub(_or_group, lowered)


def preprocess_query(raw_query: str) -> str:
    """
    Full preprocessing pipeline for PubMed queries.

    Pipeline:
    1. Replace question marks with spaces and collapse whitespace
    2. Remove question words (strip_question_words)
    3. Expand medical synonyms (expand_synonyms)

    Args:
        raw_query: Natural language query from user

    Returns:
        Optimized query for PubMed, or "" when the input is empty or
        whitespace-only
    """
    if not raw_query or not raw_query.strip():
        return ""

    # Swap "?" for a space instead of deleting it, so words on either side of
    # a question mark ("safe?what dose") are not fused into one token.
    query = raw_query.replace("?", " ")
    query = re.sub(r"\s+", " ", query).strip()

    # Strip question words
    query = strip_question_words(query)

    # Expand synonyms
    query = expand_synonyms(query)

    return query.strip()