Spaces:

MCP-1st-Birthday
/

ALSARA

Running

App Files Files Community

ALSARA / query_classifier.py

axegameon

Upload ALSARA app files (#1)

3e435ad verified 11 days ago

raw

history blame contribute delete

7.42 kB

	#!/usr/bin/env python3
	"""
	Query Classification Module for ALS Research Agent
	Determines whether a query requires full research workflow or simple response
	"""

	import logging
	import re
	from typing import Any, Dict, List, Tuple

	# Module-level logger named after this module. Nothing in the visible code
	# logs through it yet.
	logger = logging.getLogger(__name__)


	class QueryClassifier:
	    """Classify queries as research-required or simple questions.

	    The classifier is deliberately conservative: any query that mentions a
	    research keyword is routed to the research workflow, and queries that
	    match nothing at all also default to research (with low confidence).
	    Only short inputs, greetings, chit-chat patterns and questions about the
	    agent itself are answered in simple mode.
	    """

	    # Keywords that indicate ALS research is needed
	    RESEARCH_KEYWORDS = [
	        # Disease-specific terms
	        'als', 'amyotrophic lateral sclerosis', 'motor neuron disease',
	        'mnd', 'lou gehrig', 'ftd', 'frontotemporal dementia',

	        # Medical research terms
	        'clinical trial', 'treatment', 'therapy', 'drug', 'medication',
	        'gene therapy', 'stem cell', 'biomarker', 'diagnosis',
	        'prognosis', 'survival', 'progression', 'symptom',
	        'cure', 'breakthrough', 'research', 'study', 'paper',
	        'latest', 'recent', 'new findings', 'advances',

	        # Specific ALS-related
	        'riluzole', 'edaravone', 'radicava', 'relyvrio', 'qalsody',
	        'tofersen', 'sod1', 'c9orf72', 'tdp-43', 'fus',

	        # Research actions
	        'find studies', 'search papers', 'what research',
	        'clinical evidence', 'scientific literature'
	    ]

	    # Keywords that indicate simple/general questions.
	    # classify_query does not consult this list; it is kept because it is a
	    # public class attribute that code outside this class may read.
	    SIMPLE_KEYWORDS = [
	        'hello', 'hi', 'hey', 'thanks', 'thank you',
	        'how are you', "what's your name", 'who are you',
	        'what can you do', 'help', 'test', 'testing',
	        'explain', 'define', 'what is', 'what are',
	        'how does', 'why', 'when', 'where', 'who'
	    ]

	    # Exclusion patterns for non-research queries, matched with re.match
	    # against the lower-cased, stripped query. Alternation must be a bare
	    # `|`; an escaped `\|` matches a literal pipe character and would make
	    # the pattern dead.
	    NON_RESEARCH_PATTERNS = [
	        # \b keeps "history ..." / "high ..." from matching the "hi" greeting
	        r'^(hi|hello|hey|thanks|thank you)\b',
	        r'^test\s',
	        r'^how (are you|do you)',
	        r'^what (is|are) (the|a|an)\s+\w+$',  # Simple definitions
	        r'^(explain|define)\s+\w+$',  # Simple explanations
	        r'^\w{1,3}$',  # Very short queries
	    ]

	    # Keywords this short are acronyms ('als', 'mnd', 'ftd', 'fus'). They are
	    # matched as whole words so that e.g. "also" or "confused" do not count.
	    # Trade-off: forms such as "sALS"/"fALS" no longer hit the 'als' keyword.
	    _WHOLE_WORD_MAX_LEN = 3

	    # Inputs that, taken as the entire query, are pure greetings/acknowledgments.
	    _PURE_GREETINGS = frozenset([
	        'hi', 'hello', 'hey', 'thanks', 'thank you', 'bye', 'goodbye', 'test'
	    ])

	    # Phrases that mark a question about the agent rather than about ALS.
	    _ABOUT_AGENT_PHRASES = (
	        'who are you', 'what can you do', 'how do you work',
	        'what are you', 'your capabilities'
	    )

	    @staticmethod
	    def _result(requires_research: bool, confidence: float, reason: str) -> Dict[str, Any]:
	        """Build the classification dict; suggested_mode follows requires_research."""
	        return {
	            'requires_research': requires_research,
	            'confidence': confidence,
	            'reason': reason,
	            'suggested_mode': 'research' if requires_research else 'simple'
	        }

	    @classmethod
	    def _count_keyword_hits(cls, keywords: List[str], text: str) -> int:
	        """Count how many of *keywords* occur in the lower-cased *text*.

	        Keywords of at most _WHOLE_WORD_MAX_LEN characters must appear as
	        whole words; longer keywords match as substrings so plurals and
	        compounds ("treatments", "immunotherapy") still count.
	        """
	        def present(keyword: str) -> bool:
	            if len(keyword) <= cls._WHOLE_WORD_MAX_LEN:
	                return re.search(rf'\b{re.escape(keyword)}\b', text) is not None
	            return keyword in text

	        return sum(1 for keyword in keywords if present(keyword))

	    @classmethod
	    def classify_query(cls, query: str) -> Dict[str, Any]:
	        """
	        Classify a query and determine processing strategy.

	        Args:
	            query: Raw user input. None is treated as an empty query.

	        Returns:
	            Dict with:
	            - requires_research: bool - Whether to use full research workflow
	            - confidence: float - Confidence in classification (0-1)
	            - reason: str - Explanation of classification
	            - suggested_mode: str - 'research' or 'simple'
	        """
	        text = (query or '').lower().strip()

	        # FIRST: any research keyword forces research mode. This runs before
	        # the length check and the exclusion patterns so that short but
	        # meaningful inputs ("ALS", "SOD1") and keyword-bearing questions
	        # ("What is the treatment") are never dismissed as chit-chat.
	        research_score = cls._count_keyword_hits(cls.RESEARCH_KEYWORDS, text)
	        if research_score >= 1:
	            return cls._result(
	                True,
	                min(0.95, 0.7 + research_score * 0.1),
	                f'Contains research-related terms ({research_score} keywords)'
	            )

	        # From here on the query contains no research keyword.

	        # Very short or empty queries
	        if len(text) < 5:
	            return cls._result(False, 0.9, 'Query too short for research')

	        # Query is nothing but a greeting/acknowledgment
	        if text in cls._PURE_GREETINGS:
	            return cls._result(False, 0.95, 'Pure greeting or acknowledgment')

	        # Chit-chat / trivial-definition patterns
	        if any(re.match(pattern, text) for pattern in cls.NON_RESEARCH_PATTERNS):
	            return cls._result(False, 0.85, 'Matches non-research pattern')

	        # Questions about the agent itself
	        if any(phrase in text for phrase in cls._ABOUT_AGENT_PHRASES):
	            return cls._result(False, 0.85, 'Question about the agent itself')

	        # DEFAULT: For an ALS research agent, when in doubt, use research mode.
	        # The 0.6 confidence is below the should_use_tools threshold, so this
	        # fallback does not by itself trigger the research tools.
	        return cls._result(
	            True, 0.6, 'Default to research mode for potential medical queries'
	        )

	    @classmethod
	    def should_use_tools(cls, query: str) -> bool:
	        """Return True when the query is classified as research with confidence above 0.65."""
	        classification = cls.classify_query(query)
	        return classification['requires_research'] and classification['confidence'] > 0.65

	    @classmethod
	    def get_processing_hint(cls, classification: Dict[str, Any]) -> str:
	        """Return a short user-facing status line for a classify_query result."""
	        if classification['requires_research']:
	            return "🔬 Using full research workflow ..."
	        return "💬 Providing direct response without research tools"


	def test_classifier():
	    """Print the classifier's verdict for a fixed list of sample queries.

	    This is a manual smoke check: it prints results and asserts nothing.
	    """
	    # Samples the author expects to require research
	    research_samples = [
	        "What are the latest gene therapy trials for ALS?",
	        "Compare riluzole and edaravone effectiveness",
	        "Find recent studies on SOD1 mutations",
	        "What breakthroughs in ALS treatment happened in 2024?",
	        "Are there any promising stem cell therapies for motor neuron disease?",
	    ]
	    # Samples the author expects NOT to require research
	    simple_samples = [
	        "Hello, how are you?",
	        "What is your name?",
	        "Test",
	        "Thanks for your help",
	        "Explain what a database is",
	        "What time is it?",
	        "How do I use this app?",
	    ]

	    print("Query Classification Test Results")
	    print("=" * 60)

	    for sample in research_samples + simple_samples:
	        verdict = QueryClassifier.classify_query(sample)
	        report_lines = [
	            f"\nQuery: {sample[:50]}...",
	            f"Requires Research: {verdict['requires_research']}",
	            f"Confidence: {verdict['confidence']:.2f}",
	            f"Reason: {verdict['reason']}",
	            f"Mode: {verdict['suggested_mode']}",
	            "-" * 40,
	        ]
	        print("\n".join(report_lines))


	# Run the sample-query printout only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	test_classifier()