# sentinel/tests/test_risk_models/test_plcom2012_model.py
# jeuko's picture
# Sync from GitHub (main)
# 8018595 verified
"""Tests for the PLCOm2012 Lung Cancer Risk Model.
Ground truth values are calculated from authors' reference implementation in
https://brocku.ca/lung-cancer-screening-and-risk-prediction/risk-calculators/
and the reference implementation in R: https://github.com/resplab/PLCOm2012.
"""
import pytest
from sentinel.risk_models.plcom2012 import PLCOm2012RiskModel
from sentinel.user_input import (
Anthropometrics,
CancerType,
ChronicCondition,
Demographics,
Ethnicity,
FamilyMemberCancer,
FamilyRelation,
FamilySide,
Lifestyle,
PersonalMedicalHistory,
RelationshipDegree,
Sex,
SmokingHistory,
SmokingStatus,
UserInput,
)
# Reference cases for the parametrized ground-truth test. Each entry carries a
# readable "name" (used as the pytest id), a fully inline "input" UserInput
# (same layout as the Gail tests), and the "expected" score in percent.
#
# Every case uses a height of 175 cm and writes the weight as
# ``<bmi> * (1.75**2)`` so that weight / height**2 equals the stated BMI.
GROUND_TRUTH_CASES = [
    {
        "name": "low_risk_current_smoker",
        "input": UserInput(
            demographics=Demographics(
                age_years=55,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(
                    height_cm=175.0,
                    weight_kg=25.0 * (1.75**2),  # BMI 25.0
                ),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=10,
                    years_smoked=20,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[],
                previous_cancers=[],
            ),
            family_history=[],
        ),
        "expected": 0.31,
    },
    {
        "name": "moderate_risk_former_smoker",
        "input": UserInput(
            demographics=Demographics(
                age_years=62,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(
                    height_cm=175.0,
                    weight_kg=27.0 * (1.75**2),  # BMI 27.0
                ),
                education_level=3,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.FORMER,
                    cigarettes_per_day=20,
                    years_smoked=30,
                    years_since_quit=5,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[],
                previous_cancers=[],
            ),
            family_history=[],
        ),
        "expected": 1.24,
    },
    {
        # Combines COPD, a previous cancer and a first-degree relative with
        # lung cancer on top of a heavy current smoking history.
        "name": "high_risk_multiple_factors",
        "input": UserInput(
            demographics=Demographics(
                age_years=70,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(
                    height_cm=175.0,
                    weight_kg=22.0 * (1.75**2),  # BMI 22.0
                ),
                education_level=2,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=40,
                    years_smoked=45,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[ChronicCondition.COPD],
                previous_cancers=[CancerType.BREAST],
            ),
            family_history=[
                FamilyMemberCancer(
                    relation=FamilyRelation.MOTHER,
                    side=FamilySide.MATERNAL,
                    degree=RelationshipDegree.FIRST,
                    cancer_type=CancerType.LUNG,
                    age_at_diagnosis=65,
                )
            ],
        ),
        "expected": 31.19,
    },
    {
        "name": "black_race_variant",
        "input": UserInput(
            demographics=Demographics(
                age_years=58,
                sex=Sex.MALE,
                ethnicity=Ethnicity.BLACK,
                anthropometrics=Anthropometrics(
                    height_cm=175.0,
                    weight_kg=28.0 * (1.75**2),  # BMI 28.0
                ),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.FORMER,
                    cigarettes_per_day=15,
                    years_smoked=25,
                    years_since_quit=8,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[],
                previous_cancers=[],
            ),
            family_history=[],
        ),
        "expected": 0.696,
    },
    {
        "name": "hispanic_low_education",
        "input": UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.HISPANIC,
                anthropometrics=Anthropometrics(
                    height_cm=175.0,
                    weight_kg=30.0 * (1.75**2),  # BMI 30.0
                ),
                education_level=1,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=25,
                    years_smoked=35,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[],
                previous_cancers=[],
            ),
            family_history=[],
        ),
        "expected": 1.161,
    },
    {
        "name": "asian_former_heavy_smoker",
        "input": UserInput(
            demographics=Demographics(
                age_years=65,
                sex=Sex.MALE,
                ethnicity=Ethnicity.ASIAN,
                anthropometrics=Anthropometrics(
                    height_cm=175.0,
                    weight_kg=24.0 * (1.75**2),  # BMI 24.0
                ),
                education_level=5,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.FORMER,
                    cigarettes_per_day=25,
                    years_smoked=35,
                    years_since_quit=3,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[ChronicCondition.COPD],
                previous_cancers=[],
            ),
            family_history=[
                FamilyMemberCancer(
                    relation=FamilyRelation.MOTHER,
                    side=FamilySide.MATERNAL,
                    degree=RelationshipDegree.FIRST,
                    cancer_type=CancerType.LUNG,
                    age_at_diagnosis=65,
                )
            ],
        ),
        "expected": 3.40,
    },
]
class TestPLCOm2012Model:
    """Test suite for PLCOm2012RiskModel.

    The tests fall into four groups: ground-truth regression values,
    end-to-end scoring through ``UserInput``, rejected or unusable inputs,
    and direct calls to ``calculate_risk`` with raw keyword arguments.
    """

    def setup_method(self):
        """Create a fresh PLCOm2012RiskModel before every test."""
        self.model = PLCOm2012RiskModel()

    @pytest.mark.parametrize("case", GROUND_TRUTH_CASES, ids=lambda x: x["name"])
    def test_ground_truth_validation(self, case):
        """Compare computed scores against the pre-calculated reference values.

        Args:
            case: Parameterized ground truth case dict with "name", "input"
                and "expected" entries.
        """
        score_str = self.model.compute_score(case["input"])
        # The score is a string with a trailing "%"; strip it before parsing.
        calculated = float(score_str.rstrip("%"))
        # Tight absolute tolerance because the expected values are calculated.
        assert calculated == pytest.approx(case["expected"], abs=0.01)

    def test_user_input_integration_current_smoker(self):
        """Score a current smoker built through the UserInput model."""
        user = UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(
                    height_cm=175.0,
                    weight_kg=80.0,
                ),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=20,
                    years_smoked=25,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[],
                previous_cancers=[],
            ),
            family_history=[],
        )

        score = self.model.compute_score(user)

        # A smoker must get a positive percentage, not the never-smoker message.
        assert score != "N/A: Model is for current or former smokers only."
        assert "%" in score
        assert float(score.replace("%", "")) > 0

    def test_user_input_integration_former_smoker(self):
        """Score a former smoker with COPD, a prior cancer and family history."""
        user = UserInput(
            demographics=Demographics(
                age_years=65,
                sex=Sex.FEMALE,
                ethnicity=Ethnicity.BLACK,
                anthropometrics=Anthropometrics(
                    height_cm=160.0,
                    weight_kg=70.0,
                ),
                education_level=3,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.FORMER,
                    cigarettes_per_day=15,
                    years_smoked=30,
                    years_since_quit=10,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[ChronicCondition.COPD],
                previous_cancers=[CancerType.BREAST],
            ),
            family_history=[
                FamilyMemberCancer(
                    relation=FamilyRelation.FATHER,
                    side=FamilySide.PATERNAL,
                    degree=RelationshipDegree.FIRST,
                    cancer_type=CancerType.LUNG,
                    age_at_diagnosis=68,
                )
            ],
        )

        score = self.model.compute_score(user)

        assert score != "N/A: Model is for current or former smokers only."
        assert "%" in score
        assert float(score.replace("%", "")) > 0

    def test_never_smoker_handling(self):
        """Never smokers receive the fixed N/A message instead of a score."""
        never_smoker = UserInput(
            demographics=Demographics(
                age_years=55,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.NEVER,
                    cigarettes_per_day=0,
                    years_smoked=0,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(),
            family_history=[],
        )

        score = self.model.compute_score(never_smoker)

        assert score == "N/A: Model is for current or former smokers only."

    def test_validation_errors(self):
        """A complete current-smoker input passes validation and is scored.

        Despite the test name, no error is expected here: every field the
        test supplies is present, so the model should return a percentage.
        """
        user = UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=20,
                    years_smoked=25,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(),
            family_history=[],
        )

        score = self.model.compute_score(user)

        assert "%" in score

    def test_age_out_of_range(self):
        """An age below the accepted range raises ValueError."""
        user = UserInput(
            demographics=Demographics(
                age_years=45,  # Below minimum
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=20,
                    years_smoked=25,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(),
            family_history=[],
        )

        with pytest.raises(ValueError, match=r"Invalid inputs for PLCOm2012:"):
            self.model.compute_score(user)

    def test_age_validation_legacy(self):
        """Ages just outside the 50-80 range raise ValueError.

        Kept from the legacy behavior; the check is now performed by input
        validation, hence the expected exception.
        """
        # 49 sits one year below the range, 81 one year above it.
        for age_years in (49, 81):
            user = UserInput(
                demographics=Demographics(
                    age_years=age_years,
                    sex=Sex.MALE,
                    ethnicity=Ethnicity.WHITE,
                    anthropometrics=Anthropometrics(
                        height_cm=175.0, weight_kg=75.0
                    ),
                    education_level=4,
                ),
                lifestyle=Lifestyle(
                    smoking=SmokingHistory(
                        status=SmokingStatus.CURRENT,
                        cigarettes_per_day=20,
                        years_smoked=25,
                        years_since_quit=None,
                    ),
                ),
                personal_medical_history=PersonalMedicalHistory(),
                family_history=[],
            )
            with pytest.raises(ValueError, match=r"Invalid inputs for PLCOm2012:"):
                self.model.compute_score(user)

    def test_missing_bmi_data(self):
        """Placeholder for missing BMI data; reported as skipped.

        Per the original author's note, anthropometrics is required, so a
        UserInput without it cannot be built (Pydantic validation rejects
        it). The test therefore has nothing to exercise and is skipped
        explicitly instead of silently passing.
        """
        pytest.skip(
            "Missing BMI data cannot be constructed: anthropometrics is "
            "required by UserInput validation."
        )

    def test_missing_education_level(self):
        """Omitting education_level raises ValueError."""
        user = UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                # education_level deliberately left out
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=20,
                    years_smoked=25,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(),
            family_history=[],
        )

        with pytest.raises(ValueError, match=r"Invalid inputs for PLCOm2012:"):
            self.model.compute_score(user)

    def test_missing_smoking_intensity(self):
        """Zero cigarettes per day yields a "Calculation failed" message.

        NOTE(review): the original author attributes this to a division by
        zero inside the model's calculation - confirm against the model.
        """
        user = UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=0,  # the value under test
                    years_smoked=25,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(),
            family_history=[],
        )

        # No exception is expected: the failure is reported in the score text.
        score = self.model.compute_score(user)

        assert "Calculation failed" in score

    def test_missing_smoking_duration(self):
        """Zero years smoked is accepted and still produces a percentage."""
        user = UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=20,
                    years_smoked=0,  # valid input, not treated as missing
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(),
            family_history=[],
        )

        score = self.model.compute_score(user)

        assert "%" in score

    def test_missing_quit_years_former_smoker(self):
        """A former smoker without years_since_quit gets an explanatory message."""
        user = UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.FORMER,
                    cigarettes_per_day=20,
                    years_smoked=25,
                    years_since_quit=None,  # the missing value under test
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(),
            family_history=[],
        )

        score = self.model.compute_score(user)

        assert "Missing years since quitting for former smoker" in score

    def test_copd_detection(self):
        """An input listing COPD among several chronic conditions is scored.

        Only the presence of a percentage is asserted; the size of the COPD
        effect is not checked here.
        """
        user = UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=20,
                    years_smoked=25,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(
                chronic_conditions=[ChronicCondition.COPD, ChronicCondition.DIABETES],
            ),
            family_history=[],
        )

        score = self.model.compute_score(user)

        assert "%" in score

    def test_family_history_lung_cancer_detection(self):
        """An input with first- and second-degree lung cancer relatives is scored.

        Only the presence of a percentage is asserted.
        """
        user = UserInput(
            demographics=Demographics(
                age_years=60,
                sex=Sex.MALE,
                ethnicity=Ethnicity.WHITE,
                anthropometrics=Anthropometrics(height_cm=175.0, weight_kg=80.0),
                education_level=4,
            ),
            lifestyle=Lifestyle(
                smoking=SmokingHistory(
                    status=SmokingStatus.CURRENT,
                    cigarettes_per_day=20,
                    years_smoked=25,
                    years_since_quit=None,
                ),
            ),
            personal_medical_history=PersonalMedicalHistory(),
            family_history=[
                FamilyMemberCancer(
                    relation=FamilyRelation.MOTHER,
                    side=FamilySide.MATERNAL,
                    degree=RelationshipDegree.FIRST,
                    cancer_type=CancerType.LUNG,
                    age_at_diagnosis=65,
                ),
                FamilyMemberCancer(
                    relation=FamilyRelation.MATERNAL_UNCLE,
                    side=FamilySide.MATERNAL,
                    degree=RelationshipDegree.SECOND,
                    cancer_type=CancerType.LUNG,
                    age_at_diagnosis=70,
                ),  # Should not count (not first-degree relative)
            ],
        )

        score = self.model.compute_score(user)

        assert "%" in score

    def test_race_handling(self):
        """Each listed ethnicity produces a positive percentage score."""
        races = [
            Ethnicity.WHITE,
            Ethnicity.BLACK,
            Ethnicity.HISPANIC,
            Ethnicity.ASIAN,
            Ethnicity.PACIFIC_ISLANDER,
        ]
        for race in races:
            user = UserInput(
                demographics=Demographics(
                    age_years=60,
                    sex=Sex.MALE,
                    ethnicity=race,
                    anthropometrics=Anthropometrics(
                        height_cm=175.0, weight_kg=80.0
                    ),
                    education_level=4,
                ),
                lifestyle=Lifestyle(
                    smoking=SmokingHistory(
                        status=SmokingStatus.CURRENT,
                        cigarettes_per_day=20,
                        years_smoked=25,
                        years_since_quit=None,
                    ),
                ),
                personal_medical_history=PersonalMedicalHistory(),
                family_history=[],
            )

            score = self.model.compute_score(user)

            # Name the ethnicity in the message so a failure is attributable.
            assert "%" in score, f"no percentage for {race!r}: {score!r}"
            assert float(score.replace("%", "")) > 0, (
                f"non-positive score for {race!r}: {score!r}"
            )

    def test_model_metadata(self):
        """Name, cancer type, description, interpretation and references."""
        assert self.model.name == "plcom2012"
        assert self.model.cancer_type() == "lung"

        description = self.model.description()
        assert "PLCOm2012" in description
        assert "6-year" in description

        assert "percentage chance" in self.model.interpretation()

        references = self.model.references()
        assert isinstance(references, list)
        assert len(references) > 0
        assert "Tammemägi" in references[0]

    def test_smoking_status_encoding(self):
        """calculate_risk accepts both smoking status codes used by the tests.

        The test passes ``smoking_status=0`` for a current smoker and ``1``
        for a former smoker. Only positivity of both results is asserted;
        the relative size of the two risks is not checked.
        """
        current_input = {
            "age": 60,
            "race": "white",
            "education": 4,
            "bmi": 25.0,
            "copd": 0,
            "cancer_hist": 0,
            "family_hist_lung_cancer": 0,
            "smoking_status": 0,
            "smoking_intensity": 20,
            "duration_smoking": 25,
            "smoking_quit_time": 0,
        }
        # Same person as a former smoker: only status and quit time differ.
        former_input = {**current_input, "smoking_status": 1, "smoking_quit_time": 5}

        current_risk = self.model.calculate_risk(**current_input)
        former_risk = self.model.calculate_risk(**former_input)

        assert current_risk > 0
        assert former_risk > 0

    def test_smoking_intensity_transformation(self):
        """calculate_risk is positive across a range of smoking intensities.

        The name refers to the (intensity/10)^-1 transformation; this test
        only asserts that each resulting risk is positive.
        """
        intensities = [10, 20, 30, 40]
        # Compute every risk first, then check them, as the original did.
        risks = [
            self.model.calculate_risk(
                age=60,
                race="white",
                education=4,
                bmi=25.0,
                copd=0,
                cancer_hist=0,
                family_hist_lung_cancer=0,
                smoking_status=0,
                smoking_intensity=intensity,
                duration_smoking=25,
                smoking_quit_time=0,
            )
            for intensity in intensities
        ]

        for intensity, risk in zip(intensities, risks):
            assert risk > 0, f"non-positive risk at intensity {intensity}: {risk!r}"