VibecoderMcSwaggins committed on
Commit
0bfc819
·
unverified ·
1 Parent(s): c8d9b90

test: Add circuit breaker unit tests for P0 judge bug (#63)

Browse files

- Verifies that HFInferenceJudgeHandler correctly triggers 'synthesize' after 3 consecutive failures
- Verifies failure counter resets on success or new question
- Validates the P0 fix in src/agent_factory/judges.py

tests/unit/agent_factory/test_judge_circuit_breaker.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Unit tests for HFInferenceJudgeHandler Circuit Breaker."""

from unittest.mock import MagicMock, patch

import pytest

from src.agent_factory.judges import HFInferenceJudgeHandler
from src.utils.models import Citation, Evidence


@pytest.mark.unit
class TestJudgeCircuitBreaker:
    """Exercise the consecutive-failure circuit breaker of the HF judge handler."""

    @pytest.fixture
    def handler(self):
        """Build a handler while ``InferenceClient`` is patched out."""
        with patch("src.agent_factory.judges.InferenceClient"):
            return HFInferenceJudgeHandler()

    @pytest.mark.asyncio
    async def test_circuit_breaker_triggers_after_max_failures(self, handler):
        """Three failed assessments in a row make the fourth return 'synthesize'."""
        evidence = [
            Evidence(
                content="test",
                citation=Citation(source="pubmed", title="t", url="u", date="2025"),
            )
        ]

        # Every model call raises for the duration of this context.
        with patch.object(handler, "_call_with_retry", side_effect=Exception("Model failed")):
            # Calls 1-3: each one fails, still recommends "continue", and
            # bumps the consecutive-failure counter by one.
            for expected_failures in (1, 2, 3):
                outcome = await handler.assess("test", evidence)
                assert outcome.recommendation == "continue"
                assert handler.consecutive_failures == expected_failures

            # Call 4: the counter has reached 3, so the breaker is expected
            # to trip on this call.
            tripped = await handler.assess("test", evidence)

            assert tripped.recommendation == "synthesize"
            assert tripped.sufficient is True
            # The reasoning text should mention the failure or unavailability.
            lowered = tripped.reasoning.lower()
            assert any(word in lowered for word in ("failed", "unavailable"))

    @pytest.mark.asyncio
    async def test_circuit_breaker_resets_on_success(self, handler):
        """A successful assessment clears the consecutive-failure counter."""
        evidence = [
            Evidence(
                content="t",
                citation=Citation(source="pubmed", title="t", url="u", date="d"),
            )
        ]

        # Step 1: a single failing call leaves the counter at 1.
        with patch.object(handler, "_call_with_retry", side_effect=Exception("Fail")):
            await handler.assess("test", evidence)
        assert handler.consecutive_failures == 1

        # Step 2: a call that returns a usable assessment resets the counter.
        valid_assessment = MagicMock(recommendation="continue", sufficient=False)
        with patch.object(handler, "_call_with_retry", return_value=valid_assessment):
            await handler.assess("test", evidence)
        assert handler.consecutive_failures == 0

    @pytest.mark.asyncio
    async def test_circuit_breaker_resets_on_new_question(self, handler):
        """Switching to a different question restarts the failure count."""
        evidence = []

        with patch.object(handler, "_call_with_retry", side_effect=Exception("Fail")):
            # Step 1: one failure while assessing the first question.
            await handler.assess("Question A", evidence)
            assert handler.consecutive_failures == 1

            # Step 2: a failure on a different question. The counter is
            # expected to be cleared for the new question before this
            # failure is recorded, so it ends at 1 rather than 2.
            await handler.assess("Question B", evidence)
            assert handler.consecutive_failures == 1