dror44 committed
Commit b286969 · Parent: af28f6f

refactoring

Files changed (5)
  1. .cursor/rules/python.mdc +124 -0
  2. data/leaderboard.csv +18 -18
  3. src/app.py +94 -25
  4. src/judge.py +16 -9
  5. src/ui.py +23 -2
.cursor/rules/python.mdc ADDED
@@ -0,0 +1,124 @@
+ ---
+ description:
+ globs: **/*.py, src/**/*.py, tests/**/*.py
+ alwaysApply: false
+ ---
+ ---
+ description: Python best practices and patterns for modern software development with Flask and SQLite
+ globs: **/*.py, src/**/*.py, tests/**/*.py
+ ---
+
+ # Python Best Practices
+
+ ## Project Structure
+ - Use src-layout with `src/your_package_name/`
+ - Place tests in `tests/` directory parallel to `src/`
+ - Keep configuration in `config/` or as environment variables
+ - Store requirements in `requirements.txt` or `pyproject.toml`
+ - Place static files in `static/` directory
+ - Use `templates/` for Jinja2 templates
+
+ ## Code Style
+ - Follow Black code formatting
+ - Use isort for import sorting
+ - Follow PEP 8 naming conventions:
+ - snake_case for functions and variables
+ - PascalCase for classes
+ - UPPER_CASE for constants
+ - Maximum line length of 88 characters (Black default)
+ - Use absolute imports over relative imports
+
+ ## Type Hints
+ - Use type hints for all function parameters and returns
+ - Import types from `typing` module
+ - Use `Optional[Type]` instead of `Type | None`
+ - Use `TypeVar` for generic types
+ - Define custom types in `types.py`
+ - Use `Protocol` for duck typing
+
+ ## Flask Structure
+ - Use Flask factory pattern
+ - Organize routes using Blueprints
+ - Use Flask-SQLAlchemy for database
+ - Implement proper error handlers
+ - Use Flask-Login for authentication
+ - Structure views with proper separation of concerns
+
+ ## Database
+ - Use SQLAlchemy ORM
+ - Implement database migrations with Alembic
+ - Use proper connection pooling
+ - Define models in separate modules
+ - Implement proper relationships
+ - Use proper indexing strategies
+ ## Authentication
+ - Use Flask-Login for session management
+ - Implement Google OAuth using Flask-OAuth
+ - Hash passwords with bcrypt
+ - Use proper session security
+ - Implement CSRF protection
+ - Use proper role-based access control
+
+ ## API Design
+ - Use Flask-RESTful for REST APIs
+ - Implement proper request validation
+ - Use proper HTTP status codes
+ - Handle errors consistently
+ - Use proper response formats
+ - Implement proper rate limiting
+
+ ## Testing
+ - Use pytest for testing
+ - Write tests for all routes
+ - Use pytest-cov for coverage
+ - Implement proper fixtures
+ - Use proper mocking with pytest-mock
+ - Test all error scenarios
+
+ ## Security
+ - Use HTTPS in production
+ - Implement proper CORS
+ - Sanitize all user inputs
+ - Use proper session configuration
+ - Implement proper logging
+ - Follow OWASP guidelines
+
+ ## Performance
+ - Use proper caching with Flask-Caching
+ - Implement database query optimization
+ - Use proper connection pooling
+ - Implement proper pagination
+ - Use background tasks for heavy operations
+ - Monitor application performance
+
+ ## Error Handling
+ - Create custom exception classes
+ - Use proper try-except blocks
+ - Implement proper logging
+ - Return proper error responses
+ - Handle edge cases properly
+ - Use proper error messages
+
+ ## Documentation
+ - Use Google-style docstrings
+ - Document all public APIs
+ - Keep README.md updated
+ - Use proper inline comments
+ - Generate API documentation
+ - Document environment setup
+
+ ## Development Workflow
+ - Use virtual environments (venv)
+ - Implement pre-commit hooks
+ - Use proper Git workflow
+ - Follow semantic versioning
+ - Use proper CI/CD practices
+ - Implement proper logging
+
+ ## Dependencies
+ - Pin dependency versions
+ - Use requirements.txt for production
+ - Separate dev dependencies
+ - Use proper package versions
+ - Regularly update dependencies
+ - Check for security vulnerabilities
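The rules above call for a typed, factory-pattern Flask layout with routes on Blueprints. As a quick illustration of how several of them combine (not part of this commit; the package name, blueprint, and route are hypothetical), a minimal sketch:

```python
# src/your_package_name/app.py -- hypothetical module, for illustration only
from typing import Any, Dict, Optional

from flask import Blueprint, Flask

# Routes live on a Blueprint, kept separate from app creation ("Flask Structure").
api_bp = Blueprint("api", __name__, url_prefix="/api")


@api_bp.route("/health")
def health() -> Dict[str, str]:
    """Health-check endpoint; Flask serializes the returned dict to JSON."""
    return {"status": "ok"}


def create_app(config: Optional[Dict[str, Any]] = None) -> Flask:
    """Application factory: build, configure, and return the Flask app."""
    app = Flask(__name__)
    app.config.update(config or {})
    app.register_blueprint(api_bp)
    return app
```

With the factory pattern, tests can build a fresh app per test via `create_app({...})` instead of importing a module-level instance, which is what makes the pytest fixture advice in the rules workable.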
data/leaderboard.csv CHANGED
@@ -1,31 +1,31 @@
  judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
+ qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1649.8276600728695,21.0,2.0,23.0,Alibaba,Open Source
  claude-3-opus-latest,Claude 3 Opus,1531.9661669788793,2.0,0.0,2.0,Anthropic,Proprietary
  mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
  qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
  meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1515.2298601853572,1.0,0.0,1.0,Meta,Open Source
  gpt-4-turbo,GPT-4 Turbo,1500.736306793522,1.0,1.0,2.0,OpenAI,Proprietary
- meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
- gemma-2-27b-it,Gemma 2 27B,1500.0,0.0,0.0,0.0,Google,Open Source
- gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
- qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
- mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1500.0,0.0,0.0,0.0,Mistral AI,Open Source
- gpt-3.5-turbo,GPT-3.5 Turbo,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
- atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
- claude-3-5-haiku-latest,Claude 3.5 Haiku,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
- claude-3-sonnet-20240229,Claude 3 Sonnet,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
  deepseek-r1,DeepSeek R1,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
- judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
- judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
- judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
- judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
- judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
- meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
- meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
- o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
  deepseek-v3,DeepSeek V3,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
+ o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
+ meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
+ meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
+ judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
+ judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
+ judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
+ judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
+ claude-3-sonnet-20240229,Claude 3 Sonnet,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
+ claude-3-5-haiku-latest,Claude 3.5 Haiku,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
+ atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
+ mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1500.0,0.0,0.0,0.0,Mistral AI,Open Source
+ qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
+ gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
+ gemma-2-27b-it,Gemma 2 27B,1500.0,0.0,0.0,0.0,Google,Open Source
+ meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
+ judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
  meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1499.263693206478,1.0,1.0,2.0,Meta,Open Source
  claude-3-haiku-20240307,Claude 3 Haiku,1499.263693206478,1.0,1.0,2.0,Anthropic,Proprietary
  gpt-4.1,GPT-4.1,1484.7701398146428,0.0,1.0,1.0,OpenAI,Proprietary
  claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1484.0,0.0,1.0,1.0,Anthropic,Proprietary
  gpt-4o,GPT-4o,1484.0,0.0,1.0,1.0,OpenAI,Proprietary
- qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1468.0338330211207,0.0,2.0,2.0,Alibaba,Open Source
+ gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
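The elo_score column behaves like a standard Elo rating that starts at 1500: the single-game rows above sit at 1516.0 after one win and 1484.0 after one loss, which matches a K-factor of 32. A minimal sketch of that update rule, assuming (the diff itself does not confirm it) that this is how the scores are computed:

```python
from typing import Tuple


def expected_score(rating_a: float, rating_b: float) -> float:
    """Probability that A beats B under the Elo model."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))


def elo_update(winner: float, loser: float, k: float = 32.0) -> Tuple[float, float]:
    """Return the updated (winner, loser) ratings after one comparison."""
    gain = k * (1.0 - expected_score(winner, loser))
    return winner + gain, loser - gain


# Two fresh 1500-rated judges: winner -> 1516.0, loser -> 1484.0,
# matching the one-game rows in the CSV above.
print(elo_update(1500.0, 1500.0))
```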
src/app.py CHANGED
@@ -10,6 +10,7 @@ from src.ui import UI
  # Global state for evaluations
  eval1: Optional[Dict[str, Any]] = None
  eval2: Optional[Dict[str, Any]] = None
+ selected_judges: list = []
  current_test_type: str = "grounding"


@@ -31,6 +32,18 @@ def initialize():
  test_type,
  judge_manager,
  ),
+ evaluate1_fn=lambda input_text, output_text, test_type: get_evaluation1(
+ input_text,
+ output_text,
+ test_type,
+ judge_manager,
+ ),
+ evaluate2_fn=lambda input_text, output_text, test_type: get_evaluation2(
+ input_text,
+ output_text,
+ test_type,
+ judge_manager,
+ ),
  winner1_fn=lambda: select_winner("Evaluation 1", judge_manager),
  winner2_fn=lambda: select_winner("Evaluation 2", judge_manager),
  refresh_leaderboard_fn=lambda: judge_manager.leaderboard_df,
@@ -57,50 +70,106 @@ def submit_example(
  output_text: str,
  test_type: str,
  judge_manager: JudgeManager,
- ) -> Tuple[str, str, Any, Any]:
- """Submit an example for evaluation."""
- global eval1, eval2, current_test_type
+ ) -> Tuple[str, str, Any, Any, Any, Any]:
+ """Prepare for evaluation and select random judges."""
+ global selected_judges, current_test_type, eval1, eval2

  try:
- logger.info(f"Submitting example for test type: {test_type}")
+ logger.info(f"Preparing evaluation for test type: {test_type}")
  current_test_type = test_type
- selected_judges = judge_manager.pick_random_judges()
- eval1 = judge_manager.get_random_judges_evaluations(
- input_text,
- output_text,
- test_type,
- selected_judges[0],
- )

- eval2 = judge_manager.get_random_judges_evaluations(
- input_text,
- output_text,
- test_type,
- selected_judges[1],
- )
+ # Reset evaluations
+ eval1 = None
+ eval2 = None

- if not eval1 or not eval2:
+ # Select random judges
+ selected_judges = judge_manager.pick_random_judges()
+
+ if len(selected_judges) < 2:
  return (
  "Error: Not enough judges available",
  "Error: Not enough judges available",
  None,
  None,
+ None,
+ gr.update(visible=False),
  )

+ # Show loading messages while evaluations are in progress
+ status_text = "Evaluations starting... Both judges will evaluate in parallel."
  return (
- eval1["display_evaluation"],
- eval2["display_evaluation"],
- gr.update(visible=True),
- gr.update(visible=True),
+ "Loading evaluation 1...",
+ "Loading evaluation 2...",
+ gr.update(value=input_text),
+ gr.update(value=output_text),
+ gr.update(value=test_type),
+ gr.update(visible=True, value=status_text),
  )
  except Exception as e:
- logger.error(f"Error submitting example: {e}")
+ logger.error(f"Error preparing evaluation: {e}")
  return (
  f"Error: {str(e)}",
  f"Error: {str(e)}",
- None,
- None,
+ gr.update(value=input_text),
+ gr.update(value=output_text),
+ gr.update(value=test_type),
+ gr.update(visible=False),
+ )
+
+
+ def get_evaluation1(
+ input_text: str,
+ output_text: str,
+ test_type: str,
+ judge_manager: JudgeManager,
+ ) -> Tuple[str, Any]:
+ """Get evaluation from the first judge."""
+ global eval1, selected_judges
+
+ try:
+ if not selected_judges or len(selected_judges) < 1:
+ return "No judges selected", gr.update(visible=False)
+
+ logger.info(f"Starting evaluation 1 with judge {selected_judges[0]['name']}")
+ # Get evaluation from the first judge
+ eval1 = judge_manager.get_evaluation(
+ selected_judges[0],
+ input_text,
+ output_text,
+ test_type,
  )
+ logger.info("Completed evaluation 1")
+
+ # Make the selection button visible once the evaluation is ready
+ return eval1["display_evaluation"], gr.update(visible=True)
+ except Exception as e:
+ logger.error(f"Error getting evaluation 1: {e}")
+ return f"Error: {str(e)}", gr.update(visible=False)
+
+
+ def get_evaluation2(
+ input_text: str,
+ output_text: str,
+ test_type: str,
+ judge_manager: JudgeManager,
+ ) -> Tuple[str, Any]:
+ """Get evaluation from the second judge."""
+ global eval2, selected_judges
+
+ try:
+ if not selected_judges or len(selected_judges) < 2:
+ return "No judges selected", gr.update(visible=False)
+
+ logger.info(f"Starting evaluation 2 with judge {selected_judges[1]['name']}")
+ # Get evaluation from the second judge
+ eval2 = judge_manager.get_evaluation(selected_judges[1], input_text, output_text, test_type)
+ logger.info("Completed evaluation 2")
+
+ # Make the selection button visible once the evaluation is ready
+ return eval2["display_evaluation"], gr.update(visible=True)
+ except Exception as e:
+ logger.error(f"Error getting evaluation 2: {e}")
+ return f"Error: {str(e)}", gr.update(visible=False)


  def select_winner(choice: str, judge_manager: JudgeManager) -> str:
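With this refactor, submit_example only resets state, picks the two judges, and returns loading placeholders; get_evaluation1 and get_evaluation2 then each fetch one judge's verdict. The wiring that chains them off the submit click is in src/ui.py below. A reduced sketch of that Gradio click/.then() pattern, with placeholder functions instead of the app's real components:

```python
import time

import gradio as gr


def prepare(text: str):
    # Fast step: return loading placeholders immediately.
    return "Loading evaluation 1...", "Loading evaluation 2..."


def evaluate1(text: str) -> str:
    time.sleep(2)  # stand-in for the first judge's API call
    return f"Judge 1 verdict on: {text}"


def evaluate2(text: str) -> str:
    time.sleep(2)  # stand-in for the second judge's API call
    return f"Judge 2 verdict on: {text}"


with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out1 = gr.Textbox(label="Evaluation 1")
    out2 = gr.Textbox(label="Evaluation 2")
    btn = gr.Button("Submit")

    # The click event runs the quick prepare step; each .then() listener
    # fires after it finishes and fills in one evaluation.
    click_event = btn.click(prepare, [inp], [out1, out2])
    click_event.then(evaluate1, [inp], [out1])
    click_event.then(evaluate2, [inp], [out2])

demo.launch()
```

Whether the two follow-up listeners actually run concurrently depends on the app's queue and concurrency settings rather than on the chaining itself.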
src/judge.py CHANGED
@@ -101,8 +101,10 @@ class JudgeManager:
  temperature=0.2,
  max_tokens=500,
  )
- # Default fallback
  evaluation = api_response.choices[0].message.content
+ else:
+ # Default fallback
+ evaluation = f"No evaluation provider for {judge['provider']}"

  # Format the evaluation
  eval_prefix = f"Evaluation by {judge['name']} (ID: {judge['id']}):\n\n"
@@ -137,8 +139,11 @@ AI RESPONSE:

  Please evaluate this response carefully and provide your assessment."""

- def pick_random_judges(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ def pick_random_judges(self) -> List[Dict[str, Any]]:
  """Pick two random judges"""
+ if len(self.judges) < 2:
+ logger.error("Not enough judges available for comparison")
+ return []
  return random.sample(self.judges, 2)

  def get_random_judges_evaluations(
@@ -146,7 +151,7 @@ Please evaluate this response carefully and provide your assessment."""
  input_text: str,
  output_text: str,
  test_type: str,
- selected_judges: List[Dict[str, Any]],
+ selected_judge: Dict[str, Any],
  ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
  """Get evaluations from two random judges"""
  if len(self.judges) < 2:
@@ -154,12 +159,14 @@ Please evaluate this response carefully and provide your assessment."""
  return None, None

  # Get evaluations from the judges
- evaluations = []
- for judge in selected_judges:
- evaluation = self.get_evaluation(judge, input_text, output_text, test_type)
- evaluations.append(evaluation)
-
- return evaluations[0], evaluations[1]
+ evaluation = self.get_evaluation(
+ selected_judge,
+ input_text,
+ output_text,
+ test_type,
+ )
+
+ return evaluation

  def update_leaderboard(self, winner_id: str, loser_id: str) -> pd.DataFrame:
  """Update the leaderboard after a comparison"""
src/ui.py CHANGED
@@ -13,6 +13,8 @@ class UI:
  self,
  refresh_fn: Callable,
  submit_fn: Callable,
+ evaluate1_fn: Callable,
+ evaluate2_fn: Callable,
  winner1_fn: Callable,
  winner2_fn: Callable,
  refresh_leaderboard_fn: Callable,
@@ -20,6 +22,8 @@ class UI:
  ):
  self.refresh_fn = refresh_fn
  self.submit_fn = submit_fn
+ self.evaluate1_fn = evaluate1_fn
+ self.evaluate2_fn = evaluate2_fn
  self.winner1_fn = winner1_fn
  self.winner2_fn = winner2_fn
  self.refresh_leaderboard_fn = refresh_leaderboard_fn
@@ -97,6 +101,7 @@ class UI:
  input_text = gr.Textbox(label="Input", lines=4)
  output_text = gr.Textbox(label="Output", lines=6)
  submit_button = gr.Button("Get Judge Evaluations")
+ status_message = gr.Markdown(visible=False)

  with gr.Row():
  with gr.Column():
@@ -129,10 +134,26 @@ class UI:
  [input_text, output_text],
  )

- submit_button.click(
+ # Modified submit to prepare for evaluation and trigger both evaluations in parallel
+ submit_event = submit_button.click(
  self.submit_fn,
  [input_text, output_text, test_type_dropdown],
- [evaluation1, evaluation2, select_eval1, select_eval2],
+ [evaluation1, evaluation2, input_text, output_text, test_type_dropdown, status_message],
+ )
+
+ # Start both evaluations simultaneously (in parallel) after submit completes
+ submit_event.then(
+ self.evaluate1_fn,
+ [input_text, output_text, test_type_dropdown],
+ [evaluation1, select_eval1],
+ queue=False, # Run immediately without waiting in queue
+ )
+
+ submit_event.then(
+ self.evaluate2_fn,
+ [input_text, output_text, test_type_dropdown],
+ [evaluation2, select_eval2],
+ queue=False, # Run immediately without waiting in queue
  )

  select_eval1.click(