dror44 committed
Commit 8c8b2c7 · 1 parent: 5a05fa9

Hotfixes and benchmarks

benchmarks/grounding/allenai-wildjailbreak-judges-metrics.csv ADDED
@@ -0,0 +1,24 @@
+judge_id,judge_name,dataset,f1,bacc,avg_latency,total_time,count,correct
+meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,allenai-wildjailbreak,0.21428571428571427,0.12,0.8566377925872802,85.66377925872803,100,12
+meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,allenai-wildjailbreak,0.7421383647798742,0.59,1.1272331833839417,112.72331833839417,100,59
+meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,allenai-wildjailbreak,0.5294117647058824,0.36,0.4795390796661377,47.95390796661377,100,36
+meta-llama-3.3-70B-instruct-turbo,Meta Llama 3.3 70B Instruct,allenai-wildjailbreak,0.5401459854014599,0.37,5.12372554063797,512.372554063797,100,37
+meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,allenai-wildjailbreak,0.8950276243093923,0.81,1.0803885889053344,108.03885889053345,100,81
+gemma-2-27b-it,Gemma 2 27B,allenai-wildjailbreak,0.3050847457627119,0.18,1.0046957421302796,100.46957421302795,100,18
+gemma-2-9b-it,Gemma 2 9B,allenai-wildjailbreak,0.4126984126984127,0.26,0.5609125876426697,56.09125876426697,100,26
+mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,allenai-wildjailbreak,0.14814814814814814,0.08,30.8281710100174,3082.81710100174,100,8
+o3-mini,o3-mini,allenai-wildjailbreak,0.09523809523809523,0.05,3.8824497079849243,388.24497079849243,100,5
+gpt-4.1,GPT-4.1,allenai-wildjailbreak,0.23008849557522124,0.13,1.033246524333954,103.32465243339539,100,13
+gpt-4o,GPT-4o,allenai-wildjailbreak,0.09523809523809523,0.05,1.0374453783035278,103.74453783035278,100,5
+gpt-4-turbo,GPT-4 Turbo,allenai-wildjailbreak,0.27586206896551724,0.16,1.118471143245697,111.8471143245697,100,16
+gpt-3.5-turbo,GPT-3.5 Turbo,allenai-wildjailbreak,0.37398373983739835,0.23,0.6795877623558044,67.95877623558044,100,23
+claude-3-haiku-20240307,Claude 3 Haiku,allenai-wildjailbreak,0.05825242718446602,0.03,0.6856383895874023,68.56383895874023,100,3
+claude-3-sonnet-20240229,Claude 3 Sonnet,allenai-wildjailbreak,0.5074626865671642,0.34,0.8858131814002991,88.58131814002991,100,34
+claude-3-opus-latest,Claude 3 Opus,allenai-wildjailbreak,0.6301369863013698,0.46,1.6495161414146424,164.95161414146423,100,46
+claude-3-5-sonnet-latest,Claude 3.5 Sonnet,allenai-wildjailbreak,0.7878787878787878,0.65,1.9892964005470275,198.92964005470276,100,65
+claude-3-5-haiku-latest,Claude 3.5 Haiku,allenai-wildjailbreak,0.8439306358381503,0.73,0.9016167116165161,90.16167116165161,100,73
+qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,allenai-wildjailbreak,0.6301369863013698,0.46,0.8251621770858765,82.51621770858765,100,46
+qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,allenai-wildjailbreak,0.48484848484848486,0.32,0.5128253746032715,51.28253746032715,100,32
+deepseek-v3,DeepSeek V3,allenai-wildjailbreak,0.49624060150375937,0.33,6.41716570854187,641.716570854187,100,33
+deepseek-r1,DeepSeek R1,allenai-wildjailbreak,0.46153846153846156,0.3,6.692396397590637,669.2396397590637,100,30
+qualifire-eval,Qualifire,allenai-wildjailbreak,0.46153846153846156,0.3,0.9121422719955444,91.21422719955444,100,30
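For a quick look at the metrics file added above, the table can be loaded and ranked the same way the app's new Benchmarks tab does. A minimal sketch, assuming pandas and numpy are available and the script runs from the repository root:

import numpy as np
import pandas as pd

# Load the metrics added in this commit (path relative to the repo root).
df = pd.read_csv("benchmarks/grounding/allenai-wildjailbreak-judges-metrics.csv")

# Rank judges by balanced accuracy, the same sort key the Benchmarks tab uses.
ranked = df.sort_values("bacc", ascending=False)[["judge_name", "f1", "bacc", "avg_latency"]]
print(ranked.head().to_string(index=False))

# In this file, total_time is simply avg_latency times the sample count (count=100).
assert np.allclose(df["avg_latency"] * df["count"], df["total_time"])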
src/app.py CHANGED
@@ -1,4 +1,5 @@
 import datetime
+import os
 from typing import Any, Dict, Optional, Tuple
 
 import gradio as gr
@@ -16,6 +17,21 @@ eval2: Optional[Dict[str, Any]] = None
 selected_judges: list = []
 current_test_type: str = "grounding"
 
+# Add more detailed logging
+logger.info("EvalArena starting up")
+# Check if benchmarks directory exists
+if os.path.exists("benchmarks") and os.path.isdir("benchmarks"):
+    benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))]
+    logger.info(f"Found benchmark directories: {benchmark_dirs}")
+
+    # Log CSV files in each directory
+    for d in benchmark_dirs:
+        dir_path = os.path.join("benchmarks", d)
+        files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")]
+        logger.info(f"Benchmark directory '{d}' contains files: {files}")
+else:
+    logger.warning("Benchmarks directory not found or not accessible")
+
 
 def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
     """Format the leaderboard dataframe for display in the UI.
@@ -31,6 +47,92 @@ def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
     return display_df
 
 
+def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]:
+    """Load benchmark data for the selected benchmark type and dataset.
+
+    Args:
+        benchmark_type: The type of benchmark (e.g., 'prompt-injections')
+        dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak')
+
+    Returns:
+        Tuple containing:
+        - DataFrame formatted for display
+        - Markdown string with benchmark information
+    """
+    # Create empty dataframe with the expected columns
+    empty_df = pd.DataFrame(
+        columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"]
+    )
+
+    # Handle case when None or empty values are passed
+    if not benchmark_type or not dataset_name:
+        logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}")
+        return empty_df, "Please select both a benchmark type and dataset"
+
+    try:
+        # Construct the path to the benchmark metrics file
+        metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv")
+
+        logger.info(f"Loading benchmark from {metrics_file}")
+
+        if not os.path.exists(metrics_file):
+            error_message = f"Error: Could not find metrics file at {metrics_file}"
+            logger.error(error_message)
+            return empty_df, error_message
+
+        # Load the CSV file
+        df = pd.read_csv(metrics_file)
+        logger.info(f"Loaded benchmark with {len(df)} rows")
+
+        # Check if the file has the required columns
+        required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"]
+        missing_columns = [col for col in required_columns if col not in df.columns]
+
+        if missing_columns:
+            error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}"
+            logger.error(error_message)
+            return empty_df, error_message
+
+        # Format the dataframe for display
+        display_df = pd.DataFrame()
+        display_df["Judge Name"] = df["judge_name"]
+        display_df["F1 Score"] = df["f1"].round(3)
+        display_df["Balanced Accuracy"] = df["bacc"].round(3)
+        display_df["Avg Latency (s)"] = df["avg_latency"].round(2)
+        display_df["Correct"] = df["correct"]
+        display_df["Total"] = df["count"]
+
+        # Sort by balanced accuracy descending
+        display_df = display_df.sort_values("Balanced Accuracy", ascending=False)
+
+        # Generate information about the benchmark
+        total_samples = df["count"].iloc[0] if not df.empty else 0
+        info_md = f"""
+# Benchmark: {dataset_name}
+
+**Type**: {benchmark_type}
+**Total Samples**: {total_samples}
+
+This table shows how different AI judge models performed on this benchmark.
+Higher F1 score and balanced accuracy indicate better performance.
+"""
+
+        return display_df, info_md
+
+    except pd.errors.EmptyDataError:
+        error_message = "Error: The CSV file is empty"
+        logger.error(error_message)
+        return empty_df, error_message
+    except pd.errors.ParserError:
+        error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format"
+        logger.error(error_message)
+        return empty_df, error_message
+    except Exception as e:
+        error_message = f"Error loading benchmark data: {str(e)}"
+        logger.error(error_message)
+        return empty_df, error_message
+
+
 def initialize():
     """Initialize the application."""
     # Load models from file
@@ -88,6 +190,7 @@ def initialize():
         leaderboard_df=format_leaderboard_for_display(
            judge_manager.leaderboard_df,
        ),
+        load_benchmark_fn=load_benchmark_data,
    )
 
    return ui.create_interface()
@@ -634,9 +737,9 @@ def handle_both_incorrect(judge_manager: JudgeManager) -> str:
 
 
 def main():
-    """Main application entry point."""
+    """Initialize the application."""
     demo = initialize()
-    demo.launch()
+    demo.launch(server_name="0.0.0.0")
 
 
 if __name__ == "__main__":
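Wired into the UI below, load_benchmark_data returns a (DataFrame, markdown) pair that feeds a gr.DataFrame and a gr.Markdown component. A minimal standalone sketch of the same call, assuming it is run from the repository root with the CSV added above in place (the nonexistent dataset name at the end is only for illustration):

from src.app import load_benchmark_data

# Same arguments the Benchmarks tab passes: (benchmark type directory, dataset name).
table, info_md = load_benchmark_data("grounding", "allenai-wildjailbreak")
print(info_md)       # markdown summary rendered in the gr.Markdown component
print(table.head())  # rows sorted by Balanced Accuracy, shown in the gr.DataFrame

# Missing files degrade gracefully to an empty table plus an error message.
_, msg = load_benchmark_data("grounding", "no-such-dataset")
print(msg)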
src/ui.py CHANGED
@@ -1,7 +1,10 @@
+import glob
+import os
 from typing import Callable
 
 import gradio as gr
 import pandas as pd
+from loguru import logger
 
 from src.config import TEST_TYPES
 
@@ -21,6 +24,7 @@ class UI:
         both_incorrect_fn: Callable,
         refresh_leaderboard_fn: Callable,
         leaderboard_df: pd.DataFrame,
+        load_benchmark_fn: Callable = None,
     ):
         self.refresh_fn = refresh_fn
         self.submit_fn = submit_fn
@@ -32,6 +36,7 @@ class UI:
         self.both_incorrect_fn = both_incorrect_fn
         self.refresh_leaderboard_fn = refresh_leaderboard_fn
         self.leaderboard_df = leaderboard_df
+        self.load_benchmark_fn = load_benchmark_fn
 
     def create_interface(self) -> gr.Blocks:
         """Create the Gradio interface"""
@@ -151,6 +156,55 @@ class UI:
                 )
                 refresh_leaderboard = gr.Button("Refresh Leaderboard")
 
+            # New Benchmarks Tab
+            with gr.Tab("📊 Benchmarks"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        # Get available test types from the benchmarks directory
+                        try:
+                            benchmark_types = [
+                                d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
+                            ]
+                        except (FileNotFoundError, PermissionError):
+                            # Fallback if directory can't be read
+                            benchmark_types = []
+                            logger.error("Failed to read benchmarks directory")
+
+                        benchmark_type_dropdown = gr.Dropdown(
+                            choices=benchmark_types,
+                            label="Benchmark Type",
+                            info="Select the type of benchmark to view",
+                            value=benchmark_types[0] if benchmark_types else None,
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        # Get available benchmark datasets for the selected type
+                        benchmark_dataset_dropdown = gr.Dropdown(
+                            label="Benchmark Dataset",
+                            info="Select the benchmark dataset to view",
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        benchmark_dataframe = gr.DataFrame(
+                            headers=[
+                                "Judge Name",
+                                "F1 Score",
+                                "Balanced Accuracy",
+                                "Avg Latency (s)",
+                                "Correct",
+                                "Total",
+                            ],
+                            label="Benchmark Results",
+                            interactive=False,
+                        )
+
+                benchmark_info = gr.Markdown("Select a benchmark dataset to view results")
+
+                # Add a refresh button
+                refresh_benchmarks_btn = gr.Button("Refresh Benchmark List")
+
             with gr.Tab("About"):
                 self._create_about_tab()
 
@@ -250,36 +304,114 @@ class UI:
                 queue=False, # Run immediately without waiting in queue
             )
 
+            # Show result buttons after both evaluations are done
             select_eval1.click(
                 self.winner1_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             select_eval2.click(
                 self.winner2_fn,
                 [],
-                result_text,
+                [result_text],
            )
 
             both_correct_btn.click(
                 self.both_correct_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             both_incorrect_btn.click(
                 self.both_incorrect_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             refresh_leaderboard.click(
                 self.refresh_leaderboard_fn,
                 [],
-                leaderboard_dataframe,
+                [leaderboard_dataframe],
             )
 
+            # Benchmark tab event handlers
+            def get_benchmark_datasets(benchmark_type):
+                if not benchmark_type:
+                    return gr.update(choices=[], value=None)
+
+                try:
+                    # Find all CSV files that match the pattern <dataset>-judges-metrics.csv
+                    pattern = os.path.join("benchmarks", benchmark_type, "*-judges-metrics.csv")
+                    files = glob.glob(pattern)
+
+                    # Extract dataset names from file paths
+                    datasets = []
+                    for file in files:
+                        basename = os.path.basename(file)
+                        dataset_name = basename.replace("-judges-metrics.csv", "")
+                        datasets.append(dataset_name)
+
+                    logger.info(f"Found datasets for {benchmark_type}: {datasets}")
+
+                    if datasets:
+                        return gr.update(choices=datasets, value=datasets[0])
+                    else:
+                        return gr.update(choices=[], value=None)
+                except Exception as e:
+                    logger.error(f"Error getting benchmark datasets: {e}")
+                    return gr.update(choices=[], value=None)
+
+            def refresh_benchmark_types():
+                try:
+                    new_benchmark_types = [
+                        d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
+                    ]
+
+                    logger.info(f"Refreshed benchmark types: {new_benchmark_types}")
+
+                    # Update the benchmark type dropdown
+                    if new_benchmark_types:
+                        # Return the updated dropdown and trigger dataset reload
+                        return gr.update(choices=new_benchmark_types, value=new_benchmark_types[0])
+                    else:
+                        return gr.update(choices=[], value=None)
+                except (FileNotFoundError, PermissionError) as e:
+                    logger.error(f"Error refreshing benchmark types: {e}")
+                    return gr.update(choices=[], value=None)
+
+            # Set up event handlers for the benchmark tab
+            benchmark_type_dropdown.change(
+                get_benchmark_datasets,
+                [benchmark_type_dropdown],
+                [benchmark_dataset_dropdown],
+            )
+
+            # Add refresh button handler
+            refresh_benchmarks_btn.click(
+                refresh_benchmark_types,
+                [],
+                [benchmark_type_dropdown],
+            ).then( # Chain the dataset dropdown update after the type is refreshed
+                get_benchmark_datasets,
+                [benchmark_type_dropdown],
+                [benchmark_dataset_dropdown],
+            )
+
+            # Add handler to load benchmark data when dataset is selected
+            if self.load_benchmark_fn:
+                benchmark_dataset_dropdown.change(
+                    self.load_benchmark_fn,
+                    [benchmark_type_dropdown, benchmark_dataset_dropdown],
+                    [benchmark_dataframe, benchmark_info],
+                )
+
+            # Load initial datasets for the default benchmark type if it exists
+            if benchmark_types:
+                initial_benchmark_type = benchmark_types[0]
+                logger.info(f"Loading initial datasets for benchmark type: {initial_benchmark_type}")
+                benchmark_type_dropdown.value = initial_benchmark_type
+
             # Add footer
             with gr.Row():
                 gr.HTML(
@@ -290,80 +422,78 @@ class UI:
                    """
                )
 
-            return demo
+        return demo
 
     def _create_about_tab(self) -> None:
         """Create the About tab content"""
         gr.Markdown(
             """
-            ## About AI Evaluation Judge Arena
-
-            This platform allows users to compare and rate different AI evaluation models (judges).
-
-            ### How it works:
-            1. You are presented with an input prompt and AI-generated output
-            2. Two AI judges provide evaluations of the output
-            3. The evaluations are presented anonymously (blind evaluation)
-            4. You select which evaluation you think is better, or if both are correct/incorrect
-            5. The judges' identities are revealed after your selection, and their ELO ratings are updated
-
-            ### ELO Rating System
-            The platform uses the ELO rating system (like in chess) to rank the judges.
-            When you choose a winner:
-            - The winning judge gains ELO points
-            - The losing judge loses ELO points
-            - If both are correct, both gain a small amount of points
-            - If both are incorrect, both lose a small amount of points
-            - The amount of points transferred depends on the difference in current ratings
-
-            ### Test Types
-            - **Prompt Injections**: Evaluates how well judges detect and assess prompt
-            injection attempts
-            - **Safety**: Tests judges on responses involving potentially harmful content
-            - **Grounding**: Assesses judges' ability to evaluate factual correctness
-            - **Policy**: Tests judges on evaluating responses to ethical dilemmas and
-            policy questions
-
-            ### Purpose
-            This platform helps determine which AI evaluation methods are most aligned
-            with human preferences.
-            """
+            # About AI Evaluators Arena
+
+            This platform allows you to evaluate and compare different AI judges in their ability to assess various types of content.
+
+            ## How it works
+            1. Choose a test type from the dropdown
+            2. Fill in the input fields or load a random example from our dataset
+            3. Click "Evaluate" to get assessments from two randomly selected judges
+            4. Choose which evaluation you think is better
+            5. See which judge provided each evaluation
+
+            ## Test Types
+            - **Grounding**: Evaluate if a claim is grounded in a given text
+            - **Prompt Injections**: Detect attempts to manipulate or jailbreak the model
+            - **Safety**: Identify harmful, offensive, or dangerous content
+            - **Policy**: Determine if output complies with a given policy
+
+            ## Leaderboard
+            The leaderboard tracks judge performance using an ELO rating system, with scores adjusted based on human preferences.
+            """
         )
 
     def _update_input_visibility(self, test_type):
-        """Update the visibility of input fields based on the test type"""
-        # Hide all inputs first
-        text_visible = False
-        claim_visible = False
-        single_text_visible = False
-        policy_input_visible = False
-        policy_output_visible = False
-        policy_assertion_visible = False
-        input_visible = False
-        output_visible = False
-
-        # Show the appropriate inputs based on test type
+        """Update which input fields are visible based on the selected test type"""
         if test_type == "grounding":
-            text_visible = True
-            claim_visible = True
+            return [
+                gr.update(visible=True),   # text_input
+                gr.update(visible=True),   # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
        elif test_type in ["prompt injections", "safety"]:
-            single_text_visible = True
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=True),   # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
        elif test_type == "policy":
-            policy_input_visible = True
-            policy_output_visible = True
-            policy_assertion_visible = True
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=True),   # policy_input
+                gr.update(visible=True),   # policy_output
+                gr.update(visible=True),   # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
        else:
-            # Fallback to legacy layout
-            input_visible = True
-            output_visible = True
-
-        return (
-            gr.update(visible=text_visible),
-            gr.update(visible=claim_visible),
-            gr.update(visible=single_text_visible),
-            gr.update(visible=policy_input_visible),
-            gr.update(visible=policy_output_visible),
-            gr.update(visible=policy_assertion_visible),
-            gr.update(visible=input_visible),
-            gr.update(visible=output_visible),
-        )
+            # Legacy fallback
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=True),   # input_text
+                gr.update(visible=True),   # output_text
+            ]
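The About text (old and new) describes the leaderboard's Elo mechanics: the winning judge gains points, the losing judge loses them, and the size of the transfer depends on the gap between the current ratings. A standard Elo update captures that behaviour; this is an illustrative sketch only, not the update implemented in the repo's judge manager:

def elo_update(winner: float, loser: float, k: float = 32.0) -> tuple[float, float]:
    """Return new (winner, loser) ratings after one pairwise comparison."""
    expected_win = 1.0 / (1.0 + 10 ** ((loser - winner) / 400.0))
    delta = k * (1.0 - expected_win)  # small when the favourite wins, large for an upset
    return winner + delta, loser - delta

print(elo_update(1200.0, 1200.0))  # evenly matched judges: 16 points change hands
print(elo_update(1400.0, 1000.0))  # strong favourite wins: only ~3 points move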