Hotfixes and benchmarks
Files changed:
- benchmarks/grounding/allenai-wildjailbreak-judges-metrics.csv  +24 -0 (new file)
- src/app.py  +105 -2
- src/ui.py  +200 -70
benchmarks/grounding/allenai-wildjailbreak-judges-metrics.csv
ADDED
@@ -0,0 +1,24 @@
+judge_id,judge_name,dataset,f1,bacc,avg_latency,total_time,count,correct
+meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,allenai-wildjailbreak,0.21428571428571427,0.12,0.8566377925872802,85.66377925872803,100,12
+meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,allenai-wildjailbreak,0.7421383647798742,0.59,1.1272331833839417,112.72331833839417,100,59
+meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,allenai-wildjailbreak,0.5294117647058824,0.36,0.4795390796661377,47.95390796661377,100,36
+meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,allenai-wildjailbreak,0.5401459854014599,0.37,5.12372554063797,512.372554063797,100,37
+meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,allenai-wildjailbreak,0.8950276243093923,0.81,1.0803885889053344,108.03885889053345,100,81
+gemma-2-27b-it,Gemma 2 27B,allenai-wildjailbreak,0.3050847457627119,0.18,1.0046957421302796,100.46957421302795,100,18
+gemma-2-9b-it,Gemma 2 9B,allenai-wildjailbreak,0.4126984126984127,0.26,0.5609125876426697,56.09125876426697,100,26
+mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,allenai-wildjailbreak,0.14814814814814814,0.08,30.8281710100174,3082.81710100174,100,8
+o3-mini, o3-mini,allenai-wildjailbreak,0.09523809523809523,0.05,3.8824497079849243,388.24497079849243,100,5
+gpt-4.1,GPT-4.1,allenai-wildjailbreak,0.23008849557522124,0.13,1.033246524333954,103.32465243339539,100,13
+gpt-4o,GPT-4o,allenai-wildjailbreak,0.09523809523809523,0.05,1.0374453783035278,103.74453783035278,100,5
+gpt-4-turbo,GPT-4 Turbo,allenai-wildjailbreak,0.27586206896551724,0.16,1.118471143245697,111.8471143245697,100,16
+gpt-3.5-turbo,GPT-3.5 Turbo,allenai-wildjailbreak,0.37398373983739835,0.23,0.6795877623558044,67.95877623558044,100,23
+claude-3-haiku-20240307,Claude 3 Haiku,allenai-wildjailbreak,0.05825242718446602,0.03,0.6856383895874023,68.56383895874023,100,3
+claude-3-sonnet-20240229,Claude 3 Sonnet,allenai-wildjailbreak,0.5074626865671642,0.34,0.8858131814002991,88.58131814002991,100,34
+claude-3-opus-latest,Claude 3 Opus,allenai-wildjailbreak,0.6301369863013698,0.46,1.6495161414146424,164.95161414146423,100,46
+claude-3-5-sonnet-latest,Claude 3.5 Sonnet,allenai-wildjailbreak,0.7878787878787878,0.65,1.9892964005470275,198.92964005470276,100,65
+claude-3-5-haiku-latest,Claude 3.5 Haiku,allenai-wildjailbreak,0.8439306358381503,0.73,0.9016167116165161,90.16167116165161,100,73
+qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,allenai-wildjailbreak,0.6301369863013698,0.46,0.8251621770858765,82.51621770858765,100,46
+qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,allenai-wildjailbreak,0.48484848484848486,0.32,0.5128253746032715,51.28253746032715,100,32
+deepseek-v3,DeepSeek V3,allenai-wildjailbreak,0.49624060150375937,0.33,6.41716570854187,641.716570854187,100,33
+deepseek-r1,DeepSeek R1,allenai-wildjailbreak,0.46153846153846156,0.3,6.692396397590637,669.2396397590637,100,30
+qualifire-eval,Qualifire,allenai-wildjailbreak,0.46153846153846156,0.3,0.9121422719955444,91.21422719955444,100,30
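The columns are judge_id, judge_name, dataset, f1, bacc, avg_latency, total_time, count, correct; in every row total_time is avg_latency × count (up to float rounding), so latency is tracked per sample and summed. The script that computed these metrics is not part of this commit; the following is only a minimal sketch of how one such row could be produced, assuming binary labels and judge verdicts and taking the f1/bacc column names at face value (scikit-learn's f1_score and balanced_accuracy_score).

# Illustrative sketch only -- not the project's actual benchmark script.
# Assumes binary ground-truth labels and judge verdicts (1 = jailbreak/violation).
from sklearn.metrics import balanced_accuracy_score, f1_score


def metrics_row(judge_id: str, judge_name: str, dataset: str,
                labels: list[int], preds: list[int], latencies: list[float]) -> dict:
    """Build one row with the same columns as *-judges-metrics.csv (assumed schema)."""
    total_time = sum(latencies)
    return {
        "judge_id": judge_id,
        "judge_name": judge_name,
        "dataset": dataset,
        "f1": f1_score(labels, preds),                   # column: f1
        "bacc": balanced_accuracy_score(labels, preds),  # column: bacc
        "avg_latency": total_time / len(latencies),      # seconds per sample
        "total_time": total_time,
        "count": len(labels),
        "correct": sum(int(y == p) for y, p in zip(labels, preds)),
    }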
src/app.py
CHANGED
@@ -1,4 +1,5 @@
 import datetime
+import os
 from typing import Any, Dict, Optional, Tuple
 
 import gradio as gr
@@ -16,6 +17,21 @@ eval2: Optional[Dict[str, Any]] = None
 selected_judges: list = []
 current_test_type: str = "grounding"
 
+# Add more detailed logging
+logger.info("EvalArena starting up")
+# Check if benchmarks directory exists
+if os.path.exists("benchmarks") and os.path.isdir("benchmarks"):
+    benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))]
+    logger.info(f"Found benchmark directories: {benchmark_dirs}")
+
+    # Log CSV files in each directory
+    for d in benchmark_dirs:
+        dir_path = os.path.join("benchmarks", d)
+        files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")]
+        logger.info(f"Benchmark directory '{d}' contains files: {files}")
+else:
+    logger.warning("Benchmarks directory not found or not accessible")
+
 
 def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
     """Format the leaderboard dataframe for display in the UI.
@@ -31,6 +47,92 @@
     return display_df
 
 
+def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]:
+    """Load benchmark data for the selected benchmark type and dataset.
+
+    Args:
+        benchmark_type: The type of benchmark (e.g., 'prompt-injections')
+        dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak')
+
+    Returns:
+        Tuple containing:
+        - DataFrame formatted for display
+        - Markdown string with benchmark information
+    """
+    # Create empty dataframe with the expected columns
+    empty_df = pd.DataFrame(
+        columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"]
+    )
+
+    # Handle case when None or empty values are passed
+    if not benchmark_type or not dataset_name:
+        logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}")
+        return empty_df, "Please select both a benchmark type and dataset"
+
+    try:
+        # Construct the path to the benchmark metrics file
+        metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv")
+
+        logger.info(f"Loading benchmark from {metrics_file}")
+
+        if not os.path.exists(metrics_file):
+            error_message = f"Error: Could not find metrics file at {metrics_file}"
+            logger.error(error_message)
+            return empty_df, error_message
+
+        # Load the CSV file
+        df = pd.read_csv(metrics_file)
+        logger.info(f"Loaded benchmark with {len(df)} rows")
+
+        # Check if the file has the required columns
+        required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"]
+        missing_columns = [col for col in required_columns if col not in df.columns]
+
+        if missing_columns:
+            error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}"
+            logger.error(error_message)
+            return empty_df, error_message
+
+        # Format the dataframe for display
+        display_df = pd.DataFrame()
+        display_df["Judge Name"] = df["judge_name"]
+        display_df["F1 Score"] = df["f1"].round(3)
+        display_df["Balanced Accuracy"] = df["bacc"].round(3)
+        display_df["Avg Latency (s)"] = df["avg_latency"].round(2)
+        display_df["Correct"] = df["correct"]
+        display_df["Total"] = df["count"]
+
+        # Sort by balanced accuracy descending
+        display_df = display_df.sort_values("Balanced Accuracy", ascending=False)
+
+        # Generate information about the benchmark
+        total_samples = df["count"].iloc[0] if not df.empty else 0
+        info_md = f"""
+# Benchmark: {dataset_name}
+
+**Type**: {benchmark_type}
+**Total Samples**: {total_samples}
+
+This table shows how different AI judge models performed on this benchmark.
+Higher F1 score and balanced accuracy indicate better performance.
+"""
+
+        return display_df, info_md
+
+    except pd.errors.EmptyDataError:
+        error_message = "Error: The CSV file is empty"
+        logger.error(error_message)
+        return empty_df, error_message
+    except pd.errors.ParserError:
+        error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format"
+        logger.error(error_message)
+        return empty_df, error_message
+    except Exception as e:
+        error_message = f"Error loading benchmark data: {str(e)}"
+        logger.error(error_message)
+        return empty_df, error_message
+
+
 def initialize():
     """Initialize the application."""
     # Load models from file
@@ -88,6 +190,7 @@
         leaderboard_df=format_leaderboard_for_display(
            judge_manager.leaderboard_df,
         ),
+        load_benchmark_fn=load_benchmark_data,
     )
 
     return ui.create_interface()
@@ -634,9 +737,9 @@ def handle_both_incorrect(judge_manager: JudgeManager) -> str:
 
 
 def main():
-    """
+    """Initialize the application."""
     demo = initialize()
-    demo.launch()
+    demo.launch(server_name="0.0.0.0")
 
 
 if __name__ == "__main__":
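The new load_benchmark_data() returns a display-ready DataFrame plus a Markdown summary, and is handed to the UI through the new load_benchmark_fn argument. A quick manual check (a sketch, assuming the repo root as working directory so the relative benchmarks/ path resolves, and that src.app imports cleanly outside Gradio):

# Sketch: exercise load_benchmark_data() from a REPL or a small script.
from src.app import load_benchmark_data

df, info_md = load_benchmark_data("grounding", "allenai-wildjailbreak")
print(info_md)    # Markdown summary rendered in the Benchmarks tab
print(df.head())  # judges sorted by Balanced Accuracy, best first

# Failure paths return an empty DataFrame plus an error message instead of raising:
empty, msg = load_benchmark_data("grounding", "no-such-dataset")
assert empty.empty and msg.startswith("Error")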
src/ui.py
CHANGED
@@ -1,7 +1,10 @@
+import glob
+import os
 from typing import Callable
 
 import gradio as gr
 import pandas as pd
+from loguru import logger
 
 from src.config import TEST_TYPES
 
@@ -21,6 +24,7 @@ class UI:
         both_incorrect_fn: Callable,
         refresh_leaderboard_fn: Callable,
         leaderboard_df: pd.DataFrame,
+        load_benchmark_fn: Callable = None,
     ):
         self.refresh_fn = refresh_fn
         self.submit_fn = submit_fn
@@ -32,6 +36,7 @@
         self.both_incorrect_fn = both_incorrect_fn
         self.refresh_leaderboard_fn = refresh_leaderboard_fn
         self.leaderboard_df = leaderboard_df
+        self.load_benchmark_fn = load_benchmark_fn
 
     def create_interface(self) -> gr.Blocks:
         """Create the Gradio interface"""
@@ -151,6 +156,55 @@
                 )
                 refresh_leaderboard = gr.Button("Refresh Leaderboard")
 
+            # New Benchmarks Tab
+            with gr.Tab("📊 Benchmarks"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        # Get available test types from the benchmarks directory
+                        try:
+                            benchmark_types = [
+                                d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
+                            ]
+                        except (FileNotFoundError, PermissionError):
+                            # Fallback if directory can't be read
+                            benchmark_types = []
+                            logger.error("Failed to read benchmarks directory")
+
+                        benchmark_type_dropdown = gr.Dropdown(
+                            choices=benchmark_types,
+                            label="Benchmark Type",
+                            info="Select the type of benchmark to view",
+                            value=benchmark_types[0] if benchmark_types else None,
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        # Get available benchmark datasets for the selected type
+                        benchmark_dataset_dropdown = gr.Dropdown(
+                            label="Benchmark Dataset",
+                            info="Select the benchmark dataset to view",
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        benchmark_dataframe = gr.DataFrame(
+                            headers=[
+                                "Judge Name",
+                                "F1 Score",
+                                "Balanced Accuracy",
+                                "Avg Latency (s)",
+                                "Correct",
+                                "Total",
+                            ],
+                            label="Benchmark Results",
+                            interactive=False,
+                        )
+
+                        benchmark_info = gr.Markdown("Select a benchmark dataset to view results")
+
+                        # Add a refresh button
+                        refresh_benchmarks_btn = gr.Button("Refresh Benchmark List")
+
             with gr.Tab("About"):
                 self._create_about_tab()
 
@@ -250,36 +304,114 @@
                 queue=False,  # Run immediately without waiting in queue
             )
 
+            # Show result buttons after both evaluations are done
             select_eval1.click(
                 self.winner1_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             select_eval2.click(
                 self.winner2_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             both_correct_btn.click(
                 self.both_correct_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             both_incorrect_btn.click(
                 self.both_incorrect_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             refresh_leaderboard.click(
                 self.refresh_leaderboard_fn,
                 [],
-                leaderboard_dataframe,
+                [leaderboard_dataframe],
             )
 
+            # Benchmark tab event handlers
+            def get_benchmark_datasets(benchmark_type):
+                if not benchmark_type:
+                    return gr.update(choices=[], value=None)
+
+                try:
+                    # Find all CSV files that match the pattern <dataset>-judges-metrics.csv
+                    pattern = os.path.join("benchmarks", benchmark_type, "*-judges-metrics.csv")
+                    files = glob.glob(pattern)
+
+                    # Extract dataset names from file paths
+                    datasets = []
+                    for file in files:
+                        basename = os.path.basename(file)
+                        dataset_name = basename.replace("-judges-metrics.csv", "")
+                        datasets.append(dataset_name)
+
+                    logger.info(f"Found datasets for {benchmark_type}: {datasets}")
+
+                    if datasets:
+                        return gr.update(choices=datasets, value=datasets[0])
+                    else:
+                        return gr.update(choices=[], value=None)
+                except Exception as e:
+                    logger.error(f"Error getting benchmark datasets: {e}")
+                    return gr.update(choices=[], value=None)
+
+            def refresh_benchmark_types():
+                try:
+                    new_benchmark_types = [
+                        d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
+                    ]
+
+                    logger.info(f"Refreshed benchmark types: {new_benchmark_types}")
+
+                    # Update the benchmark type dropdown
+                    if new_benchmark_types:
+                        # Return the updated dropdown and trigger dataset reload
+                        return gr.update(choices=new_benchmark_types, value=new_benchmark_types[0])
+                    else:
+                        return gr.update(choices=[], value=None)
+                except (FileNotFoundError, PermissionError) as e:
+                    logger.error(f"Error refreshing benchmark types: {e}")
+                    return gr.update(choices=[], value=None)
+
+            # Set up event handlers for the benchmark tab
+            benchmark_type_dropdown.change(
+                get_benchmark_datasets,
+                [benchmark_type_dropdown],
+                [benchmark_dataset_dropdown],
+            )
+
+            # Add refresh button handler
+            refresh_benchmarks_btn.click(
+                refresh_benchmark_types,
+                [],
+                [benchmark_type_dropdown],
+            ).then(  # Chain the dataset dropdown update after the type is refreshed
+                get_benchmark_datasets,
+                [benchmark_type_dropdown],
+                [benchmark_dataset_dropdown],
+            )
+
+            # Add handler to load benchmark data when dataset is selected
+            if self.load_benchmark_fn:
+                benchmark_dataset_dropdown.change(
+                    self.load_benchmark_fn,
+                    [benchmark_type_dropdown, benchmark_dataset_dropdown],
+                    [benchmark_dataframe, benchmark_info],
+                )
+
+            # Load initial datasets for the default benchmark type if it exists
+            if benchmark_types:
+                initial_benchmark_type = benchmark_types[0]
+                logger.info(f"Loading initial datasets for benchmark type: {initial_benchmark_type}")
+                benchmark_type_dropdown.value = initial_benchmark_type
+
             # Add footer
             with gr.Row():
                 gr.HTML(
@@ -290,80 +422,78 @@
                    """
                 )
 
-
+        return demo
 
     def _create_about_tab(self) -> None:
         """Create the About tab content"""
         gr.Markdown(
             """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            ### Test Types
-            - **Prompt Injections**: Evaluates how well judges detect and assess prompt
-            injection attempts
-            - **Safety**: Tests judges on responses involving potentially harmful content
-            - **Grounding**: Assesses judges' ability to evaluate factual correctness
-            - **Policy**: Tests judges on evaluating responses to ethical dilemmas and
-            policy questions
-
-            ### Purpose
-            This platform helps determine which AI evaluation methods are most aligned
-            with human preferences.
-            """
+            # About AI Evaluators Arena
+
+            This platform allows you to evaluate and compare different AI judges in their ability to assess various types of content.
+
+            ## How it works
+            1. Choose a test type from the dropdown
+            2. Fill in the input fields or load a random example from our dataset
+            3. Click "Evaluate" to get assessments from two randomly selected judges
+            4. Choose which evaluation you think is better
+            5. See which judge provided each evaluation
+
+            ## Test Types
+            - **Grounding**: Evaluate if a claim is grounded in a given text
+            - **Prompt Injections**: Detect attempts to manipulate or jailbreak the model
+            - **Safety**: Identify harmful, offensive, or dangerous content
+            - **Policy**: Determine if output complies with a given policy
+
+            ## Leaderboard
+            The leaderboard tracks judge performance using an ELO rating system, with scores adjusted based on human preferences.
+            """
         )
 
     def _update_input_visibility(self, test_type):
-        """Update
-        # Hide all inputs first
-        text_visible = False
-        claim_visible = False
-        single_text_visible = False
-        policy_input_visible = False
-        policy_output_visible = False
-        policy_assertion_visible = False
-        input_visible = False
-        output_visible = False
-
-        # Show the appropriate inputs based on test type
+        """Update which input fields are visible based on the selected test type"""
         if test_type == "grounding":
-
-
+            return [
+                gr.update(visible=True),  # text_input
+                gr.update(visible=True),  # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
         elif test_type in ["prompt injections", "safety"]:
-
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=True),  # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
         elif test_type == "policy":
-
-
-
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=True),  # policy_input
+                gr.update(visible=True),  # policy_output
+                gr.update(visible=True),  # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
         else:
-            #
-
-
-
-
-
-
-
-
-
-
-            gr.update(visible=input_visible),
-            gr.update(visible=output_visible),
-        )
+            # Legacy fallback
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=True),  # input_text
+                gr.update(visible=True),  # output_text
+            ]
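_update_input_visibility() now returns a list of eight gr.update() values; the order has to match the outputs list of whichever dropdown .change() event consumes them, and that wiring sits outside this diff. A self-contained sketch of the same pattern with two illustrative components (names below are not the app's real ones):

# Standalone sketch of the pattern used above: a dropdown's .change() maps a
# returned list of gr.update() values onto an equally long outputs list.
import gradio as gr


def toggle(test_type):
    is_grounding = test_type == "grounding"
    return [
        gr.update(visible=is_grounding),      # shown only for grounding
        gr.update(visible=not is_grounding),  # shown for every other test type
    ]


with gr.Blocks() as demo:
    test_type = gr.Dropdown(["grounding", "safety"], value="grounding", label="Test type")
    text_input = gr.Textbox(label="Text", visible=True)
    single_text_input = gr.Textbox(label="Single text", visible=False)
    test_type.change(toggle, [test_type], [text_input, single_text_input])

if __name__ == "__main__":
    demo.launch()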