Hotfixes and benchmarks
Files changed:
- benchmarks/grounding/allenai-wildjailbreak-judges-metrics.csv  +24 -0 (new file)
- src/app.py  +105 -2
- src/ui.py  +200 -70
benchmarks/grounding/allenai-wildjailbreak-judges-metrics.csv
ADDED
@@ -0,0 +1,24 @@
+judge_id,judge_name,dataset,f1,bacc,avg_latency,total_time,count,correct
+meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,allenai-wildjailbreak,0.21428571428571427,0.12,0.8566377925872802,85.66377925872803,100,12
+meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,allenai-wildjailbreak,0.7421383647798742,0.59,1.1272331833839417,112.72331833839417,100,59
+meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,allenai-wildjailbreak,0.5294117647058824,0.36,0.4795390796661377,47.95390796661377,100,36
+meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,allenai-wildjailbreak,0.5401459854014599,0.37,5.12372554063797,512.372554063797,100,37
+meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,allenai-wildjailbreak,0.8950276243093923,0.81,1.0803885889053344,108.03885889053345,100,81
+gemma-2-27b-it,Gemma 2 27B,allenai-wildjailbreak,0.3050847457627119,0.18,1.0046957421302796,100.46957421302795,100,18
+gemma-2-9b-it,Gemma 2 9B,allenai-wildjailbreak,0.4126984126984127,0.26,0.5609125876426697,56.09125876426697,100,26
+mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,allenai-wildjailbreak,0.14814814814814814,0.08,30.8281710100174,3082.81710100174,100,8
+o3-mini, o3-mini,allenai-wildjailbreak,0.09523809523809523,0.05,3.8824497079849243,388.24497079849243,100,5
+gpt-4.1,GPT-4.1,allenai-wildjailbreak,0.23008849557522124,0.13,1.033246524333954,103.32465243339539,100,13
+gpt-4o,GPT-4o,allenai-wildjailbreak,0.09523809523809523,0.05,1.0374453783035278,103.74453783035278,100,5
+gpt-4-turbo,GPT-4 Turbo,allenai-wildjailbreak,0.27586206896551724,0.16,1.118471143245697,111.8471143245697,100,16
+gpt-3.5-turbo,GPT-3.5 Turbo,allenai-wildjailbreak,0.37398373983739835,0.23,0.6795877623558044,67.95877623558044,100,23
+claude-3-haiku-20240307,Claude 3 Haiku,allenai-wildjailbreak,0.05825242718446602,0.03,0.6856383895874023,68.56383895874023,100,3
+claude-3-sonnet-20240229,Claude 3 Sonnet,allenai-wildjailbreak,0.5074626865671642,0.34,0.8858131814002991,88.58131814002991,100,34
+claude-3-opus-latest,Claude 3 Opus,allenai-wildjailbreak,0.6301369863013698,0.46,1.6495161414146424,164.95161414146423,100,46
+claude-3-5-sonnet-latest,Claude 3.5 Sonnet,allenai-wildjailbreak,0.7878787878787878,0.65,1.9892964005470275,198.92964005470276,100,65
+claude-3-5-haiku-latest,Claude 3.5 Haiku,allenai-wildjailbreak,0.8439306358381503,0.73,0.9016167116165161,90.16167116165161,100,73
+qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,allenai-wildjailbreak,0.6301369863013698,0.46,0.8251621770858765,82.51621770858765,100,46
+qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,allenai-wildjailbreak,0.48484848484848486,0.32,0.5128253746032715,51.28253746032715,100,32
+deepseek-v3,DeepSeek V3,allenai-wildjailbreak,0.49624060150375937,0.33,6.41716570854187,641.716570854187,100,33
+deepseek-r1,DeepSeek R1,allenai-wildjailbreak,0.46153846153846156,0.3,6.692396397590637,669.2396397590637,100,30
+qualifire-eval,Qualifire,allenai-wildjailbreak,0.46153846153846156,0.3,0.9121422719955444,91.21422719955444,100,30
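The columns are judge_id, judge_name, dataset, f1, bacc, avg_latency, total_time, count, correct; in every row total_time is avg_latency × count (up to float rounding), so latency is tracked per sample and summed. The script that computed these metrics is not part of this commit; the following is only a minimal sketch of how one such row could be produced, assuming binary labels and judge verdicts and taking the f1/bacc column names at face value (scikit-learn's f1_score and balanced_accuracy_score).

# Illustrative sketch only -- not the project's actual benchmark script.
# Assumes binary ground-truth labels and judge verdicts (1 = jailbreak/violation).
from sklearn.metrics import balanced_accuracy_score, f1_score


def metrics_row(judge_id: str, judge_name: str, dataset: str,
                labels: list[int], preds: list[int], latencies: list[float]) -> dict:
    """Build one row with the same columns as *-judges-metrics.csv (assumed schema)."""
    total_time = sum(latencies)
    return {
        "judge_id": judge_id,
        "judge_name": judge_name,
        "dataset": dataset,
        "f1": f1_score(labels, preds),                   # column: f1
        "bacc": balanced_accuracy_score(labels, preds),  # column: bacc
        "avg_latency": total_time / len(latencies),      # seconds per sample
        "total_time": total_time,
        "count": len(labels),
        "correct": sum(int(y == p) for y, p in zip(labels, preds)),
    }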
src/app.py
CHANGED
@@ -1,4 +1,5 @@
 import datetime
+import os
 from typing import Any, Dict, Optional, Tuple
 
 import gradio as gr
@@ -16,6 +17,21 @@ eval2: Optional[Dict[str, Any]] = None
 selected_judges: list = []
 current_test_type: str = "grounding"
 
+# Add more detailed logging
+logger.info("EvalArena starting up")
+# Check if benchmarks directory exists
+if os.path.exists("benchmarks") and os.path.isdir("benchmarks"):
+    benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))]
+    logger.info(f"Found benchmark directories: {benchmark_dirs}")
+
+    # Log CSV files in each directory
+    for d in benchmark_dirs:
+        dir_path = os.path.join("benchmarks", d)
+        files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")]
+        logger.info(f"Benchmark directory '{d}' contains files: {files}")
+else:
+    logger.warning("Benchmarks directory not found or not accessible")
+
 
 def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
     """Format the leaderboard dataframe for display in the UI.
@@ -31,6 +47,92 @@
     return display_df
 
 
+def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]:
+    """Load benchmark data for the selected benchmark type and dataset.
+
+    Args:
+        benchmark_type: The type of benchmark (e.g., 'prompt-injections')
+        dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak')
+
+    Returns:
+        Tuple containing:
+        - DataFrame formatted for display
+        - Markdown string with benchmark information
+    """
+    # Create empty dataframe with the expected columns
+    empty_df = pd.DataFrame(
+        columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"]
+    )
+
+    # Handle case when None or empty values are passed
+    if not benchmark_type or not dataset_name:
+        logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}")
+        return empty_df, "Please select both a benchmark type and dataset"
+
+    try:
+        # Construct the path to the benchmark metrics file
+        metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv")
+
+        logger.info(f"Loading benchmark from {metrics_file}")
+
+        if not os.path.exists(metrics_file):
+            error_message = f"Error: Could not find metrics file at {metrics_file}"
+            logger.error(error_message)
+            return empty_df, error_message
+
+        # Load the CSV file
+        df = pd.read_csv(metrics_file)
+        logger.info(f"Loaded benchmark with {len(df)} rows")
+
+        # Check if the file has the required columns
+        required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"]
+        missing_columns = [col for col in required_columns if col not in df.columns]
+
+        if missing_columns:
+            error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}"
+            logger.error(error_message)
+            return empty_df, error_message
+
+        # Format the dataframe for display
+        display_df = pd.DataFrame()
+        display_df["Judge Name"] = df["judge_name"]
+        display_df["F1 Score"] = df["f1"].round(3)
+        display_df["Balanced Accuracy"] = df["bacc"].round(3)
+        display_df["Avg Latency (s)"] = df["avg_latency"].round(2)
+        display_df["Correct"] = df["correct"]
+        display_df["Total"] = df["count"]
+
+        # Sort by balanced accuracy descending
+        display_df = display_df.sort_values("Balanced Accuracy", ascending=False)
+
+        # Generate information about the benchmark
+        total_samples = df["count"].iloc[0] if not df.empty else 0
+        info_md = f"""
+# Benchmark: {dataset_name}
+
+**Type**: {benchmark_type}
+**Total Samples**: {total_samples}
+
+This table shows how different AI judge models performed on this benchmark.
+Higher F1 score and balanced accuracy indicate better performance.
+"""
+
+        return display_df, info_md
+
+    except pd.errors.EmptyDataError:
+        error_message = "Error: The CSV file is empty"
+        logger.error(error_message)
+        return empty_df, error_message
+    except pd.errors.ParserError:
+        error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format"
+        logger.error(error_message)
+        return empty_df, error_message
+    except Exception as e:
+        error_message = f"Error loading benchmark data: {str(e)}"
+        logger.error(error_message)
+        return empty_df, error_message
+
+
 def initialize():
     """Initialize the application."""
     # Load models from file
@@ -88,6 +190,7 @@
         leaderboard_df=format_leaderboard_for_display(
            judge_manager.leaderboard_df,
         ),
+        load_benchmark_fn=load_benchmark_data,
     )
 
     return ui.create_interface()
@@ -634,9 +737,9 @@ def handle_both_incorrect(judge_manager: JudgeManager) -> str:
 
 
 def main():
-    """
+    """Initialize the application."""
     demo = initialize()
-    demo.launch()
+    demo.launch(server_name="0.0.0.0")
 
 
 if __name__ == "__main__":
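The new load_benchmark_data() returns a display-ready DataFrame plus a Markdown summary, and is handed to the UI through the new load_benchmark_fn argument. A quick manual check (a sketch, assuming the repo root as working directory so the relative benchmarks/ path resolves, and that src.app imports cleanly outside Gradio):

# Sketch: exercise load_benchmark_data() from a REPL or a small script.
from src.app import load_benchmark_data

df, info_md = load_benchmark_data("grounding", "allenai-wildjailbreak")
print(info_md)    # Markdown summary rendered in the Benchmarks tab
print(df.head())  # judges sorted by Balanced Accuracy, best first

# Failure paths return an empty DataFrame plus an error message instead of raising:
empty, msg = load_benchmark_data("grounding", "no-such-dataset")
assert empty.empty and msg.startswith("Error")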
src/ui.py
CHANGED
@@ -1,7 +1,10 @@
+import glob
+import os
 from typing import Callable
 
 import gradio as gr
 import pandas as pd
+from loguru import logger
 
 from src.config import TEST_TYPES
 
@@ -21,6 +24,7 @@ class UI:
         both_incorrect_fn: Callable,
         refresh_leaderboard_fn: Callable,
         leaderboard_df: pd.DataFrame,
+        load_benchmark_fn: Callable = None,
     ):
         self.refresh_fn = refresh_fn
         self.submit_fn = submit_fn
@@ -32,6 +36,7 @@
         self.both_incorrect_fn = both_incorrect_fn
         self.refresh_leaderboard_fn = refresh_leaderboard_fn
         self.leaderboard_df = leaderboard_df
+        self.load_benchmark_fn = load_benchmark_fn
 
     def create_interface(self) -> gr.Blocks:
         """Create the Gradio interface"""
@@ -151,6 +156,55 @@
                 )
                 refresh_leaderboard = gr.Button("Refresh Leaderboard")
 
+            # New Benchmarks Tab
+            with gr.Tab("📊 Benchmarks"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        # Get available test types from the benchmarks directory
+                        try:
+                            benchmark_types = [
+                                d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
+                            ]
+                        except (FileNotFoundError, PermissionError):
+                            # Fallback if directory can't be read
+                            benchmark_types = []
+                            logger.error("Failed to read benchmarks directory")
+
+                        benchmark_type_dropdown = gr.Dropdown(
+                            choices=benchmark_types,
+                            label="Benchmark Type",
+                            info="Select the type of benchmark to view",
+                            value=benchmark_types[0] if benchmark_types else None,
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        # Get available benchmark datasets for the selected type
+                        benchmark_dataset_dropdown = gr.Dropdown(
+                            label="Benchmark Dataset",
+                            info="Select the benchmark dataset to view",
+                        )
+
+                with gr.Row():
+                    with gr.Column():
+                        benchmark_dataframe = gr.DataFrame(
+                            headers=[
+                                "Judge Name",
+                                "F1 Score",
+                                "Balanced Accuracy",
+                                "Avg Latency (s)",
+                                "Correct",
+                                "Total",
+                            ],
+                            label="Benchmark Results",
+                            interactive=False,
+                        )
+
+                        benchmark_info = gr.Markdown("Select a benchmark dataset to view results")
+
+                        # Add a refresh button
+                        refresh_benchmarks_btn = gr.Button("Refresh Benchmark List")
+
             with gr.Tab("About"):
                 self._create_about_tab()
 
@@ -250,36 +304,114 @@
                 queue=False,  # Run immediately without waiting in queue
             )
 
+            # Show result buttons after both evaluations are done
             select_eval1.click(
                 self.winner1_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             select_eval2.click(
                 self.winner2_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             both_correct_btn.click(
                 self.both_correct_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             both_incorrect_btn.click(
                 self.both_incorrect_fn,
                 [],
-                result_text,
+                [result_text],
             )
 
             refresh_leaderboard.click(
                 self.refresh_leaderboard_fn,
                 [],
-                leaderboard_dataframe,
+                [leaderboard_dataframe],
             )
 
+            # Benchmark tab event handlers
+            def get_benchmark_datasets(benchmark_type):
+                if not benchmark_type:
+                    return gr.update(choices=[], value=None)
+
+                try:
+                    # Find all CSV files that match the pattern <dataset>-judges-metrics.csv
+                    pattern = os.path.join("benchmarks", benchmark_type, "*-judges-metrics.csv")
+                    files = glob.glob(pattern)
+
+                    # Extract dataset names from file paths
+                    datasets = []
+                    for file in files:
+                        basename = os.path.basename(file)
+                        dataset_name = basename.replace("-judges-metrics.csv", "")
+                        datasets.append(dataset_name)
+
+                    logger.info(f"Found datasets for {benchmark_type}: {datasets}")
+
+                    if datasets:
+                        return gr.update(choices=datasets, value=datasets[0])
+                    else:
+                        return gr.update(choices=[], value=None)
+                except Exception as e:
+                    logger.error(f"Error getting benchmark datasets: {e}")
+                    return gr.update(choices=[], value=None)
+
+            def refresh_benchmark_types():
+                try:
+                    new_benchmark_types = [
+                        d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
+                    ]
+
+                    logger.info(f"Refreshed benchmark types: {new_benchmark_types}")
+
+                    # Update the benchmark type dropdown
+                    if new_benchmark_types:
+                        # Return the updated dropdown and trigger dataset reload
+                        return gr.update(choices=new_benchmark_types, value=new_benchmark_types[0])
+                    else:
+                        return gr.update(choices=[], value=None)
+                except (FileNotFoundError, PermissionError) as e:
+                    logger.error(f"Error refreshing benchmark types: {e}")
+                    return gr.update(choices=[], value=None)
+
+            # Set up event handlers for the benchmark tab
+            benchmark_type_dropdown.change(
+                get_benchmark_datasets,
+                [benchmark_type_dropdown],
+                [benchmark_dataset_dropdown],
+            )
+
+            # Add refresh button handler
+            refresh_benchmarks_btn.click(
+                refresh_benchmark_types,
+                [],
+                [benchmark_type_dropdown],
+            ).then(  # Chain the dataset dropdown update after the type is refreshed
+                get_benchmark_datasets,
+                [benchmark_type_dropdown],
+                [benchmark_dataset_dropdown],
+            )
+
+            # Add handler to load benchmark data when dataset is selected
+            if self.load_benchmark_fn:
+                benchmark_dataset_dropdown.change(
+                    self.load_benchmark_fn,
+                    [benchmark_type_dropdown, benchmark_dataset_dropdown],
+                    [benchmark_dataframe, benchmark_info],
+                )
+
+            # Load initial datasets for the default benchmark type if it exists
+            if benchmark_types:
+                initial_benchmark_type = benchmark_types[0]
+                logger.info(f"Loading initial datasets for benchmark type: {initial_benchmark_type}")
+                benchmark_type_dropdown.value = initial_benchmark_type
+
             # Add footer
             with gr.Row():
                 gr.HTML(
@@ -290,80 +422,78 @@
                    """
                 )
 
-
+        return demo
 
     def _create_about_tab(self) -> None:
         """Create the About tab content"""
         gr.Markdown(
             """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            ### Test Types
-            - **Prompt Injections**: Evaluates how well judges detect and assess prompt
-            injection attempts
-            - **Safety**: Tests judges on responses involving potentially harmful content
-            - **Grounding**: Assesses judges' ability to evaluate factual correctness
-            - **Policy**: Tests judges on evaluating responses to ethical dilemmas and
-            policy questions
-
-            ### Purpose
-            This platform helps determine which AI evaluation methods are most aligned
-            with human preferences.
-            """
+            # About AI Evaluators Arena
+
+            This platform allows you to evaluate and compare different AI judges in their ability to assess various types of content.
+
+            ## How it works
+            1. Choose a test type from the dropdown
+            2. Fill in the input fields or load a random example from our dataset
+            3. Click "Evaluate" to get assessments from two randomly selected judges
+            4. Choose which evaluation you think is better
+            5. See which judge provided each evaluation
+
+            ## Test Types
+            - **Grounding**: Evaluate if a claim is grounded in a given text
+            - **Prompt Injections**: Detect attempts to manipulate or jailbreak the model
+            - **Safety**: Identify harmful, offensive, or dangerous content
+            - **Policy**: Determine if output complies with a given policy
+
+            ## Leaderboard
+            The leaderboard tracks judge performance using an ELO rating system, with scores adjusted based on human preferences.
+            """
         )
 
     def _update_input_visibility(self, test_type):
-        """Update
-        # Hide all inputs first
-        text_visible = False
-        claim_visible = False
-        single_text_visible = False
-        policy_input_visible = False
-        policy_output_visible = False
-        policy_assertion_visible = False
-        input_visible = False
-        output_visible = False
-
-        # Show the appropriate inputs based on test type
+        """Update which input fields are visible based on the selected test type"""
         if test_type == "grounding":
-
-
+            return [
+                gr.update(visible=True),  # text_input
+                gr.update(visible=True),  # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
         elif test_type in ["prompt injections", "safety"]:
-
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=True),  # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
         elif test_type == "policy":
-
-
-
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=True),  # policy_input
+                gr.update(visible=True),  # policy_output
+                gr.update(visible=True),  # policy_assertion
+                gr.update(visible=False),  # input_text
+                gr.update(visible=False),  # output_text
+            ]
         else:
-            #
-
-
-
-
-
-
-
-
-
-
-            gr.update(visible=input_visible),
-            gr.update(visible=output_visible),
-        )
+            # Legacy fallback
+            return [
+                gr.update(visible=False),  # text_input
+                gr.update(visible=False),  # claim_input
+                gr.update(visible=False),  # single_text_input
+                gr.update(visible=False),  # policy_input
+                gr.update(visible=False),  # policy_output
+                gr.update(visible=False),  # policy_assertion
+                gr.update(visible=True),  # input_text
+                gr.update(visible=True),  # output_text
+            ]
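_update_input_visibility() now returns a list of eight gr.update() values; the order has to match the outputs list of whichever dropdown .change() event consumes them, and that wiring sits outside this diff. A self-contained sketch of the same pattern with two illustrative components (names below are not the app's real ones):

# Standalone sketch of the pattern used above: a dropdown's .change() maps a
# returned list of gr.update() values onto an equally long outputs list.
import gradio as gr


def toggle(test_type):
    is_grounding = test_type == "grounding"
    return [
        gr.update(visible=is_grounding),      # shown only for grounding
        gr.update(visible=not is_grounding),  # shown for every other test type
    ]


with gr.Blocks() as demo:
    test_type = gr.Dropdown(["grounding", "safety"], value="grounding", label="Test type")
    text_input = gr.Textbox(label="Text", visible=True)
    single_text_input = gr.Textbox(label="Single text", visible=False)
    test_type.change(toggle, [test_type], [text_input, single_text_input])

if __name__ == "__main__":
    demo.launch()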