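"""EvalArena Gradio app.

Presents a piece of content to two randomly selected, anonymized judge models,
lets the user vote on which evaluation is better, and maintains an ELO
leaderboard plus static benchmark result tables.
"""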
import datetime
import os
from typing import Any, Dict, List, Optional, Tuple
import gradio as gr
import pandas as pd
from loguru import logger
from src.config import HISTORY_PATH
from src.data_manager import get_random_example, load_models
from src.judge import JudgeManager
from src.ui import UI
# Global state for the current evaluation round
eval1: Optional[Dict[str, Any]] = None
eval2: Optional[Dict[str, Any]] = None
selected_judges: List[Dict[str, Any]] = []
current_test_type: str = "grounding"
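# NOTE: module-level globals are shared by every session served by this Gradio
# process, so concurrent users could overwrite each other's in-flight round;
# this is assumed to be acceptable for a single-user demo Space.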
# Startup diagnostics
logger.info("EvalArena starting up")
# Check if benchmarks directory exists
if os.path.exists("benchmarks") and os.path.isdir("benchmarks"):
benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))]
logger.info(f"Found benchmark directories: {benchmark_dirs}")
# Log CSV files in each directory
for d in benchmark_dirs:
dir_path = os.path.join("benchmarks", d)
files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")]
logger.info(f"Benchmark directory '{d}' contains files: {files}")
else:
logger.warning("Benchmarks directory not found or not accessible")
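
# The scan above is diagnostic only; benchmark CSVs are read on demand in
# load_benchmark_data() when a user selects a benchmark in the UI.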
def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
    """Format the leaderboard dataframe for display in the UI.

    Keeping only the display columns ensures consistent rendering across
    environments such as Hugging Face Spaces.
    """
# Create a copy of the dataframe with only the columns we want to display
display_df = pd.DataFrame()
display_df["Judge Name"] = df["judge_name"]
display_df["ELO Score"] = df["elo_score"]
display_df["Wins"] = df["wins"]
display_df["Losses"] = df["losses"]
display_df["Total Evaluations"] = df["total_evaluations"]
return display_df
def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]:
    """Load benchmark data for the selected benchmark type and dataset.

    Args:
        benchmark_type: The type of benchmark (e.g., 'prompt-injections').
        dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak').

    Returns:
        Tuple containing:
            - DataFrame formatted for display
            - Markdown string with benchmark information
    """
# Create empty dataframe with the expected columns
empty_df = pd.DataFrame(
columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"]
)
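    # Returning this placeholder on failure keeps the Gradio table's column
    # headers stable instead of rendering an empty, header-less component.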
# Handle case when None or empty values are passed
if not benchmark_type or not dataset_name:
logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}")
return empty_df, "Please select both a benchmark type and dataset"
try:
# Construct the path to the benchmark metrics file
metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv")
logger.info(f"Loading benchmark from {metrics_file}")
if not os.path.exists(metrics_file):
error_message = f"Error: Could not find metrics file at {metrics_file}"
logger.error(error_message)
return empty_df, error_message
# Load the CSV file
df = pd.read_csv(metrics_file)
logger.info(f"Loaded benchmark with {len(df)} rows")
# Check if the file has the required columns
required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}"
logger.error(error_message)
return empty_df, error_message
# Format the dataframe for display
display_df = pd.DataFrame()
display_df["Judge Name"] = df["judge_name"]
display_df["F1 Score"] = df["f1"].round(3)
display_df["Balanced Accuracy"] = df["bacc"].round(3)
display_df["Avg Latency (s)"] = df["avg_latency"].round(2)
display_df["Correct"] = df["correct"]
display_df["Total"] = df["count"]
# Sort by balanced accuracy descending
display_df = display_df.sort_values("Balanced Accuracy", ascending=False)
# Generate information about the benchmark
total_samples = df["count"].iloc[0] if not df.empty else 0
        info_md = f"""
# Benchmark: {dataset_name}

**Type**: {benchmark_type}

**Total Samples**: {total_samples}

This table shows how different AI judge models performed on this benchmark.
Higher F1 score and balanced accuracy indicate better performance.
"""
return display_df, info_md
except pd.errors.EmptyDataError:
error_message = "Error: The CSV file is empty"
logger.error(error_message)
return empty_df, error_message
except pd.errors.ParserError:
error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format"
logger.error(error_message)
return empty_df, error_message
except Exception as e:
error_message = f"Error loading benchmark data: {str(e)}"
logger.error(error_message)
return empty_df, error_message
def initialize():
    """Initialize the application: load judges, build the JudgeManager, and create the UI."""
    global current_test_type

    # Load models from file
    judges = load_models()
    logger.info(f"Loaded {len(judges)} judges")

    # Initialize judge manager
    judge_manager = JudgeManager(judges)

    # Set default test type
    current_test_type = "grounding"
# Create UI
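    # All lambdas below close over this single JudgeManager instance, so ELO
    # updates and the shared Qualifire result persist across UI interactions.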
ui = UI(
refresh_fn=lambda test_type: refresh_example(test_type, judge_manager),
submit_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: submit_example(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
judge_manager,
),
evaluate1_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation1(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
judge_manager,
),
evaluate2_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation2(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
judge_manager,
),
winner1_fn=lambda: select_winner("Evaluation 1", judge_manager),
winner2_fn=lambda: select_winner("Evaluation 2", judge_manager),
both_correct_fn=lambda: handle_both_correct(judge_manager),
both_incorrect_fn=lambda: handle_both_incorrect(judge_manager),
refresh_leaderboard_fn=lambda: format_leaderboard_for_display(
judge_manager.leaderboard_df,
),
leaderboard_df=format_leaderboard_for_display(
judge_manager.leaderboard_df,
),
load_benchmark_fn=load_benchmark_data,
)
return ui.create_interface()
def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple:
"""Get a random example for the given test type."""
try:
# Get example from the dataset
logger.info(f"Getting example for test type: {test_type}")
example = get_random_example(test_type)
# Default values for all return fields
input_text = ""
output_text = ""
text_input = ""
claim_input = ""
single_text_input = ""
policy_input = ""
policy_output = ""
policy_assertion = ""
# Populate fields based on test type
if test_type == "grounding":
text_input = example["text"]
claim_input = example["claim"]
elif test_type in ["prompt_injections", "safety"]:
single_text_input = example["text"]
elif test_type == "policy":
policy_input = example["input"]
policy_output = example["output"]
policy_assertion = example["assertion"]
else:
# Legacy format
input_text = example.get("text", f"Sample input for {test_type}")
output_text = example.get("claim", f"Sample output for {test_type}")
return (
input_text,
output_text,
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
)
except Exception as e:
logger.error(f"Error getting example: {e}")
        # Return empty strings for all eight output fields
        return ("", "", "", "", "", "", "", "")
def submit_example(
text_input: str,
claim_input: str,
single_text_input: str,
policy_input: str,
policy_output: str,
policy_assertion: str,
test_type: str,
judge_manager: JudgeManager,
) -> Tuple:
"""Prepare for evaluation and select random judges."""
global selected_judges, current_test_type, eval1, eval2
try:
logger.info(f"Preparing evaluation for test type: {test_type}")
current_test_type = test_type
# Reset evaluations
eval1 = None
eval2 = None
# Select random judges
selected_judges = judge_manager.pick_random_judges()
if len(selected_judges) < 2:
return (
"Error: Not enough judges available",
"Error: Not enough judges available",
None,
None,
None,
None,
None,
None,
None,
gr.update(visible=False),
)
# Format inputs for Qualifire evaluation
input_text, output_text = format_inputs_for_evaluation(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
)
# Get a single Qualifire evaluation to be shared by both judges
try:
qualifire_result, time_elapsed = judge_manager.evaluate_with_qualifire(
input_text,
output_text,
test_type,
as_raw=True, # Get raw result to share between judges
)
logger.info("Completed Qualifire evaluation")
# Store the Qualifire result for both judges to use
judge_manager.shared_qualifire_result = qualifire_result
judge_manager.shared_qualifire_time = time_elapsed
except Exception as e:
logger.error(f"Error during Qualifire evaluation: {str(e)}")
# Continue even if Qualifire fails - judges can still work without it
# Show loading messages while evaluations are in progress
status_text = "Evaluations starting... Both judges will evaluate in parallel."
return (
"Loading evaluation 1...",
"Loading evaluation 2...",
gr.update(value=text_input),
gr.update(value=claim_input),
gr.update(value=single_text_input),
gr.update(value=policy_input),
gr.update(value=policy_output),
gr.update(value=policy_assertion),
gr.update(value=test_type),
gr.update(visible=True, value=status_text),
)
except Exception as e:
logger.error(f"Error preparing evaluation: {e}")
return (
f"Error: {str(e)}",
f"Error: {str(e)}",
gr.update(value=text_input),
gr.update(value=claim_input),
gr.update(value=single_text_input),
gr.update(value=policy_input),
gr.update(value=policy_output),
gr.update(value=policy_assertion),
gr.update(value=test_type),
gr.update(visible=False),
)
def get_evaluation1(
text_input: str,
claim_input: str,
single_text_input: str,
policy_input: str,
policy_output: str,
policy_assertion: str,
test_type: str,
judge_manager: JudgeManager,
) -> Tuple[str, Any]:
"""Get evaluation from the first judge."""
global eval1, selected_judges
try:
        if not selected_judges:
return "No judges selected", gr.update(visible=False)
logger.info(f"Starting evaluation 1 with judge {selected_judges[0]['name']}")
# Format inputs based on test type
input_text, output_text = format_inputs_for_evaluation(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
)
# Get evaluation from the first judge
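        # (use_shared_result=True lets this call reuse the Qualifire result
        # that submit_example stored on the judge manager)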
eval1 = judge_manager.get_evaluation(
selected_judges[0],
input_text,
output_text,
test_type,
use_shared_result=True,
)
logger.info("Completed evaluation 1")
# Display the evaluation (time is already included in the evaluation)
display_eval = eval1["display_evaluation"]
# Make the selection button visible once the evaluation is ready
return display_eval, gr.update(visible=True)
except Exception as e:
logger.error(f"Error getting evaluation 1: {e}")
return f"Error: {str(e)}", gr.update(visible=False)
def get_evaluation2(
text_input: str,
claim_input: str,
single_text_input: str,
policy_input: str,
policy_output: str,
policy_assertion: str,
test_type: str,
judge_manager: JudgeManager,
) -> Tuple[str, Any, Any]:
"""Get evaluation from the second judge."""
global eval2, selected_judges
try:
if not selected_judges or len(selected_judges) < 2:
return (
"No judges selected",
gr.update(
visible=False,
),
gr.update(
visible=False,
),
)
logger.info(
f"Starting evaluation 2 with judge {selected_judges[1]['name']}",
)
# Format inputs based on test type
input_text, output_text = format_inputs_for_evaluation(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
)
# Get evaluation from the second judge
eval2 = judge_manager.get_evaluation(
selected_judges[1],
input_text,
output_text,
test_type,
use_shared_result=True,
)
logger.info("Completed evaluation 2")
# Display the evaluation (time is already included in the evaluation)
display_eval = eval2["display_evaluation"]
return (
display_eval,
gr.update(visible=True),
gr.update(visible=True),
)
except Exception as e:
logger.error(f"Error getting evaluation 2: {e}")
return (
f"Error: {str(e)}",
gr.update(
visible=False,
),
gr.update(
visible=False,
),
)
def format_inputs_for_evaluation(
text_input: str,
claim_input: str,
single_text_input: str,
policy_input: str,
policy_output: str,
policy_assertion: str,
test_type: str,
) -> Tuple[str, str]:
"""Format inputs based on test type to be compatible with the evaluation function."""
if test_type == "grounding":
input_text = text_input
output_text = claim_input
elif test_type in ["prompt_injections", "safety"]:
input_text = single_text_input
output_text = ""
elif test_type == "policy":
input_text = f"Input: {policy_input}\nAssertion: {policy_assertion}"
output_text = policy_output
else:
# Default fallback - this should not happen with the UI constraints
input_text = text_input or single_text_input or policy_input
output_text = claim_input or policy_output
return input_text, output_text
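
# Example of format_inputs_for_evaluation (hypothetical values): for
# test_type="policy" with policy_input="Summarize the report",
# policy_output="A two-line summary...", and
# policy_assertion="The answer must cite the report", it returns
# ("Input: Summarize the report\nAssertion: The answer must cite the report",
#  "A two-line summary...").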
def save_to_history(
input_text: str,
output_text: str,
judge1_id: str,
judge1_name: str,
judge1_evaluation: str,
judge1_time: float,
judge2_id: str,
judge2_name: str,
judge2_evaluation: str,
judge2_time: float,
winner_id: str,
) -> None:
"""Save the evaluation results to history CSV file."""
try:
# Create a new row for the history
history_row = {
"timestamp": datetime.datetime.now().isoformat(),
"input": input_text,
"output": output_text,
"judge1_id": judge1_id,
"judge1_name": judge1_name,
"judge1_evaluation": judge1_evaluation,
"judge1_time": judge1_time,
"judge2_id": judge2_id,
"judge2_name": judge2_name,
"judge2_evaluation": judge2_evaluation,
"judge2_time": judge2_time,
"winner_id": winner_id,
}
# Try to load existing history
try:
history_df = pd.read_csv(HISTORY_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
# Create a new history dataframe if file doesn't exist or is empty
history_df = pd.DataFrame(columns=list(history_row.keys()))
# Append the new row
history_df = pd.concat(
[history_df, pd.DataFrame([history_row])],
ignore_index=True,
)
# Save to CSV
history_df.to_csv(HISTORY_PATH, index=False)
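        # Note: the whole history CSV is re-read and rewritten on each vote,
        # which should be fine for the modest volume expected from a demo arena.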
logger.info("Saved evaluation to history")
except Exception as e:
logger.error(f"Error saving to history: {e}")
def select_winner(choice: str, judge_manager: JudgeManager) -> str:
"""Select a winner from the evaluations."""
global eval1, eval2, current_test_type
try:
if not eval1 or not eval2:
return "Error: No evaluations available"
        # Get the input and output text that was evaluated (empty if not recorded)
        input_text = eval1.get("input_text", "")
        output_text = eval1.get("output_text", "")
if choice == "Evaluation 1":
winner_eval = eval1
loser_eval = eval2
winner_id = eval1["judge"]["id"]
else:
winner_eval = eval2
loser_eval = eval1
winner_id = eval2["judge"]["id"]
# Update leaderboard
updated_board = judge_manager.update_leaderboard(
winner_eval["judge"]["id"],
loser_eval["judge"]["id"],
result_type="win",
)
# Save to history
save_to_history(
input_text=input_text,
output_text=output_text,
judge1_id=eval1["judge"]["id"],
judge1_name=eval1["judge"]["name"],
judge1_evaluation=eval1["anonymous_evaluation"],
judge1_time=eval1["elapsed_time"],
judge2_id=eval2["judge"]["id"],
judge2_name=eval2["judge"]["name"],
judge2_evaluation=eval2["anonymous_evaluation"],
judge2_time=eval2["elapsed_time"],
winner_id=winner_id,
)
# Construct result message with revealed judges' names
result_message = f"You selected: {choice}\n\n"
result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
result_message += (
f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
)
# Get the winner's new ELO score
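        # (this assumes judge_id uniquely identifies a single leaderboard row)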
winner_mask = updated_board["judge_id"] == winner_id
winner_elo = updated_board[winner_mask]["elo_score"].values[0]
result_message += f"Winner: {winner_eval['judge']['name']} "
result_message += f"(New ELO: {winner_elo:.2f})\n"
result_message += f"Test Type: {current_test_type}\n"
return result_message
except Exception as e:
logger.error(f"Error selecting winner: {e}")
return f"Error: {str(e)}"
def handle_both_correct(judge_manager: JudgeManager) -> str:
"""Handle case where both evaluations are correct."""
global eval1, eval2, current_test_type
try:
if not eval1 or not eval2:
return "Error: No evaluations available"
        # Get the input and output text that was evaluated (empty if not recorded)
        input_text = eval1.get("input_text", "")
        output_text = eval1.get("output_text", "")
# Update leaderboard for both judges
updated_board = judge_manager.update_leaderboard(
eval1["judge"]["id"],
eval2["judge"]["id"],
result_type="both_correct",
)
# Save to history with both as winners
save_to_history(
input_text=input_text,
output_text=output_text,
judge1_id=eval1["judge"]["id"],
judge1_name=eval1["judge"]["name"],
judge1_evaluation=eval1["anonymous_evaluation"],
judge1_time=eval1["elapsed_time"],
judge2_id=eval2["judge"]["id"],
judge2_name=eval2["judge"]["name"],
judge2_evaluation=eval2["anonymous_evaluation"],
judge2_time=eval2["elapsed_time"],
winner_id="both",
)
# Construct result message with revealed judges' names
result_message = "You selected: Both Correct\n\n"
result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
result_message += (
f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
)
# Get the new ELO scores
judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]
judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]
result_message += "\nBoth judges performed well!\n"
result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
result_message += f"Test Type: {current_test_type}\n"
return result_message
except Exception as e:
logger.error(f"Error handling both correct: {e}")
return f"Error: {str(e)}"
def handle_both_incorrect(judge_manager: JudgeManager) -> str:
"""Handle case where both evaluations are incorrect."""
global eval1, eval2, current_test_type
try:
if not eval1 or not eval2:
return "Error: No evaluations available"
        # Get the input and output text that was evaluated (empty if not recorded)
        input_text = eval1.get("input_text", "")
        output_text = eval1.get("output_text", "")
# Update leaderboard for both judges
updated_board = judge_manager.update_leaderboard(
eval1["judge"]["id"],
eval2["judge"]["id"],
result_type="both_incorrect",
)
# Save to history with neither as winner
save_to_history(
input_text=input_text,
output_text=output_text,
judge1_id=eval1["judge"]["id"],
judge1_name=eval1["judge"]["name"],
judge1_evaluation=eval1["anonymous_evaluation"],
judge1_time=eval1["elapsed_time"],
judge2_id=eval2["judge"]["id"],
judge2_name=eval2["judge"]["name"],
judge2_evaluation=eval2["anonymous_evaluation"],
judge2_time=eval2["elapsed_time"],
winner_id="none",
)
# Construct result message with revealed judges' names
result_message = "You selected: Both Incorrect\n\n"
result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
result_message += (
f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
)
# Get the new ELO scores
judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]
judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]
result_message += "\nBoth judges need improvement.\n"
result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
result_message += f"Test Type: {current_test_type}\n"
return result_message
except Exception as e:
logger.error(f"Error handling both incorrect: {e}")
return f"Error: {str(e)}"
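
# Entry point. The `src.*` imports above assume the repository root is on the
# Python path (e.g., run as `python -m src.app`). Binding to 0.0.0.0 makes the
# app reachable from outside a container, as on a Hugging Face Space.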
def main():
    """Build the Gradio interface and launch the server."""
    demo = initialize()
    demo.launch(server_name="0.0.0.0")
if __name__ == "__main__":
main()