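"""EvalArena Gradio app.

Presents a piece of content to two randomly selected, anonymized judge models,
lets the user vote on which evaluation is better, and maintains an ELO
leaderboard plus static benchmark result tables.
"""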
import datetime
import os
from typing import Any, Dict, List, Optional, Tuple
import gradio as gr
import pandas as pd
from loguru import logger
from src.config import HISTORY_PATH
from src.data_manager import get_random_example, load_models
from src.judge import JudgeManager
from src.ui import UI
# Global state for the current evaluation round
eval1: Optional[Dict[str, Any]] = None
eval2: Optional[Dict[str, Any]] = None
selected_judges: List[Dict[str, Any]] = []
current_test_type: str = "grounding"
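# NOTE: module-level globals are shared by every session served by this Gradio
# process, so concurrent users could overwrite each other's in-flight round;
# this is assumed to be acceptable for a single-user demo Space.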
# Startup diagnostics
logger.info("EvalArena starting up")
# Check if benchmarks directory exists
if os.path.exists("benchmarks") and os.path.isdir("benchmarks"):
benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))]
logger.info(f"Found benchmark directories: {benchmark_dirs}")
# Log CSV files in each directory
for d in benchmark_dirs:
dir_path = os.path.join("benchmarks", d)
files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")]
logger.info(f"Benchmark directory '{d}' contains files: {files}")
else:
logger.warning("Benchmarks directory not found or not accessible")
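
# The scan above is diagnostic only; benchmark CSVs are read on demand in
# load_benchmark_data() when a user selects a benchmark in the UI.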
def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
    """Format the leaderboard dataframe for display in the UI.

    Keeping only the display columns ensures consistent rendering across
    environments such as Hugging Face Spaces.
    """
# Create a copy of the dataframe with only the columns we want to display
display_df = pd.DataFrame()
display_df["Judge Name"] = df["judge_name"]
display_df["ELO Score"] = df["elo_score"]
display_df["Wins"] = df["wins"]
display_df["Losses"] = df["losses"]
display_df["Total Evaluations"] = df["total_evaluations"]
return display_df
def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]:
    """Load benchmark data for the selected benchmark type and dataset.

    Args:
        benchmark_type: The type of benchmark (e.g., 'prompt-injections').
        dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak').

    Returns:
        Tuple containing:
            - DataFrame formatted for display
            - Markdown string with benchmark information
    """
# Create empty dataframe with the expected columns
empty_df = pd.DataFrame(
columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"]
)
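    # Returning this placeholder on failure keeps the Gradio table's column
    # headers stable instead of rendering an empty, header-less component.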
# Handle case when None or empty values are passed
if not benchmark_type or not dataset_name:
logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}")
return empty_df, "Please select both a benchmark type and dataset"
try:
# Construct the path to the benchmark metrics file
metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv")
logger.info(f"Loading benchmark from {metrics_file}")
if not os.path.exists(metrics_file):
error_message = f"Error: Could not find metrics file at {metrics_file}"
logger.error(error_message)
return empty_df, error_message
# Load the CSV file
df = pd.read_csv(metrics_file)
logger.info(f"Loaded benchmark with {len(df)} rows")
# Check if the file has the required columns
required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}"
logger.error(error_message)
return empty_df, error_message
# Format the dataframe for display
display_df = pd.DataFrame()
display_df["Judge Name"] = df["judge_name"]
display_df["F1 Score"] = df["f1"].round(3)
display_df["Balanced Accuracy"] = df["bacc"].round(3)
display_df["Avg Latency (s)"] = df["avg_latency"].round(2)
display_df["Correct"] = df["correct"]
display_df["Total"] = df["count"]
# Sort by balanced accuracy descending
display_df = display_df.sort_values("Balanced Accuracy", ascending=False)
# Generate information about the benchmark
total_samples = df["count"].iloc[0] if not df.empty else 0
        info_md = f"""
# Benchmark: {dataset_name}

**Type**: {benchmark_type}

**Total Samples**: {total_samples}

This table shows how different AI judge models performed on this benchmark.
Higher F1 score and balanced accuracy indicate better performance.
"""
return display_df, info_md
except pd.errors.EmptyDataError:
error_message = "Error: The CSV file is empty"
logger.error(error_message)
return empty_df, error_message
except pd.errors.ParserError:
error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format"
logger.error(error_message)
return empty_df, error_message
except Exception as e:
error_message = f"Error loading benchmark data: {str(e)}"
logger.error(error_message)
return empty_df, error_message
def initialize():
    """Initialize the application: load judges, build the JudgeManager, and create the UI."""
    global current_test_type

    # Load models from file
    judges = load_models()
    logger.info(f"Loaded {len(judges)} judges")

    # Initialize judge manager
    judge_manager = JudgeManager(judges)

    # Set default test type
    current_test_type = "grounding"
# Create UI
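    # All lambdas below close over this single JudgeManager instance, so ELO
    # updates and the shared Qualifire result persist across UI interactions.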
ui = UI(
refresh_fn=lambda test_type: refresh_example(test_type, judge_manager),
submit_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: submit_example(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
judge_manager,
),
evaluate1_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation1(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
judge_manager,
),
evaluate2_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation2(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
judge_manager,
),
winner1_fn=lambda: select_winner("Evaluation 1", judge_manager),
winner2_fn=lambda: select_winner("Evaluation 2", judge_manager),
both_correct_fn=lambda: handle_both_correct(judge_manager),
both_incorrect_fn=lambda: handle_both_incorrect(judge_manager),
refresh_leaderboard_fn=lambda: format_leaderboard_for_display(
judge_manager.leaderboard_df,
),
leaderboard_df=format_leaderboard_for_display(
judge_manager.leaderboard_df,
),
load_benchmark_fn=load_benchmark_data,
)
return ui.create_interface()
def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple:
"""Get a random example for the given test type."""
try:
# Get example from the dataset
logger.info(f"Getting example for test type: {test_type}")
example = get_random_example(test_type)
# Default values for all return fields
input_text = ""
output_text = ""
text_input = ""
claim_input = ""
single_text_input = ""
policy_input = ""
policy_output = ""
policy_assertion = ""
# Populate fields based on test type
if test_type == "grounding":
text_input = example["text"]
claim_input = example["claim"]
elif test_type in ["prompt_injections", "safety"]:
single_text_input = example["text"]
elif test_type == "policy":
policy_input = example["input"]
policy_output = example["output"]
policy_assertion = example["assertion"]
else:
# Legacy format
input_text = example.get("text", f"Sample input for {test_type}")
output_text = example.get("claim", f"Sample output for {test_type}")
return (
input_text,
output_text,
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
)
except Exception as e:
logger.error(f"Error getting example: {e}")
        # Return empty strings for all eight output fields
        return ("", "", "", "", "", "", "", "")
def submit_example(
text_input: str,
claim_input: str,
single_text_input: str,
policy_input: str,
policy_output: str,
policy_assertion: str,
test_type: str,
judge_manager: JudgeManager,
) -> Tuple:
"""Prepare for evaluation and select random judges."""
global selected_judges, current_test_type, eval1, eval2
try:
logger.info(f"Preparing evaluation for test type: {test_type}")
current_test_type = test_type
# Reset evaluations
eval1 = None
eval2 = None
# Select random judges
selected_judges = judge_manager.pick_random_judges()
if len(selected_judges) < 2:
return (
"Error: Not enough judges available",
"Error: Not enough judges available",
None,
None,
None,
None,
None,
None,
None,
gr.update(visible=False),
)
# Format inputs for Qualifire evaluation
input_text, output_text = format_inputs_for_evaluation(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
)
# Get a single Qualifire evaluation to be shared by both judges
try:
qualifire_result, time_elapsed = judge_manager.evaluate_with_qualifire(
input_text,
output_text,
test_type,
as_raw=True, # Get raw result to share between judges
)
logger.info("Completed Qualifire evaluation")
# Store the Qualifire result for both judges to use
judge_manager.shared_qualifire_result = qualifire_result
judge_manager.shared_qualifire_time = time_elapsed
except Exception as e:
logger.error(f"Error during Qualifire evaluation: {str(e)}")
# Continue even if Qualifire fails - judges can still work without it
# Show loading messages while evaluations are in progress
status_text = "Evaluations starting... Both judges will evaluate in parallel."
return (
"Loading evaluation 1...",
"Loading evaluation 2...",
gr.update(value=text_input),
gr.update(value=claim_input),
gr.update(value=single_text_input),
gr.update(value=policy_input),
gr.update(value=policy_output),
gr.update(value=policy_assertion),
gr.update(value=test_type),
gr.update(visible=True, value=status_text),
)
except Exception as e:
logger.error(f"Error preparing evaluation: {e}")
return (
f"Error: {str(e)}",
f"Error: {str(e)}",
gr.update(value=text_input),
gr.update(value=claim_input),
gr.update(value=single_text_input),
gr.update(value=policy_input),
gr.update(value=policy_output),
gr.update(value=policy_assertion),
gr.update(value=test_type),
gr.update(visible=False),
)
def get_evaluation1(
text_input: str,
claim_input: str,
single_text_input: str,
policy_input: str,
policy_output: str,
policy_assertion: str,
test_type: str,
judge_manager: JudgeManager,
) -> Tuple[str, Any]:
"""Get evaluation from the first judge."""
global eval1, selected_judges
try:
        if not selected_judges:
return "No judges selected", gr.update(visible=False)
logger.info(f"Starting evaluation 1 with judge {selected_judges[0]['name']}")
# Format inputs based on test type
input_text, output_text = format_inputs_for_evaluation(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
)
# Get evaluation from the first judge
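        # (use_shared_result=True lets this call reuse the Qualifire result
        # that submit_example stored on the judge manager)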
eval1 = judge_manager.get_evaluation(
selected_judges[0],
input_text,
output_text,
test_type,
use_shared_result=True,
)
logger.info("Completed evaluation 1")
# Display the evaluation (time is already included in the evaluation)
display_eval = eval1["display_evaluation"]
# Make the selection button visible once the evaluation is ready
return display_eval, gr.update(visible=True)
except Exception as e:
logger.error(f"Error getting evaluation 1: {e}")
return f"Error: {str(e)}", gr.update(visible=False)
def get_evaluation2(
text_input: str,
claim_input: str,
single_text_input: str,
policy_input: str,
policy_output: str,
policy_assertion: str,
test_type: str,
judge_manager: JudgeManager,
) -> Tuple[str, Any, Any]:
"""Get evaluation from the second judge."""
global eval2, selected_judges
try:
if not selected_judges or len(selected_judges) < 2:
return (
"No judges selected",
gr.update(
visible=False,
),
gr.update(
visible=False,
),
)
logger.info(
f"Starting evaluation 2 with judge {selected_judges[1]['name']}",
)
# Format inputs based on test type
input_text, output_text = format_inputs_for_evaluation(
text_input,
claim_input,
single_text_input,
policy_input,
policy_output,
policy_assertion,
test_type,
)
# Get evaluation from the second judge
eval2 = judge_manager.get_evaluation(
selected_judges[1],
input_text,
output_text,
test_type,
use_shared_result=True,
)
logger.info("Completed evaluation 2")
# Display the evaluation (time is already included in the evaluation)
display_eval = eval2["display_evaluation"]
return (
display_eval,
gr.update(visible=True),
gr.update(visible=True),
)
except Exception as e:
logger.error(f"Error getting evaluation 2: {e}")
return (
f"Error: {str(e)}",
gr.update(
visible=False,
),
gr.update(
visible=False,
),
)
def format_inputs_for_evaluation(
text_input: str,
claim_input: str,
single_text_input: str,
policy_input: str,
policy_output: str,
policy_assertion: str,
test_type: str,
) -> Tuple[str, str]:
"""Format inputs based on test type to be compatible with the evaluation function."""
if test_type == "grounding":
input_text = text_input
output_text = claim_input
elif test_type in ["prompt_injections", "safety"]:
input_text = single_text_input
output_text = ""
elif test_type == "policy":
input_text = f"Input: {policy_input}\nAssertion: {policy_assertion}"
output_text = policy_output
else:
# Default fallback - this should not happen with the UI constraints
input_text = text_input or single_text_input or policy_input
output_text = claim_input or policy_output
return input_text, output_text
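
# Example of format_inputs_for_evaluation (hypothetical values): for
# test_type="policy" with policy_input="Summarize the report",
# policy_output="A two-line summary...", and
# policy_assertion="The answer must cite the report", it returns
# ("Input: Summarize the report\nAssertion: The answer must cite the report",
#  "A two-line summary...").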
def save_to_history(
input_text: str,
output_text: str,
judge1_id: str,
judge1_name: str,
judge1_evaluation: str,
judge1_time: float,
judge2_id: str,
judge2_name: str,
judge2_evaluation: str,
judge2_time: float,
winner_id: str,
) -> None:
"""Save the evaluation results to history CSV file."""
try:
# Create a new row for the history
history_row = {
"timestamp": datetime.datetime.now().isoformat(),
"input": input_text,
"output": output_text,
"judge1_id": judge1_id,
"judge1_name": judge1_name,
"judge1_evaluation": judge1_evaluation,
"judge1_time": judge1_time,
"judge2_id": judge2_id,
"judge2_name": judge2_name,
"judge2_evaluation": judge2_evaluation,
"judge2_time": judge2_time,
"winner_id": winner_id,
}
# Try to load existing history
try:
history_df = pd.read_csv(HISTORY_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
# Create a new history dataframe if file doesn't exist or is empty
history_df = pd.DataFrame(columns=list(history_row.keys()))
# Append the new row
history_df = pd.concat(
[history_df, pd.DataFrame([history_row])],
ignore_index=True,
)
# Save to CSV
history_df.to_csv(HISTORY_PATH, index=False)
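        # Note: the whole history CSV is re-read and rewritten on each vote,
        # which should be fine for the modest volume expected from a demo arena.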
logger.info("Saved evaluation to history")
except Exception as e:
logger.error(f"Error saving to history: {e}")
def select_winner(choice: str, judge_manager: JudgeManager) -> str:
"""Select a winner from the evaluations."""
global eval1, eval2, current_test_type
try:
if not eval1 or not eval2:
return "Error: No evaluations available"
        # Get the input and output text that was evaluated (empty if not recorded)
        input_text = eval1.get("input_text", "")
        output_text = eval1.get("output_text", "")
if choice == "Evaluation 1":
winner_eval = eval1
loser_eval = eval2
winner_id = eval1["judge"]["id"]
else:
winner_eval = eval2
loser_eval = eval1
winner_id = eval2["judge"]["id"]
# Update leaderboard
updated_board = judge_manager.update_leaderboard(
winner_eval["judge"]["id"],
loser_eval["judge"]["id"],
result_type="win",
)
# Save to history
save_to_history(
input_text=input_text,
output_text=output_text,
judge1_id=eval1["judge"]["id"],
judge1_name=eval1["judge"]["name"],
judge1_evaluation=eval1["anonymous_evaluation"],
judge1_time=eval1["elapsed_time"],
judge2_id=eval2["judge"]["id"],
judge2_name=eval2["judge"]["name"],
judge2_evaluation=eval2["anonymous_evaluation"],
judge2_time=eval2["elapsed_time"],
winner_id=winner_id,
)
# Construct result message with revealed judges' names
result_message = f"You selected: {choice}\n\n"
result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
result_message += (
f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
)
# Get the winner's new ELO score
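        # (this assumes judge_id uniquely identifies a single leaderboard row)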
winner_mask = updated_board["judge_id"] == winner_id
winner_elo = updated_board[winner_mask]["elo_score"].values[0]
result_message += f"Winner: {winner_eval['judge']['name']} "
result_message += f"(New ELO: {winner_elo:.2f})\n"
result_message += f"Test Type: {current_test_type}\n"
return result_message
except Exception as e:
logger.error(f"Error selecting winner: {e}")
return f"Error: {str(e)}"
def handle_both_correct(judge_manager: JudgeManager) -> str:
"""Handle case where both evaluations are correct."""
global eval1, eval2, current_test_type
try:
if not eval1 or not eval2:
return "Error: No evaluations available"
        # Get the input and output text that was evaluated (empty if not recorded)
        input_text = eval1.get("input_text", "")
        output_text = eval1.get("output_text", "")
# Update leaderboard for both judges
updated_board = judge_manager.update_leaderboard(
eval1["judge"]["id"],
eval2["judge"]["id"],
result_type="both_correct",
)
# Save to history with both as winners
save_to_history(
input_text=input_text,
output_text=output_text,
judge1_id=eval1["judge"]["id"],
judge1_name=eval1["judge"]["name"],
judge1_evaluation=eval1["anonymous_evaluation"],
judge1_time=eval1["elapsed_time"],
judge2_id=eval2["judge"]["id"],
judge2_name=eval2["judge"]["name"],
judge2_evaluation=eval2["anonymous_evaluation"],
judge2_time=eval2["elapsed_time"],
winner_id="both",
)
# Construct result message with revealed judges' names
result_message = "You selected: Both Correct\n\n"
result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
result_message += (
f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
)
# Get the new ELO scores
judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]
judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]
result_message += "\nBoth judges performed well!\n"
result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
result_message += f"Test Type: {current_test_type}\n"
return result_message
except Exception as e:
logger.error(f"Error handling both correct: {e}")
return f"Error: {str(e)}"
def handle_both_incorrect(judge_manager: JudgeManager) -> str:
"""Handle case where both evaluations are incorrect."""
global eval1, eval2, current_test_type
try:
if not eval1 or not eval2:
return "Error: No evaluations available"
        # Get the input and output text that was evaluated (empty if not recorded)
        input_text = eval1.get("input_text", "")
        output_text = eval1.get("output_text", "")
# Update leaderboard for both judges
updated_board = judge_manager.update_leaderboard(
eval1["judge"]["id"],
eval2["judge"]["id"],
result_type="both_incorrect",
)
# Save to history with neither as winner
save_to_history(
input_text=input_text,
output_text=output_text,
judge1_id=eval1["judge"]["id"],
judge1_name=eval1["judge"]["name"],
judge1_evaluation=eval1["anonymous_evaluation"],
judge1_time=eval1["elapsed_time"],
judge2_id=eval2["judge"]["id"],
judge2_name=eval2["judge"]["name"],
judge2_evaluation=eval2["anonymous_evaluation"],
judge2_time=eval2["elapsed_time"],
winner_id="none",
)
# Construct result message with revealed judges' names
result_message = "You selected: Both Incorrect\n\n"
result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
result_message += (
f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
)
# Get the new ELO scores
judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]
judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]
result_message += "\nBoth judges need improvement.\n"
result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
result_message += f"Test Type: {current_test_type}\n"
return result_message
except Exception as e:
logger.error(f"Error handling both incorrect: {e}")
return f"Error: {str(e)}"
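
# Entry point. The `src.*` imports above assume the repository root is on the
# Python path (e.g., run as `python -m src.app`). Binding to 0.0.0.0 makes the
# app reachable from outside a container, as on a Hugging Face Space.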
def main():
    """Build the Gradio interface and launch the server."""
    demo = initialize()
    demo.launch(server_name="0.0.0.0")
if __name__ == "__main__":
main()