"""EvalArena: a Gradio app for blind, head-to-head comparison of AI judge models."""

import datetime
import os
from typing import Any, Dict, Optional, Tuple

import gradio as gr
import pandas as pd
from loguru import logger

from src.config import HISTORY_PATH
from src.data_manager import get_random_example, load_models
from src.judge import JudgeManager
from src.ui import UI


# Module-level state shared across the Gradio callbacks for the current arena round.
eval1: Optional[Dict[str, Any]] = None
eval2: Optional[Dict[str, Any]] = None
selected_judges: list = []
current_test_type: str = "grounding"

logger.info("EvalArena starting up")


# Log which benchmark datasets are available on disk at startup.
if os.path.exists("benchmarks") and os.path.isdir("benchmarks"):
    benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))]
    logger.info(f"Found benchmark directories: {benchmark_dirs}")

    for d in benchmark_dirs:
        dir_path = os.path.join("benchmarks", d)
        files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")]
        logger.info(f"Benchmark directory '{d}' contains files: {files}")
else:
    logger.warning("Benchmarks directory not found or not accessible")


def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
    """Format the leaderboard dataframe for display in the UI.

    This ensures consistent display across environments such as Hugging Face Spaces.
    """
    display_df = pd.DataFrame()
    display_df["Judge Name"] = df["judge_name"]
    display_df["ELO Score"] = df["elo_score"]
    display_df["Wins"] = df["wins"]
    display_df["Losses"] = df["losses"]
    display_df["Total Evaluations"] = df["total_evaluations"]
    return display_df
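

# Illustrative sketch of the "<dataset>-judges-metrics.csv" layout that
# load_benchmark_data below expects (column names match required_columns;
# the judge names and values here are made up for illustration):
#
#   judge_name,f1,bacc,avg_latency,correct,count
#   judge-a,0.91,0.89,1.42,178,200
#   judge-b,0.84,0.83,0.97,166,200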


def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]:
    """Load benchmark data for the selected benchmark type and dataset.

    Args:
        benchmark_type: The type of benchmark (e.g., 'prompt-injections').
        dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak').

    Returns:
        Tuple containing:
            - DataFrame formatted for display
            - Markdown string with benchmark information
    """
    empty_df = pd.DataFrame(
        columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"]
    )

    if not benchmark_type or not dataset_name:
        logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}")
        return empty_df, "Please select both a benchmark type and dataset"

    try:
        metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv")
        logger.info(f"Loading benchmark from {metrics_file}")

        if not os.path.exists(metrics_file):
            error_message = f"Error: Could not find metrics file at {metrics_file}"
            logger.error(error_message)
            return empty_df, error_message

        df = pd.read_csv(metrics_file)
        logger.info(f"Loaded benchmark with {len(df)} rows")

        # Validate that the CSV contains every column the display table depends on.
        required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"]
        missing_columns = [col for col in required_columns if col not in df.columns]

        if missing_columns:
            error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}"
            logger.error(error_message)
            return empty_df, error_message

        # Rename and round the metrics for display.
        display_df = pd.DataFrame()
        display_df["Judge Name"] = df["judge_name"]
        display_df["F1 Score"] = df["f1"].round(3)
        display_df["Balanced Accuracy"] = df["bacc"].round(3)
        display_df["Avg Latency (s)"] = df["avg_latency"].round(2)
        display_df["Correct"] = df["correct"]
        display_df["Total"] = df["count"]

        display_df = display_df.sort_values("Balanced Accuracy", ascending=False)

        total_samples = df["count"].iloc[0] if not df.empty else 0
        info_md = f"""
# Benchmark: {dataset_name}

**Type**: {benchmark_type}
**Total Samples**: {total_samples}

This table shows how different AI judge models performed on this benchmark.
Higher F1 score and balanced accuracy indicate better performance.
"""

        return display_df, info_md

    except pd.errors.EmptyDataError:
        error_message = "Error: The CSV file is empty"
        logger.error(error_message)
        return empty_df, error_message
    except pd.errors.ParserError:
        error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format"
        logger.error(error_message)
        return empty_df, error_message
    except Exception as e:
        error_message = f"Error loading benchmark data: {str(e)}"
        logger.error(error_message)
        return empty_df, error_message


def initialize():
    """Initialize the application and build the Gradio interface."""
    judges = load_models()
    logger.info(f"Loaded {len(judges)} judges")

    judge_manager = JudgeManager(judges)

    default_test_type = "grounding"
    global current_test_type
    current_test_type = default_test_type

    # Wire the UI callbacks to the module-level handlers, closing over judge_manager.
    ui = UI(
        refresh_fn=lambda test_type: refresh_example(test_type, judge_manager),
        submit_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: submit_example(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        evaluate1_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation1(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        evaluate2_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation2(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        winner1_fn=lambda: select_winner("Evaluation 1", judge_manager),
        winner2_fn=lambda: select_winner("Evaluation 2", judge_manager),
        both_correct_fn=lambda: handle_both_correct(judge_manager),
        both_incorrect_fn=lambda: handle_both_incorrect(judge_manager),
        refresh_leaderboard_fn=lambda: format_leaderboard_for_display(
            judge_manager.leaderboard_df,
        ),
        leaderboard_df=format_leaderboard_for_display(
            judge_manager.leaderboard_df,
        ),
        load_benchmark_fn=load_benchmark_data,
    )

    return ui.create_interface()


def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple:
    """Get a random example for the given test type."""
    try:
        logger.info(f"Getting example for test type: {test_type}")
        example = get_random_example(test_type)

        # Reset every field; only the ones relevant to this test type get filled in.
        input_text = ""
        output_text = ""
        text_input = ""
        claim_input = ""
        single_text_input = ""
        policy_input = ""
        policy_output = ""
        policy_assertion = ""

        if test_type == "grounding":
            text_input = example["text"]
            claim_input = example["claim"]
        elif test_type in ["prompt_injections", "safety"]:
            single_text_input = example["text"]
        elif test_type == "policy":
            policy_input = example["input"]
            policy_output = example["output"]
            policy_assertion = example["assertion"]
        else:
            input_text = example.get("text", f"Sample input for {test_type}")
            output_text = example.get("claim", f"Sample output for {test_type}")

        return (
            input_text,
            output_text,
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
        )
    except Exception as e:
        logger.error(f"Error getting example: {e}")
        # On failure, return empty strings for all eight fields.
        return ("", "", "", "", "", "", "", "")
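

# A single arena round flows through the handlers below: submit_example picks two
# random judges and caches a shared Qualifire result on the judge_manager,
# get_evaluation1 and get_evaluation2 run one judge each (with
# use_shared_result=True), and the user's verdict is recorded by the
# select_winner / handle_both_* handlers further down.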


def submit_example(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple:
    """Prepare for evaluation and select random judges."""
    global selected_judges, current_test_type, eval1, eval2

    try:
        logger.info(f"Preparing evaluation for test type: {test_type}")
        current_test_type = test_type

        # Clear any results from the previous round.
        eval1 = None
        eval2 = None

        selected_judges = judge_manager.pick_random_judges()

        if len(selected_judges) < 2:
            return (
                "Error: Not enough judges available",
                "Error: Not enough judges available",
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                gr.update(visible=False),
            )

        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        # Run the Qualifire evaluation once up front and cache the raw result so
        # both judge evaluations can reuse it instead of calling it twice.
        try:
            qualifire_result, time_elapsed = judge_manager.evaluate_with_qualifire(
                input_text,
                output_text,
                test_type,
                as_raw=True,
            )
            logger.info("Completed Qualifire evaluation")

            judge_manager.shared_qualifire_result = qualifire_result
            judge_manager.shared_qualifire_time = time_elapsed
        except Exception as e:
            logger.error(f"Error during Qualifire evaluation: {str(e)}")

        status_text = "Evaluations starting... Both judges will evaluate in parallel."
        return (
            "Loading evaluation 1...",
            "Loading evaluation 2...",
            gr.update(value=text_input),
            gr.update(value=claim_input),
            gr.update(value=single_text_input),
            gr.update(value=policy_input),
            gr.update(value=policy_output),
            gr.update(value=policy_assertion),
            gr.update(value=test_type),
            gr.update(visible=True, value=status_text),
        )
    except Exception as e:
        logger.error(f"Error preparing evaluation: {e}")
        return (
            f"Error: {str(e)}",
            f"Error: {str(e)}",
            gr.update(value=text_input),
            gr.update(value=claim_input),
            gr.update(value=single_text_input),
            gr.update(value=policy_input),
            gr.update(value=policy_output),
            gr.update(value=policy_assertion),
            gr.update(value=test_type),
            gr.update(visible=False),
        )


def get_evaluation1(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple[str, Any]:
    """Get evaluation from the first judge."""
    global eval1, selected_judges

    try:
        if not selected_judges or len(selected_judges) < 1:
            return "No judges selected", gr.update(visible=False)

        logger.info(f"Starting evaluation 1 with judge {selected_judges[0]['name']}")

        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        eval1 = judge_manager.get_evaluation(
            selected_judges[0],
            input_text,
            output_text,
            test_type,
            use_shared_result=True,
        )
        logger.info("Completed evaluation 1")

        display_eval = eval1["display_evaluation"]

        return display_eval, gr.update(visible=True)
    except Exception as e:
        logger.error(f"Error getting evaluation 1: {e}")
        return f"Error: {str(e)}", gr.update(visible=False)


def get_evaluation2(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple[str, Any, Any]:
    """Get evaluation from the second judge."""
    global eval2, selected_judges

    try:
        if not selected_judges or len(selected_judges) < 2:
            return (
                "No judges selected",
                gr.update(visible=False),
                gr.update(visible=False),
            )

        logger.info(f"Starting evaluation 2 with judge {selected_judges[1]['name']}")

        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        eval2 = judge_manager.get_evaluation(
            selected_judges[1],
            input_text,
            output_text,
            test_type,
            use_shared_result=True,
        )
        logger.info("Completed evaluation 2")

        display_eval = eval2["display_evaluation"]

        return (
            display_eval,
            gr.update(visible=True),
            gr.update(visible=True),
        )
    except Exception as e:
        logger.error(f"Error getting evaluation 2: {e}")
        return (
            f"Error: {str(e)}",
            gr.update(visible=False),
            gr.update(visible=False),
        )


def format_inputs_for_evaluation(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
) -> Tuple[str, str]:
    """Format inputs based on test type to be compatible with the evaluation function."""
    if test_type == "grounding":
        input_text = text_input
        output_text = claim_input
    elif test_type in ["prompt_injections", "safety"]:
        input_text = single_text_input
        output_text = ""
    elif test_type == "policy":
        input_text = f"Input: {policy_input}\nAssertion: {policy_assertion}"
        output_text = policy_output
    else:
        # Fall back to whichever fields are populated.
        input_text = text_input or single_text_input or policy_input
        output_text = claim_input or policy_output

    return input_text, output_text
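

# For example (hypothetical values), a "policy" test is flattened by
# format_inputs_for_evaluation above into:
#   input_text  = "Input: <user prompt>\nAssertion: <policy assertion>"
#   output_text = "<model response>"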


def save_to_history(
    input_text: str,
    output_text: str,
    judge1_id: str,
    judge1_name: str,
    judge1_evaluation: str,
    judge1_time: float,
    judge2_id: str,
    judge2_name: str,
    judge2_evaluation: str,
    judge2_time: float,
    winner_id: str,
) -> None:
    """Save the evaluation results to the history CSV file."""
    try:
        history_row = {
            "timestamp": datetime.datetime.now().isoformat(),
            "input": input_text,
            "output": output_text,
            "judge1_id": judge1_id,
            "judge1_name": judge1_name,
            "judge1_evaluation": judge1_evaluation,
            "judge1_time": judge1_time,
            "judge2_id": judge2_id,
            "judge2_name": judge2_name,
            "judge2_evaluation": judge2_evaluation,
            "judge2_time": judge2_time,
            "winner_id": winner_id,
        }

        # Load the existing history, or start a new one if the file is missing or empty.
        try:
            history_df = pd.read_csv(HISTORY_PATH)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            history_df = pd.DataFrame(columns=list(history_row.keys()))

        history_df = pd.concat(
            [history_df, pd.DataFrame([history_row])],
            ignore_index=True,
        )

        history_df.to_csv(HISTORY_PATH, index=False)
        logger.info("Saved evaluation to history")
    except Exception as e:
        logger.error(f"Error saving to history: {e}")
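

# The three verdict handlers below follow the same pattern: update the
# leaderboard with the corresponding result_type ("win", "both_correct", or
# "both_incorrect"), persist the round via save_to_history, and return a
# message revealing which judge produced each evaluation.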


def select_winner(choice: str, judge_manager: JudgeManager) -> str:
    """Select a winner from the evaluations."""
    global eval1, eval2, current_test_type

    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        if choice == "Evaluation 1":
            winner_eval = eval1
            loser_eval = eval2
            winner_id = eval1["judge"]["id"]
        else:
            winner_eval = eval2
            loser_eval = eval1
            winner_id = eval2["judge"]["id"]

        updated_board = judge_manager.update_leaderboard(
            winner_eval["judge"]["id"],
            loser_eval["judge"]["id"],
            result_type="win",
        )

        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id=winner_id,
        )

        result_message = f"You selected: {choice}\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        winner_mask = updated_board["judge_id"] == winner_id
        winner_elo = updated_board[winner_mask]["elo_score"].values[0]

        result_message += f"Winner: {winner_eval['judge']['name']} "
        result_message += f"(New ELO: {winner_elo:.2f})\n"
        result_message += f"Test Type: {current_test_type}\n"

        return result_message
    except Exception as e:
        logger.error(f"Error selecting winner: {e}")
        return f"Error: {str(e)}"


def handle_both_correct(judge_manager: JudgeManager) -> str:
    """Handle case where both evaluations are correct."""
    global eval1, eval2, current_test_type

    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        updated_board = judge_manager.update_leaderboard(
            eval1["judge"]["id"],
            eval2["judge"]["id"],
            result_type="both_correct",
        )

        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id="both",
        )

        result_message = "You selected: Both Correct\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
        judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]

        judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
        judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]

        result_message += "\nBoth judges performed well!\n"
        result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
        result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
        result_message += f"Test Type: {current_test_type}\n"

        return result_message
    except Exception as e:
        logger.error(f"Error handling both correct: {e}")
        return f"Error: {str(e)}"


def handle_both_incorrect(judge_manager: JudgeManager) -> str:
    """Handle case where both evaluations are incorrect."""
    global eval1, eval2, current_test_type

    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        updated_board = judge_manager.update_leaderboard(
            eval1["judge"]["id"],
            eval2["judge"]["id"],
            result_type="both_incorrect",
        )

        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id="none",
        )

        result_message = "You selected: Both Incorrect\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
        judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]

        judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
        judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]

        result_message += "\nBoth judges need improvement.\n"
        result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
        result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
        result_message += f"Test Type: {current_test_type}\n"

        return result_message
    except Exception as e:
        logger.error(f"Error handling both incorrect: {e}")
        return f"Error: {str(e)}"


def main():
    """Build and launch the Gradio app."""
    demo = initialize()
    demo.launch(server_name="0.0.0.0")


if __name__ == "__main__":
    main()