import glob
import os
from typing import Callable, Optional

import gradio as gr
import pandas as pd
from loguru import logger

from src.config import TEST_TYPES


class UI:
    """Handles the Gradio UI components and interface"""

    def __init__(
        self,
        refresh_fn: Callable,
        submit_fn: Callable,
        evaluate1_fn: Callable,
        evaluate2_fn: Callable,
        winner1_fn: Callable,
        winner2_fn: Callable,
        both_correct_fn: Callable,
        both_incorrect_fn: Callable,
        refresh_leaderboard_fn: Callable,
        leaderboard_df: pd.DataFrame,
        load_benchmark_fn: Optional[Callable] = None,
    ):
        self.refresh_fn = refresh_fn
        self.submit_fn = submit_fn
        self.evaluate1_fn = evaluate1_fn
        self.evaluate2_fn = evaluate2_fn
        self.winner1_fn = winner1_fn
        self.winner2_fn = winner2_fn
        self.both_correct_fn = both_correct_fn
        self.both_incorrect_fn = both_incorrect_fn
        self.refresh_leaderboard_fn = refresh_leaderboard_fn
        self.leaderboard_df = leaderboard_df
        self.load_benchmark_fn = load_benchmark_fn
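
    # Callback contract (inferred from the event wiring in create_interface; a
    # sketch of what each injected callable is expected to accept and return):
    #   refresh_fn(test_type)                      -> updates for the 8 input fields
    #   submit_fn(*6 field values, test_type)      -> evaluation1, evaluation2,
    #                                                 the 7 inputs, status message
    #   evaluate1_fn(*6 field values, test_type)   -> evaluation1, select-button update
    #   evaluate2_fn(*6 field values, test_type)   -> evaluation2, select-button update,
    #                                                 both-correct/incorrect row update
    #   winner1_fn / winner2_fn / both_correct_fn / both_incorrect_fn() -> result text
    #   refresh_leaderboard_fn()                   -> leaderboard DataFrame
    #   load_benchmark_fn(benchmark_type, dataset) -> benchmark DataFrame, info text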

    def refresh_benchmark_types(self):
        """Re-scan the benchmarks directory and update the benchmark type dropdown."""
        try:
            new_benchmark_types = [
                d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
            ]
            logger.info(f"Refreshed benchmark types: {new_benchmark_types}")

            if new_benchmark_types:
                return gr.update(choices=new_benchmark_types, value=new_benchmark_types[0])
            else:
                return gr.update(choices=[], value=None)
        except (FileNotFoundError, PermissionError) as e:
            logger.error(f"Error refreshing benchmark types: {e}")
            return gr.update(choices=[], value=None)

    def get_benchmark_datasets(self, benchmark_type):
        """List the datasets available under a benchmark type and update the dataset dropdown."""
        if not benchmark_type:
            return gr.update(choices=[], value=None)

        try:
            pattern = os.path.join("benchmarks", benchmark_type, "*-judges-metrics.csv")
            files = glob.glob(pattern)

            datasets = []
            for file in files:
                basename = os.path.basename(file)
                dataset_name = basename.replace("-judges-metrics.csv", "")
                datasets.append(dataset_name)

            logger.info(f"Found datasets for {benchmark_type}: {datasets}")

            if datasets:
                return gr.update(choices=datasets, value=datasets[0])
            else:
                return gr.update(choices=[], value=None)
        except Exception as e:
            logger.error(f"Error getting benchmark datasets: {e}")
            return gr.update(choices=[], value=None)
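
    # Expected on-disk layout, as implied by the two lookups above:
    #
    #     benchmarks/
    #         <benchmark_type>/
    #             <dataset>-judges-metrics.csv
    #
    # Every subdirectory of benchmarks/ becomes a benchmark type, and every
    # *-judges-metrics.csv file inside it becomes a selectable dataset.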

    def create_interface(self) -> gr.Blocks:
        """Create the Gradio interface"""
        with gr.Blocks(
            title="AI Evaluators Arena",
            theme=gr.themes.Soft(
                primary_hue=gr.themes.Color(
                    c50="#ECE9FB",
                    c100="#ECE9FB",
                    c200="#ECE9FB",
                    c300="#6B63BF",
                    c400="#494199",
                    c500="#A5183A",
                    c600="#332E68",
                    c700="#272350",
                    c800="#201E44",
                    c900="#1C1A3D",
                    c950="#100F24",
                ),
                secondary_hue=gr.themes.Color(
                    c50="#ECE9FB",
                    c100="#ECE9FB",
                    c200="#ECE9FB",
                    c300="#6B63BF",
                    c400="#494199",
                    c500="#A5183A",
                    c600="#A5183A",
                    c700="#272350",
                    c800="#201E44",
                    c900="#1C1A3D",
                    c950="#100F24",
                ),
                neutral_hue=gr.themes.Color(
                    c50="#ECE9FB",
                    c100="#ECE9FB",
                    c200="#ECE9FB",
                    c300="#6B63BF",
                    c400="#494199",
                    c500="#A5183A",
                    c600="#332E68",
                    c700="#272350",
                    c800="#201E44",
                    c900="#1C1A3D",
                    c950="#100F24",
                ),
                font=[
                    gr.themes.GoogleFont("Mulish"),
                    "Arial",
                    "sans-serif",
                ],
            ),
        ) as demo:
            gr.Markdown("# AI Evaluators Arena")
            gr.Markdown(
                "Choose which AI judge provides better evaluation of the output. "
                "This is a blind evaluation - judges' identities are hidden until after you make your selection."
            )

            with gr.Tab("🧑⚖️ Evaluators Arena"):
                with gr.Row():
                    with gr.Column(scale=1):
                        test_type_dropdown = gr.Dropdown(
                            choices=list(TEST_TYPES.keys()),
                            value="grounding",
                            label="Test Type",
                            info="Select the type of test to evaluate",
                        )
                        test_type_description = gr.Markdown(TEST_TYPES["grounding"])
                        refresh_button = gr.Button("Load from a dataset")
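
                # Input fields for every test type are created below; only the ones
                # relevant to the selected test type are visible at any time
                # (toggled by _update_input_visibility when the dropdown changes).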

                with gr.Row():
                    with gr.Column(scale=2):
                        text_input = gr.Textbox(label="Text", lines=4, visible=True)
                        claim_input = gr.Textbox(label="Claim", lines=2, visible=True)

                        policy_input = gr.Textbox(label="Input", lines=3, visible=False)
                        policy_output = gr.Textbox(label="Output", lines=4, visible=False)
                        policy_assertion = gr.Textbox(label="Assertion", lines=2, visible=False)

                        single_text_input = gr.Textbox(label="Text", lines=6, visible=False)

                        input_text = gr.Textbox(label="Input", lines=4, visible=False)
                        output_text = gr.Textbox(label="Output", lines=6, visible=False)

                        submit_button = gr.Button("Evaluate")
                        status_message = gr.Markdown(visible=False)

                with gr.Row():
                    with gr.Column():
                        evaluation1 = gr.Textbox(label="Anonymous Evaluation 1", lines=10)
                        select_eval1 = gr.Button("Select Evaluation 1", visible=False)

                    with gr.Column():
                        evaluation2 = gr.Textbox(label="Anonymous Evaluation 2", lines=10)
                        select_eval2 = gr.Button("Select Evaluation 2", visible=False)

                with gr.Row(visible=False) as additional_buttons_row:
                    with gr.Column():
                        both_correct_btn = gr.Button("Both Correct", variant="secondary")
                    with gr.Column():
                        both_incorrect_btn = gr.Button("Both Incorrect", variant="secondary")

                result_text = gr.Textbox(label="Result", lines=6)

            with gr.Tab("🏆 Leaderboard"):
                leaderboard_dataframe = gr.DataFrame(
                    value=self.leaderboard_df,
                    headers=["Judge Name", "ELO Score", "Wins", "Losses", "Total Evaluations"],
                    datatype=["str", "number", "number", "number", "number"],
                    col_count=(5, "fixed"),
                    interactive=False,
                )
                refresh_leaderboard = gr.Button("Refresh Leaderboard")

            with gr.Tab("📊 Benchmarks"):
                # Log which datasets exist for each benchmark type up front; the
                # dropdowns themselves are populated below.
                try:
                    available_types = [
                        d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
                    ]
                except (FileNotFoundError, PermissionError):
                    available_types = []
                for benchmark_type in available_types:
                    self.get_benchmark_datasets(benchmark_type)

                with gr.Row():
                    with gr.Column(scale=1):
                        try:
                            benchmark_types = [
                                d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))
                            ]
                        except (FileNotFoundError, PermissionError):
                            benchmark_types = []
                            logger.error("Failed to read benchmarks directory")

                        benchmark_type_dropdown = gr.Dropdown(
                            choices=benchmark_types,
                            label="Benchmark Type",
                            info="Select the type of benchmark to view",
                            value=benchmark_types[0] if benchmark_types else None,
                        )

                with gr.Row():
                    with gr.Column():
                        benchmark_dataset_dropdown = gr.Dropdown(
                            label="Benchmark Dataset",
                            info="Select the benchmark dataset to view",
                        )

                with gr.Row():
                    with gr.Column():
                        benchmark_dataframe = gr.DataFrame(
                            headers=[
                                "Judge Name",
                                "F1 Score",
                                "Balanced Accuracy",
                                "Avg Latency (s)",
                                "Correct",
                                "Total",
                            ],
                            label="Benchmark Results",
                            interactive=False,
                        )
                        benchmark_info = gr.Markdown("Select a benchmark dataset to view results")

                refresh_benchmarks_btn = gr.Button("Refresh Benchmark List")

            with gr.Tab("About"):
                self._create_about_tab()
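
            # ---- Event wiring ----
            # The submit click runs submit_fn first, then evaluate1_fn and
            # evaluate2_fn fill the two anonymous evaluation boxes; the select and
            # both-correct/both-incorrect buttons report the outcome in result_text.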

            refresh_button.click(
                self.refresh_fn,
                [test_type_dropdown],
                [
                    input_text,
                    output_text,
                    text_input,
                    claim_input,
                    single_text_input,
                    policy_input,
                    policy_output,
                    policy_assertion,
                ],
            )

            test_type_dropdown.change(
                self._update_input_visibility,
                [test_type_dropdown],
                [
                    text_input,
                    claim_input,
                    single_text_input,
                    policy_input,
                    policy_output,
                    policy_assertion,
                    input_text,
                    output_text,
                ],
            )

            test_type_dropdown.change(
                lambda test_type: TEST_TYPES[test_type],
                [test_type_dropdown],
                [test_type_description],
            )

            submit_event = submit_button.click(
                self.submit_fn,
                [
                    text_input,
                    claim_input,
                    single_text_input,
                    policy_input,
                    policy_output,
                    policy_assertion,
                    test_type_dropdown,
                ],
                [
                    evaluation1,
                    evaluation2,
                    text_input,
                    claim_input,
                    single_text_input,
                    policy_input,
                    policy_output,
                    policy_assertion,
                    test_type_dropdown,
                    status_message,
                ],
            )

            submit_event.then(
                self.evaluate1_fn,
                [
                    text_input,
                    claim_input,
                    single_text_input,
                    policy_input,
                    policy_output,
                    policy_assertion,
                    test_type_dropdown,
                ],
                [evaluation1, select_eval1],
                queue=False,
            )

            submit_event.then(
                self.evaluate2_fn,
                [
                    text_input,
                    claim_input,
                    single_text_input,
                    policy_input,
                    policy_output,
                    policy_assertion,
                    test_type_dropdown,
                ],
                [evaluation2, select_eval2, additional_buttons_row],
                queue=False,
            )

            select_eval1.click(
                self.winner1_fn,
                [],
                [result_text],
            )

            select_eval2.click(
                self.winner2_fn,
                [],
                [result_text],
            )

            both_correct_btn.click(
                self.both_correct_fn,
                [],
                [result_text],
            )

            both_incorrect_btn.click(
                self.both_incorrect_fn,
                [],
                [result_text],
            )

            refresh_leaderboard.click(
                self.refresh_leaderboard_fn,
                [],
                [leaderboard_dataframe],
            )
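
            # Benchmarks tab wiring: changing the type re-lists its datasets, and the
            # refresh button re-scans the benchmarks/ directory before doing the same.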

            benchmark_type_dropdown.change(
                self.get_benchmark_datasets,
                [benchmark_type_dropdown],
                [benchmark_dataset_dropdown],
            )

            refresh_benchmarks_btn.click(
                self.refresh_benchmark_types,
                [],
                [benchmark_type_dropdown],
            ).then(
                self.get_benchmark_datasets,
                [benchmark_type_dropdown],
                [benchmark_dataset_dropdown],
            )

            if self.load_benchmark_fn:
                benchmark_dataset_dropdown.change(
                    self.load_benchmark_fn,
                    [benchmark_type_dropdown, benchmark_dataset_dropdown],
                    [benchmark_dataframe, benchmark_info],
                )

            if benchmark_types:
                initial_benchmark_type = benchmark_types[0]
                logger.info(f"Loading initial datasets for benchmark type: {initial_benchmark_type}")
                benchmark_type_dropdown.value = initial_benchmark_type

            with gr.Row():
                gr.HTML(
                    """
                    <div style="text-align:center; margin-top:20px; padding:10px;">
                        made with ❤️ by <a href="https://qualifire.ai" target="_blank">Qualifire</a>
                    </div>
                    """
                )

        return demo

    def _create_about_tab(self) -> None:
        """Create the About tab content"""
        gr.Markdown(
            """
            # About AI Evaluators Arena

            This platform allows you to evaluate and compare different AI judges in their ability to assess various types of content.

            ## How it works
            1. Choose a test type from the dropdown
            2. Fill in the input fields or load a random example from our dataset
            3. Click "Evaluate" to get assessments from two randomly selected judges
            4. Choose which evaluation you think is better
            5. See which judge provided each evaluation

            ## Test Types
            - **Grounding**: Evaluate if a claim is grounded in a given text
            - **Prompt Injections**: Detect attempts to manipulate or jailbreak the model
            - **Safety**: Identify harmful, offensive, or dangerous content
            - **Policy**: Determine if output complies with a given policy

            ## Leaderboard
            The leaderboard tracks judge performance using an ELO rating system, with scores adjusted based on human preferences.
            """
        )

    def _update_input_visibility(self, test_type):
        """Update which input fields are visible based on the selected test type"""
        if test_type == "grounding":
            return [
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
            ]
        elif test_type in ["prompt_injections", "safety"]:
            return [
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=True),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
            ]
        elif test_type == "policy":
            return [
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(visible=False),
                gr.update(visible=False),
            ]
        else:
            return [
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=True),
                gr.update(visible=True),
            ]
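

# A minimal, self-contained smoke test for the layout and event wiring. Every
# callback below is a stand-in with the arity the wiring in create_interface
# expects, returning placeholder values only; the real application injects its
# own evaluation, voting, and leaderboard logic instead.
if __name__ == "__main__":
    _empty_leaderboard = pd.DataFrame(
        columns=["Judge Name", "ELO Score", "Wins", "Losses", "Total Evaluations"]
    )

    def _stub_refresh(test_type):
        # One no-op update per input field wired to the refresh button.
        return tuple(gr.update() for _ in range(8))

    def _stub_submit(*values):
        # evaluation1, evaluation2, the 7 echoed inputs, and a status update.
        return ("pending...", "pending...") + tuple(values) + (gr.update(visible=False),)

    def _stub_evaluate1(*values):
        return "stub evaluation 1", gr.update(visible=True)

    def _stub_evaluate2(*values):
        return "stub evaluation 2", gr.update(visible=True), gr.update(visible=True)

    ui = UI(
        refresh_fn=_stub_refresh,
        submit_fn=_stub_submit,
        evaluate1_fn=_stub_evaluate1,
        evaluate2_fn=_stub_evaluate2,
        winner1_fn=lambda: "Evaluation 1 selected",
        winner2_fn=lambda: "Evaluation 2 selected",
        both_correct_fn=lambda: "Both evaluations marked correct",
        both_incorrect_fn=lambda: "Both evaluations marked incorrect",
        refresh_leaderboard_fn=lambda: _empty_leaderboard,
        leaderboard_df=_empty_leaderboard,
    )
    ui.create_interface().launch()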