Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| from src.about import ( | |
| CITATION_BUTTON_LABEL, | |
| CITATION_BUTTON_TEXT, | |
| TITLE, | |
| ) | |
| from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO | |
| from src.populate import get_leaderboard_df | |
| from src.utils import initialize_file | |
# --- Data -------------------------------------------------------------------
# Prepare the results file, then load the leaderboard table from the
# ``results.tsv`` file under EVAL_RESULTS_PATH.
initialize_file(project_repo=RESULTS_REPO, file_path=EVAL_RESULTS_PATH)
LEADERBOARD_DF = get_leaderboard_df(f"{EVAL_RESULTS_PATH}/results.tsv")
columns = LEADERBOARD_DF.columns.tolist()

# --- Filter choices ---------------------------------------------------------
# Columns that may be toggled: every column except the ones listed as
# unselectable.  ``list.remove`` is used on purpose so that a results file
# without a "model" column fails loudly here instead of later.
unselectable_columns = ["model"]
select_column_choices = columns.copy()
for hidden_column in unselectable_columns:
    select_column_choices.remove(hidden_column)

# Distinct models / tasks present in the results, in order of first appearance.
filter_model_choices = LEADERBOARD_DF["model"].unique().tolist()
filter_task_choices = LEADERBOARD_DF["task"].unique().tolist()

# Fixed list of skill names offered in the "Select Skills" filter.
filter_skill_choices = [
    "Dialogue",
    "Long Context",
    "Numerical Reasoning",
    "Question Answering",
    "Summarisation",
    "Tabular Reasoning",
]

# --- App shell --------------------------------------------------------------
demo = gr.Blocks(theme=gr.themes.Monochrome())
def _filter_row(label, choices, checkbox_id, select_label, select_id, deselect_label, deselect_id):
    """Lay out one filter row: a checkbox group next to select/deselect buttons.

    Intended to be called from inside the ``with demo:`` block below, so the
    created components are attached to that layout.

    Args:
        label: Label shown above the checkbox group.
        choices: Options of the checkbox group; all of them start selected.
        checkbox_id: ``elem_id`` of the checkbox group.
        select_label: Caption of the "select all" button.
        select_id: ``elem_id`` of the "select all" button.
        deselect_label: Caption of the "deselect all" (clear) button.
        deselect_id: ``elem_id`` of the "deselect all" (clear) button.

    Returns:
        A ``(checkbox_group, select_all_button, deselect_all_button)`` tuple.
    """
    with gr.Row():
        checkbox_group = gr.CheckboxGroup(
            label=label,
            choices=choices,
            value=choices,
            interactive=True,
            elem_id=checkbox_id,
            scale=6,
        )
        with gr.Column():
            select_all_button = gr.Button(
                value=select_label,
                elem_id=select_id,
                size="sm",
                scale=1,
            )
            # The clear button is bound to the checkbox group, so clicking it
            # empties the selection without an explicit handler.
            deselect_all_button = gr.ClearButton(
                checkbox_group,
                value=deselect_label,
                elem_id=deselect_id,
                size="sm",
                scale=1,
            )
    return checkbox_group, select_all_button, deselect_all_button


# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm that the Citation row belongs
# outside the Tabs container.
with demo:
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a collection of AveniBench results - a permissively licensed benchmark that tests a group of six key "
        "finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, "
        "summarisation and dialogue.",
        elem_classes="markdown-text",
    )
    gr.Markdown(
        "Open an issue or contact the Authors to include your model into the leaderboard.",
        elem_classes="markdown-text",
    )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 AveniBench Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            # Task filter.
            filter_task, select_all_tasks, deselect_all_tasks = _filter_row(
                label="Select Tasks",
                choices=filter_task_choices,
                checkbox_id="filter_task",
                select_label="Select all tasks",
                select_id="select-all-tasks",
                deselect_label="Deselect all tasks",
                deselect_id="deselect-all-tasks",
            )
            # Skill filter.  NOTE(review): the elem_id says "language", which
            # looks like a leftover from another leaderboard; kept unchanged in
            # case external CSS/JS targets it.
            filter_skills, select_all_skills, deselect_all_skills = _filter_row(
                label="Select Skills",
                choices=filter_skill_choices,
                checkbox_id="filter-language",
                select_label="Select all skills",
                select_id="select-all-skills",
                deselect_label="Deselect all skills",
                deselect_id="deselect-all-skills",
            )
            # Read-only results table, seeded with the unfiltered data.
            with gr.Column():
                leaderboard_table = gr.Dataframe(
                    value=LEADERBOARD_DF,
                    interactive=False,
                    type="pandas",
                    visible=True,
                    label="Leaderboard",
                    elem_id="leaderboard-title",
                )

    # Collapsed citation box with a copy button.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
def update_leaderboard(filter_task_items, filter_skills_items):
    """Rebuild the leaderboard table for the selected tasks and skills.

    Reads the module-level ``LEADERBOARD_DF`` (long format; the columns used
    here are ``model``, ``task``, ``skill`` and ``score``) and returns a wide
    table with one row per model, ranked by Borda count.

    Args:
        filter_task_items: Task names to keep; rows whose ``task`` is not
            listed are dropped.
        filter_skills_items: Skill names to keep; a row is kept when at least
            one of them is contained in its ``skill`` value.

    Returns:
        A DataFrame with the columns ``model``, ``Borda Count`` (1 is best)
        and one score column per remaining task (ordered by task name,
        rounded to two decimals), sorted by ``Borda Count``.  When either
        selection is empty, or no row matches both selections, the frame has
        no rows and only the ``model`` and ``Borda Count`` columns.
    """
    # Nothing selected on one of the filters: show an empty table.
    if not filter_task_items or not filter_skills_items:
        return pd.DataFrame([], columns=["model", "Borda Count"])

    def has_selected_skill(tags):
        return any(skill in tags for skill in filter_skills_items)

    # Filter by task first, then by skill on the remaining rows only.
    in_tasks = LEADERBOARD_DF[LEADERBOARD_DF["task"].isin(filter_task_items)]
    skill_mask = pd.Series(
        [has_selected_skill(tags) for tags in in_tasks["skill"]],
        index=in_tasks.index,
        dtype=bool,  # explicit dtype keeps an empty mask boolean
    )
    scores = in_tasks.loc[skill_mask, ["model", "task", "score"]].copy()

    # The two selections can be disjoint (e.g. a task that does not carry any
    # of the chosen skills).  Pivoting an empty frame has no "borda-score"
    # columns to sum, so return the empty table instead of failing.
    if scores.empty:
        return pd.DataFrame([], columns=["model", "Borda Count"])

    # Borda points: within each task a model earns one point for every other
    # entry that does not score higher (ties get the larger rank).  Missing
    # scores earn no points.
    scores["borda-score"] = (
        scores.groupby("task")["score"].rank(ascending=True, method="max").sub(1).fillna(0)
    )

    # One row per model, one column per task for both points and raw scores.
    # NOTE(review): assumes at most one row per (model, task) pair -- confirm
    # against the results file.
    wide = scores.pivot(index="model", columns="task", values=["borda-score", "score"])
    borda_total = wide["borda-score"].sum(axis=1)

    # Reorder the score columns by task name.  Selecting by label keeps each
    # header attached to its own data, unlike overwriting the headers with a
    # separately sorted list.
    task_scores = wide["score"]
    board = task_scores[sorted(task_scores.columns)].rename_axis(None, axis="columns")

    # Highest point total gets Borda count 1; ties share the best position.
    board.insert(0, "borda-count", borda_total.rank(ascending=False, method="min"))
    board = board.reset_index().sort_values(by="borda-count", ascending=True)

    # Display names.
    board = board.rename(columns={
        "borda-count": "Borda Count",
        "MultiHiertt EASY": "MHiertt EASY",
        "MultiHiertt HARD": "MHiertt HARD",
    })

    # Round every score column; "model" and "Borda Count" are left as they are.
    score_columns = [col for col in board.columns if col not in ("model", "Borda Count")]
    return board.round({col: 2 for col in score_columns})
# Event wiring.  Listeners are registered inside the Blocks context, so the
# context of ``demo`` is entered (again) for this section.
with demo:
    inputs = [filter_task, filter_skills]
    outputs = [leaderboard_table]

    # Recompute the table whenever either filter changes; the callback always
    # receives the current value of both filters.
    for component in inputs:
        component.change(
            fn=update_leaderboard,
            inputs=inputs,
            outputs=outputs,
        )

    # "Select all" buttons restore the full choice list; the resulting value
    # change then triggers the handlers registered above.
    select_all_tasks.click(lambda: filter_task_choices, inputs=[], outputs=[filter_task])
    select_all_skills.click(lambda: filter_skill_choices, inputs=[], outputs=[filter_skills])

    # Compute the ranked table once when the page loads.  Uses the bound
    # ``demo.load`` listener rather than calling it through the class with an
    # explicit ``block=`` keyword.
    demo.load(
        fn=update_leaderboard,
        inputs=inputs,
        outputs=outputs,
    )

demo.queue().launch()