import gradio as gr
import pandas as pd


def model_hyperlink_md(link: str, name: str) -> str:
    """Format a model name as a markdown hyperlink."""
    return f"[{name}]({link})"


def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the 'Model' column into markdown links and drop the raw 'Links' column."""
    if "Links" not in df.columns or "Model" not in df.columns:
        raise ValueError("CSV must include 'Model' and 'Links' columns.")
    df = df.copy()
    df["Model"] = df.apply(lambda r: model_hyperlink_md(r["Links"], r["Model"]), axis=1)
    return df.drop(columns=["Links"])


def datatypes_with_markdown(df: pd.DataFrame):
    """Render the 'Model' column as markdown (clickable links) and all other columns as text."""
    return ["markdown" if c == "Model" else "str" for c in df.columns]
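
# Illustration (not executed), using hypothetical values: for a row with
# Model="org/my-model" and Links="https://huggingface.co/org/my-model",
# make_clickable_and_drop_links() rewrites Model to
# "[org/my-model](https://huggingface.co/org/my-model)" and drops Links;
# datatypes_with_markdown() then marks the Model column as "markdown" so that
# gr.Dataframe renders it as a clickable link.
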
# ---------- load data ----------
BASE_CSV = "code_eval_board.csv"
INSTRUCT_CSV = "eval_instruct_lms.csv"
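# Both CSVs are assumed to provide at least "Model" and "Links" columns
# (see make_clickable_and_drop_links above); any remaining columns are treated
# as metric columns and displayed as plain text.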
base_df_raw = pd.read_csv(BASE_CSV)
inst_df_raw = pd.read_csv(INSTRUCT_CSV)
base_df = make_clickable_and_drop_links(base_df_raw)
inst_df = make_clickable_and_drop_links(inst_df_raw)
base_dtypes = datatypes_with_markdown(base_df)
inst_dtypes = datatypes_with_markdown(inst_df)
# ---------- css ----------
custom_css = """
.gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}
#base-table a, #inst-table a {
color: #2a7ae2 !important;
text-decoration: underline dotted !important;
text-underline-offset: 3px;
}
#base-table a:hover, #inst-table a:hover {
color: #1e5bbf !important;
text-decoration: underline solid !important;
}
"""
# ---------- app ----------
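# The #base-table / #inst-table selectors in custom_css target the elem_id
# values assigned to the two gr.Dataframe components below, so the link styling
# applies to the markdown links in their "Model" columns.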
demo = gr.Blocks(css=custom_css)
with demo:
    # ---------- HEADER ----------
    gr.HTML(
        """
        <h1 style="text-align: center;">⚖️ Diké Leaderboard</h1>
        <p style="text-align: center;">
            Evaluating the impact of quantization and compression on the bias, fairness,
            ethics, and safety of large language models.
        </p>
        """
    )

    # ---------- TABS ----------
    with gr.Tabs():
        # TAB 1: Base LLMs
        with gr.TabItem("🟢 Base LLMs Evaluation"):
            with gr.Row():
                base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)

            def base_search_fn(q):
                # Show the full table for an empty query; otherwise filter on the
                # model name (literal match, so regex metacharacters are harmless).
                if not q or not q.strip():
                    return base_df
                mask = base_df["Model"].str.contains(q.strip(), case=False, regex=False, na=False)
                return base_df[mask]

            base_table = gr.Dataframe(
                value=base_df,
                datatype=base_dtypes,
                interactive=False,
                sortable=True,
                elem_id="base-table",
            )
            base_search.submit(base_search_fn, base_search, base_table)

        # TAB 2: Instruction-tuned LLMs
        with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
            with gr.Row():
                inst_search = gr.Textbox(placeholder="🔍 Search instruction-tuned models...", show_label=False)

            def inst_search_fn(q):
                # Same filtering behavior as the base-model tab.
                if not q or not q.strip():
                    return inst_df
                mask = inst_df["Model"].str.contains(q.strip(), case=False, regex=False, na=False)
                return inst_df[mask]

            inst_table = gr.Dataframe(
                value=inst_df,
                datatype=inst_dtypes,
                interactive=False,
                sortable=True,
                elem_id="inst-table",
            )
            inst_search.submit(inst_search_fn, inst_search, inst_table)

        # TAB 3: About
        with gr.TabItem("📘 About"):
            gr.HTML(
                """
                <h2>📊 Benchmarks and Metrics Overview</h2>
                <p>
                    The Diké Leaderboard evaluates the impact of quantization and compression
                    on the bias, fairness, ethics, and safety of large language models (LLMs).
                    Each benchmark measures a specific social or ethical aspect of model behavior.
                </p>
                <ul>
                    <li><b>Perplexity (PPL)</b> - General measure of model fluency and language-modeling
                        quality, evaluated on WikiText-2. Lower values indicate better language-modeling
                        performance.</li>
                    <li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via
                        multiple-choice story completion (HellaSwag dataset). Metric: Accuracy.</li>
                    <li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated
                        question-answering contexts across 11 protected categories (BBQ dataset).
                        Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>
                    <li><b>CrowS-Pairs</b> - Minimal stereotype sentence pairs (CrowS-Pairs dataset).
                        Metric: % of stereotyped continuations.</li>
                    <li><b>HolisticBias</b> - 13 demographic axes with sentiment prompts
                        (HolisticBias dataset). Metric: Sentiment skew across identity descriptors.</li>
                    <li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender,
                        race, and disability (SoFA dataset). Metric: Variance of log-perplexity across
                        identity groups.</li>
                    <li><b>StereoSet</b> - Triplet format (stereotype, anti-stereotype, unrelated) across
                        gender, race, religion, and profession (StereoSet dataset).
                        Metrics: Stereotype Score, Language Modeling Score.</li>
                    <li><b>ETHICS</b> - Morality judgments across five ethical principles; we use the
                        Commonsense Morality subset (ETHICS dataset). Metric: Accuracy.</li>
                    <li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection
                        (Moral Stories dataset). Metrics: Moral preference accuracy, Refusal rate.</li>
                    <li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics
                        evaluation (Histoires Morales dataset). Metrics: Accuracy, Refusal rate.</li>
                    <li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts
                        (RealToxicityPrompts dataset). Metric: Average toxicity probability.</li>
                    <li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or
                        unethical prompts (HarmBench dataset). Metric: Unsafe response rate.</li>
                </ul>
                <p>
                    All evaluations are implemented via the LM Evaluation Harness and follow
                    consistent zero-shot protocols.
                </p>
                """
            )

    # ---------- FOOTER ----------
    gr.HTML(
        """
        <p><b>Notes</b></p>
        <ul>
            <li>Click column headers to sort ascending/descending.</li>
            <li>Model names are clickable links to Hugging Face pages.</li>
        </ul>
        <p>Part of the ⚖️ Diké Project.</p>
        """
    )
demo.launch(server_name="0.0.0.0", server_port=7860)