import gradio as gr
import pandas as pd


def model_hyperlink_md(link: str, name: str) -> str:
    """Format a model name as a markdown hyperlink."""
    return f"[{name}]({link})"


def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the 'Model' column into markdown links and drop the raw 'Links' column."""
    if "Links" not in df.columns or "Model" not in df.columns:
        raise ValueError("CSV must include 'Model' and 'Links' columns.")
    df = df.copy()
    df["Model"] = df.apply(lambda r: model_hyperlink_md(r["Links"], r["Model"]), axis=1)
    return df.drop(columns=["Links"])


def datatypes_with_markdown(df: pd.DataFrame):
    """Render the 'Model' column as markdown (clickable links) and all other columns as text."""
    return ["markdown" if c == "Model" else "str" for c in df.columns]
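
# Illustration (not executed), using hypothetical values: for a row with
# Model="org/my-model" and Links="https://huggingface.co/org/my-model",
# make_clickable_and_drop_links() rewrites Model to
# "[org/my-model](https://huggingface.co/org/my-model)" and drops Links;
# datatypes_with_markdown() then marks the Model column as "markdown" so that
# gr.Dataframe renders it as a clickable link.
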
# ---------- load data ----------
BASE_CSV = "code_eval_board.csv"
INSTRUCT_CSV = "eval_instruct_lms.csv"
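# Both CSVs are assumed to provide at least "Model" and "Links" columns
# (see make_clickable_and_drop_links above); any remaining columns are treated
# as metric columns and displayed as plain text.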
base_df_raw = pd.read_csv(BASE_CSV)
inst_df_raw = pd.read_csv(INSTRUCT_CSV)
base_df = make_clickable_and_drop_links(base_df_raw)
inst_df = make_clickable_and_drop_links(inst_df_raw)
base_dtypes = datatypes_with_markdown(base_df)
inst_dtypes = datatypes_with_markdown(inst_df)
# ---------- css ----------
custom_css = """
.gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}
#base-table a, #inst-table a {
color: #2a7ae2 !important;
text-decoration: underline dotted !important;
text-underline-offset: 3px;
}
#base-table a:hover, #inst-table a:hover {
color: #1e5bbf !important;
text-decoration: underline solid !important;
}
"""
# ---------- app ----------
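# The #base-table / #inst-table selectors in custom_css target the elem_id
# values assigned to the two gr.Dataframe components below, so the link styling
# applies to the markdown links in their "Model" columns.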
demo = gr.Blocks(css=custom_css)
with demo:
    # ---------- HEADER ----------
    gr.HTML(
        """
        <h1 style="text-align: center;">⚖️ Diké Leaderboard</h1>
        <p style="text-align: center;">
            Evaluating the impact of quantization and compression on the bias, fairness,
            ethics, and safety of large language models.
        </p>
        """
    )

    # ---------- TABS ----------
    with gr.Tabs():
        # TAB 1: Base LLMs
        with gr.TabItem("🟢 Base LLMs Evaluation"):
            with gr.Row():
                base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)

            def base_search_fn(q):
                # Show the full table for an empty query; otherwise filter on the
                # model name (literal match, so regex metacharacters are harmless).
                if not q or not q.strip():
                    return base_df
                mask = base_df["Model"].str.contains(q.strip(), case=False, regex=False, na=False)
                return base_df[mask]

            base_table = gr.Dataframe(
                value=base_df,
                datatype=base_dtypes,
                interactive=False,
                sortable=True,
                elem_id="base-table",
            )
            base_search.submit(base_search_fn, base_search, base_table)

        # TAB 2: Instruction-tuned LLMs
        with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
            with gr.Row():
                inst_search = gr.Textbox(placeholder="🔍 Search instruction-tuned models...", show_label=False)

            def inst_search_fn(q):
                # Same filtering behavior as the base-model tab.
                if not q or not q.strip():
                    return inst_df
                mask = inst_df["Model"].str.contains(q.strip(), case=False, regex=False, na=False)
                return inst_df[mask]

            inst_table = gr.Dataframe(
                value=inst_df,
                datatype=inst_dtypes,
                interactive=False,
                sortable=True,
                elem_id="inst-table",
            )
            inst_search.submit(inst_search_fn, inst_search, inst_table)

        # TAB 3: About
        with gr.TabItem("📘 About"):
            gr.HTML(
                """
                <h2>📊 Benchmarks and Metrics Overview</h2>
                <p>
                    The Diké Leaderboard evaluates the impact of quantization and compression
                    on the bias, fairness, ethics, and safety of large language models (LLMs).
                    Each benchmark measures a specific social or ethical aspect of model behavior.
                </p>
                <ul>
                    <li><b>Perplexity (PPL)</b> - General measure of model fluency and language-modeling
                        quality, evaluated on WikiText-2. Lower values indicate better language-modeling
                        performance.</li>
                    <li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via
                        multiple-choice story completion (HellaSwag dataset). Metric: Accuracy.</li>
                    <li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated
                        question-answering contexts across 11 protected categories (BBQ dataset).
                        Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>
                    <li><b>CrowS-Pairs</b> - Minimal stereotype sentence pairs (CrowS-Pairs dataset).
                        Metric: % of stereotyped continuations.</li>
                    <li><b>HolisticBias</b> - 13 demographic axes with sentiment prompts
                        (HolisticBias dataset). Metric: Sentiment skew across identity descriptors.</li>
                    <li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender,
                        race, and disability (SoFA dataset). Metric: Variance of log-perplexity across
                        identity groups.</li>
                    <li><b>StereoSet</b> - Triplet format (stereotype, anti-stereotype, unrelated) across
                        gender, race, religion, and profession (StereoSet dataset).
                        Metrics: Stereotype Score, Language Modeling Score.</li>
                    <li><b>ETHICS</b> - Morality judgments across five ethical principles; we use the
                        Commonsense Morality subset (ETHICS dataset). Metric: Accuracy.</li>
                    <li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection
                        (Moral Stories dataset). Metrics: Moral preference accuracy, Refusal rate.</li>
                    <li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics
                        evaluation (Histoires Morales dataset). Metrics: Accuracy, Refusal rate.</li>
                    <li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts
                        (RealToxicityPrompts dataset). Metric: Average toxicity probability.</li>
                    <li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or
                        unethical prompts (HarmBench dataset). Metric: Unsafe response rate.</li>
                </ul>
                <p>
                    All evaluations are implemented via the LM Evaluation Harness and follow
                    consistent zero-shot protocols.
                </p>
                """
            )

    # ---------- FOOTER ----------
    gr.HTML(
        """
        <p><b>Notes</b></p>
        <ul>
            <li>Click column headers to sort ascending/descending.</li>
            <li>Model names are clickable links to Hugging Face pages.</li>
        </ul>
        <p>Part of the ⚖️ Diké Project.</p>
        """
    )
demo.launch(server_name="0.0.0.0", server_port=7860)