# Source: Hugging Face Space app.py (author: iproskurina, commit 10b67a4, verified)
import gradio as gr
import pandas as pd
def model_hyperlink_md(link: str, name: str) -> str:
    """Render *name* as a Markdown hyperlink pointing at *link*."""
    return "[{}]({})".format(name, link)
def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with 'Model' rewritten as a Markdown link and 'Links' dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Leaderboard table; must contain both a 'Links' column (URLs) and a
        'Model' column (display names).

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not mutated) where each 'Model' cell is
        ``[name](link)`` and the 'Links' column has been removed.

    Raises
    ------
    ValueError
        If either required column is missing.  (Previously a missing 'Model'
        column surfaced as an opaque KeyError from inside ``apply``.)
    """
    if "Links" not in df.columns:
        raise ValueError("CSV must include a 'Links' column.")
    if "Model" not in df.columns:
        raise ValueError("CSV must include a 'Model' column.")
    out = df.copy()
    # Vectorized over the two columns instead of a row-wise apply.
    out["Model"] = [
        f"[{name}]({link})" for link, name in zip(out["Links"], out["Model"])
    ]
    return out.drop(columns=["Links"])
def datatypes_with_markdown(df: pd.DataFrame):
    """Build the per-column datatype list for gr.Dataframe.

    The 'Model' column is rendered as Markdown (so its links are clickable);
    every other column is plain text.
    """
    dtypes = []
    for col in df.columns:
        dtypes.append("markdown" if col == "Model" else "str")
    return dtypes
# ---------- load data ----------
# Each CSV is one leaderboard tab; both must contain 'Model' and 'Links'
# columns (enforced by make_clickable_and_drop_links).  Read at import time,
# so the app fails fast if a file is missing.
BASE_CSV = "code_eval_board.csv"
INSTRUCT_CSV = "eval_instruct_lms.csv"
base_df_raw = pd.read_csv(BASE_CSV)
inst_df_raw = pd.read_csv(INSTRUCT_CSV)
# Turn each 'Model' cell into a Markdown hyperlink and drop the raw URL column.
base_df = make_clickable_and_drop_links(base_df_raw)
inst_df = make_clickable_and_drop_links(inst_df_raw)
# Per-column datatype lists so gr.Dataframe renders the Model links as Markdown.
base_dtypes = datatypes_with_markdown(base_df)
inst_dtypes = datatypes_with_markdown(inst_df)
# ---------- css ----------
# Styles the model-name links inside the two tables; the #base-table and
# #inst-table ids are assigned via elem_id on the gr.Dataframe components below.
custom_css = """
.gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}
#base-table a, #inst-table a {
color: #2a7ae2 !important;
text-decoration: underline dotted !important;
text-underline-offset: 3px;
}
#base-table a:hover, #inst-table a:hover {
color: #1e5bbf !important;
text-decoration: underline solid !important;
}
"""
# ---------- app ----------


def _search_models(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Filter *df* to rows whose 'Model' cell contains *query*, case-insensitively.

    An empty or whitespace-only query returns the full table.  The match is a
    literal substring match (``regex=False``) so user input such as ``(`` or
    ``llama+`` cannot raise ``re.error``; ``na=False`` excludes rows with
    missing model names instead of propagating NaN into the boolean mask.
    Shared by both leaderboard tabs (previously duplicated per tab).
    """
    if not query or not query.strip():
        return df
    mask = df["Model"].str.contains(query, case=False, regex=False, na=False)
    return df[mask]


demo = gr.Blocks(css=custom_css)

with demo:
    # ---------- HEADER ----------
    gr.HTML(
        """<div id='header' style='text-align:center; margin-top:16px;'>
        <div id='title-row'
             style='display:flex; align-items:center; justify-content:center; gap:16px; flex-wrap:wrap;'>
          <img src='https://legendaryladieshub.com/wp-content/uploads/2023/12/Dike_Greek-goddess-of-justice-and-moral-order_by-LLH-300x300.jpeg'
               alt='Diké' width='80'
               style='border-radius:50%; object-fit:cover; box-shadow:0 0 8px rgba(0,0,0,0.4); background:transparent;'>
          <div style='display:flex; flex-direction:column; align-items:center; text-align:center;'>
            <h1 style='font-size:30px; margin:0; font-weight:650;'>Open Diké Leaderboard</h1>
            <p style='font-size:18px; margin:4px 0; color:#6c7a89;'>
              Bias and Fairness in Compressed LLMs
            </p>
          </div>
        </div>
        <p id='subtitle'
           style='font-size:14px; color:#8a9aad; margin-top:12px;
                  max-width:1000px; margin-left:auto; margin-right:auto;
                  line-height:1.6; text-align:justify;'>
          Inspired by
          <a href='https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/'
             target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>
            🤗 Open LLM Leaderboard
          </a> and
          <a href='https://huggingface.co/spaces/optimum/llm-perf-leaderboard'
             target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>
            Optimum Leaderboard 🏋️
          </a>, we compare the performance of compressed LLMs across
          <b>fairness</b>, <b>toxicity</b>, <b>ethics</b>, and <b>safety</b> benchmarks. The leaderboard is released as part of the
          <a href='https://www.anr-dike.fr/' target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>⚖️ Diké Project</a>.
        </p>
        </div>"""
    )

    # ---------- TABS ----------
    with gr.Tabs():
        # TAB 1: Base LLMs
        with gr.TabItem("🟢 Base LLMs Evaluation"):
            with gr.Row():
                base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)
            base_table = gr.Dataframe(
                value=base_df,
                datatype=base_dtypes,
                interactive=False,
                # NOTE(review): confirm the pinned gradio version accepts a
                # 'sortable' kwarg; some releases sort via column headers only.
                sortable=True,
                elem_id="base-table",
            )
            # Pressing Enter in the search box re-renders the filtered table.
            base_search.submit(lambda q: _search_models(base_df, q), base_search, base_table)

        # TAB 2: Instruction-tuned LLMs
        with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
            with gr.Row():
                inst_search = gr.Textbox(
                    placeholder="🔍 Search instruction-tuned models...", show_label=False
                )
            inst_table = gr.Dataframe(
                value=inst_df,
                datatype=inst_dtypes,
                interactive=False,
                sortable=True,
                elem_id="inst-table",
            )
            inst_search.submit(lambda q: _search_models(inst_df, q), inst_search, inst_table)

        # ---------- TAB 3: About ----------
        with gr.TabItem("📘 About"):
            gr.HTML("""
            <div style='max-width:900px; margin:0 auto; text-align:justify; color:#4b5563; line-height:1.6;'>
              <h3 style='text-align:center;'>📊 Benchmarks and Metrics Overview</h3>
              <p>
                The Diké Leaderboard evaluates the impact of <b>quantization</b> and <b>compression</b>
                on <b>bias, fairness, ethics, and safety</b> of large language models (LLMs).
                Each benchmark measures a specific social or ethical aspect of model behavior.
              </p>
              <ul style='list-style-type: " "; padding-left: 1em;'>
                <li><b>Perplexity (PPL)</b> - General measure of model fluency and language modeling quality,
                    evaluated on <a href='https://huggingface.co/datasets/Salesforce/wikitext' target='_blank' style='color:#2563eb;'>WikiText-2</a>.
                    Lower values indicate better language modeling performance.</li>
                <li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via multiple-choice story completion.
                    Dataset: <a href='https://huggingface.co/datasets/Rowan/hellaswag' target='_blank' style='color:#2563eb;'>HellaSwag</a>.
                    Metric: Accuracy.</li>
                <li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated
                    question-answering contexts across 11 protected categories
                    (<a href='https://github.com/nyu-mll/BBQ' target='_blank' style='color:#2563eb;'>BBQ dataset</a>).
                    Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>
                <li><b>CrowS-Pairs</b> - Minimal stereotype pairs.
                    (<a href='https://huggingface.co/datasets/nyu-mll/crows_pairs' target='_blank' style='color:#2563eb;'>CrowS-Pairs dataset</a>).
                    Metric: % of stereotyped continuations.</li>
                <li><b>HolisticBias</b> - 13 demographic axes with sentiment prompts
                    (<a href='https://huggingface.co/datasets/fairnlp/holistic-bias' target='_blank' style='color:#2563eb;'>HolisticBias dataset</a>).
                    Metric: Sentiment skew across identity descriptors.</li>
                <li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender, race, and disability
                    (<a href='https://huggingface.co/datasets/copenlu/sofa' target='_blank' style='color:#2563eb;'>SoFA dataset</a>).
                    Metric: Variance of log-perplexity across identity groups.</li>
                <li><b>StereoSet</b> - Triplet format (stereotype, anti-stereotype, unrelated)
                    across gender, race, religion, profession
                    (<a href='https://github.com/moinnadeem/StereoSet' target='_blank' style='color:#2563eb;'>StereoSet dataset</a>).
                    Metric: Stereotype Score, Language Modeling Score.</li>
                <li><b>ETHICS</b> - Morality judgments across five ethical principles;
                    we use the <i>Commonsense Morality</i> subset
                    (<a href='https://huggingface.co/datasets/hendrycks/ethics' target='_blank' style='color:#2563eb;'>ETHICS dataset</a>).
                    Metric: Accuracy.</li>
                <li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection
                    (<a href='https://huggingface.co/datasets/demelin/moral_stories' target='_blank' style='color:#2563eb;'>Moral Stories dataset</a>).
                    Metrics: Moral preference Accuracy, Refusal rate.</li>
                <li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics evaluation.
                    (<a href='https://huggingface.co/datasets/LabHC/histoires_morales' target='_blank' style='color:#2563eb;'>Histoires Morales dataset</a>).
                    Metric: Accuracy, Refusal rate.</li>
                <li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts
                    (<a href='https://huggingface.co/datasets/allenai/real-toxicity-prompts' target='_blank' style='color:#2563eb;'>RealToxicityPrompts</a>).
                    Metric: Average toxicity probability.</li>
                <li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or unethical prompts
                    (<a href='https://huggingface.co/datasets/walledai/HarmBench' target='_blank' style='color:#2563eb;'>HarmBench</a>).
                    Metric: Unsafe response rate.</li>
              </ul>
              <p style='margin-top:1.5em;'>
                All evaluations are implemented via the
                <a href='https://github.com/EleutherAI/lm-evaluation-harness'
                   target='_blank' style='color:#5a8dee;'>LM Evaluation Harness</a>
                and follow consistent zero-shot protocols.
              </p>
            </div>
            """)

    # Footer notes, shown below the tabs.
    gr.HTML(
        """
        <div style='text-align:center; margin-top:30px; font-size:14px; color:#777;'>
          <b>Notes</b><br>
          • Click column headers to sort ascending/descending<br>
          • Model names are clickable links to Hugging Face pages<br><br>
          Part of the <a href='https://www.anr-dike.fr/' target='_blank' style='color:#5a8dee;'>⚖️ Diké Project</a>.
        </div>
        """
    )

# Bind to all interfaces on the standard Hugging Face Spaces port.
demo.launch(server_name="0.0.0.0", server_port=7860)