import gradio as gr
import pandas as pd


def model_hyperlink_md(link: str, name: str) -> str:
    """Render a model name as a markdown hyperlink."""
    return f"[{name}]({link})"


def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
    """Replace 'Model' entries with markdown links and drop the raw 'Links' column."""
    if "Links" not in df.columns:
        raise ValueError("CSV must include a 'Links' column.")
    df = df.copy()
    df["Model"] = df.apply(lambda r: model_hyperlink_md(r["Links"], r["Model"]), axis=1)
    return df.drop(columns=["Links"])


def datatypes_with_markdown(df: pd.DataFrame):
    """Mark the 'Model' column as markdown (so links render); all other columns stay plain strings."""
    return ["markdown" if c == "Model" else "str" for c in df.columns]
# ---------- load data ----------
BASE_CSV = "code_eval_board.csv"
INSTRUCT_CSV = "eval_instruct_lms.csv"

base_df_raw = pd.read_csv(BASE_CSV)
inst_df_raw = pd.read_csv(INSTRUCT_CSV)

base_df = make_clickable_and_drop_links(base_df_raw)
inst_df = make_clickable_and_drop_links(inst_df_raw)

base_dtypes = datatypes_with_markdown(base_df)
inst_dtypes = datatypes_with_markdown(inst_df)
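
# For a frame with columns ["Model", "PPL", "Accuracy"] (hypothetical headers),
# datatypes_with_markdown returns ["markdown", "str", "str"], so only the
# 'Model' column is rendered as markdown links inside gr.Dataframe.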
# ---------- css ----------
custom_css = """
.gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}

#base-table a, #inst-table a {
    color: #2a7ae2 !important;
    text-decoration: underline dotted !important;
    text-underline-offset: 3px;
}

#base-table a:hover, #inst-table a:hover {
    color: #1e5bbf !important;
    text-decoration: underline solid !important;
}
"""
# ---------- app ----------
with gr.Blocks(css=custom_css) as demo:
    # ---------- HEADER ----------
    gr.HTML(
        """<div id='header' style='text-align:center; margin-top:16px;'>
        <div id='title-row'
             style='display:flex; align-items:center; justify-content:center; gap:16px; flex-wrap:wrap;'>
          <img src='https://legendaryladieshub.com/wp-content/uploads/2023/12/Dike_Greek-goddess-of-justice-and-moral-order_by-LLH-300x300.jpeg'
               alt='Diké' width='80'
               style='border-radius:50%; object-fit:cover; box-shadow:0 0 8px rgba(0,0,0,0.4); background:transparent;'>
          <div style='display:flex; flex-direction:column; align-items:center; text-align:center;'>
            <h1 style='font-size:30px; margin:0; font-weight:650;'>Open Diké Leaderboard</h1>
            <p style='font-size:18px; margin:4px 0; color:#6c7a89;'>
              Bias and Fairness in Compressed LLMs
            </p>
          </div>
        </div>
        <p id='subtitle'
           style='font-size:14px; color:#8a9aad; margin-top:12px;
                  max-width:1000px; margin-left:auto; margin-right:auto;
                  line-height:1.6; text-align:justify;'>
          Inspired by the
          <a href='https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/'
             target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>
            🤗 Open LLM Leaderboard
          </a> and the
          <a href='https://huggingface.co/spaces/optimum/llm-perf-leaderboard'
             target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>
            Optimum Leaderboard 🏋️
          </a>, we compare the performance of compressed LLMs across
          <b>fairness</b>, <b>toxicity</b>, <b>ethics</b>, and <b>safety</b> benchmarks.
          The leaderboard is released as part of the
          <a href='https://www.anr-dike.fr/' target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>⚖️ Diké Project</a>.
        </p>
        </div>"""
    )
    # ---------- TABS ----------
    with gr.Tabs():
        # TAB 1: Base LLMs
        with gr.TabItem("🟢 Base LLMs Evaluation"):
            with gr.Row():
                base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)

            def base_search_fn(q):
                # An empty query restores the full table; regex=False keeps
                # special characters (e.g. '(' or '+') from raising.
                if not q or not q.strip():
                    return base_df
                mask = base_df["Model"].str.contains(q, case=False, regex=False)
                return base_df[mask]

            base_table = gr.Dataframe(
                value=base_df,
                datatype=base_dtypes,
                interactive=False,
                sortable=True,
                elem_id="base-table",
            )
            base_search.submit(base_search_fn, base_search, base_table)
        # TAB 2: Instruction-tuned LLMs
        with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
            with gr.Row():
                inst_search = gr.Textbox(placeholder="🔍 Search instruction-tuned models...", show_label=False)

            def inst_search_fn(q):
                if not q or not q.strip():
                    return inst_df
                mask = inst_df["Model"].str.contains(q, case=False, regex=False)
                return inst_df[mask]

            inst_table = gr.Dataframe(
                value=inst_df,
                datatype=inst_dtypes,
                interactive=False,
                sortable=True,
                elem_id="inst-table",
            )
            inst_search.submit(inst_search_fn, inst_search, inst_table)
        # TAB 3: About
        with gr.TabItem("📘 About"):
            gr.HTML("""
            <div style='max-width:900px; margin:0 auto; text-align:justify; color:#4b5563; line-height:1.6;'>
              <h3 style='text-align:center;'>📊 Benchmarks and Metrics Overview</h3>
              <p>
                The Diké Leaderboard evaluates the impact of <b>quantization</b> and <b>compression</b>
                on the <b>bias, fairness, ethics, and safety</b> of large language models (LLMs).
                Each benchmark measures a specific social or ethical aspect of model behavior.
              </p>
              <ul style='list-style-type: " "; padding-left: 1em;'>
                <li><b>Perplexity (PPL)</b> - General measure of model fluency and language-modeling quality,
                  evaluated on <a href='https://huggingface.co/datasets/Salesforce/wikitext' target='_blank' style='color:#2563eb;'>WikiText-2</a>.
                  Lower values indicate better language modeling performance.</li>
                <li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via multiple-choice story completion.
                  Dataset: <a href='https://huggingface.co/datasets/Rowan/hellaswag' target='_blank' style='color:#2563eb;'>HellaSwag</a>.
                  Metric: Accuracy.</li>
                <li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated
                  question-answering contexts across 11 protected categories
                  (<a href='https://github.com/nyu-mll/BBQ' target='_blank' style='color:#2563eb;'>BBQ dataset</a>).
                  Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>
                <li><b>CrowS-Pairs</b> - Minimal pairs contrasting stereotyped and anti-stereotyped sentences
                  (<a href='https://huggingface.co/datasets/nyu-mll/crows_pairs' target='_blank' style='color:#2563eb;'>CrowS-Pairs dataset</a>).
                  Metric: % of pairs where the model prefers the stereotyped sentence.</li>
                <li><b>HolisticBias</b> - Sentiment prompts spanning 13 demographic axes
                  (<a href='https://huggingface.co/datasets/fairnlp/holistic-bias' target='_blank' style='color:#2563eb;'>HolisticBias dataset</a>).
                  Metric: Sentiment skew across identity descriptors.</li>
                <li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender, race, and disability
                  (<a href='https://huggingface.co/datasets/copenlu/sofa' target='_blank' style='color:#2563eb;'>SoFA dataset</a>).
                  Metric: Variance of log-perplexity across identity groups.</li>
                <li><b>StereoSet</b> - Triplets (stereotype, anti-stereotype, unrelated)
                  across gender, race, religion, and profession
                  (<a href='https://github.com/moinnadeem/StereoSet' target='_blank' style='color:#2563eb;'>StereoSet dataset</a>).
                  Metrics: Stereotype Score, Language Modeling Score.</li>
                <li><b>ETHICS</b> - Morality judgments across five ethical principles;
                  we use the <i>Commonsense Morality</i> subset
                  (<a href='https://huggingface.co/datasets/hendrycks/ethics' target='_blank' style='color:#2563eb;'>ETHICS dataset</a>).
                  Metric: Accuracy.</li>
                <li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection
                  (<a href='https://huggingface.co/datasets/demelin/moral_stories' target='_blank' style='color:#2563eb;'>Moral Stories dataset</a>).
                  Metrics: Moral preference accuracy, Refusal rate.</li>
                <li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics evaluation
                  (<a href='https://huggingface.co/datasets/LabHC/histoires_morales' target='_blank' style='color:#2563eb;'>Histoires Morales dataset</a>).
                  Metrics: Accuracy, Refusal rate.</li>
                <li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts
                  (<a href='https://huggingface.co/datasets/allenai/real-toxicity-prompts' target='_blank' style='color:#2563eb;'>RealToxicityPrompts</a>).
                  Metric: Average toxicity probability.</li>
                <li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or unethical prompts
                  (<a href='https://huggingface.co/datasets/walledai/HarmBench' target='_blank' style='color:#2563eb;'>HarmBench</a>).
                  Metric: Unsafe response rate.</li>
              </ul>
              <p style='margin-top:1.5em;'>
                All evaluations are implemented via the
                <a href='https://github.com/EleutherAI/lm-evaluation-harness'
                   target='_blank' style='color:#5a8dee;'>LM Evaluation Harness</a>
                and follow consistent zero-shot protocols.
              </p>
            </div>
            """)
    # ---------- footer notes ----------
    gr.HTML(
        """
        <div style='text-align:center; margin-top:30px; font-size:14px; color:#777;'>
            <b>Notes</b><br>
            • Click column headers to sort ascending/descending<br>
            • Model names are clickable links to Hugging Face pages<br><br>
            Part of the <a href='https://www.anr-dike.fr/' target='_blank' style='color:#5a8dee;'>⚖️ Diké Project</a>.
        </div>
        """
    )

demo.launch(server_name="0.0.0.0", server_port=7860)
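
# To serve the app locally (a sketch; assumes this file is saved as app.py,
# the conventional entry point for a Gradio Space):
#   pip install gradio pandas
#   python app.py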