import gradio as gr
import pandas as pd


def model_hyperlink_md(link: str, name: str) -> str:
    """Render a model name as a markdown hyperlink."""
    return f"[{name}]({link})"


def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
    """Replace 'Model' entries with markdown links and drop the raw 'Links' column."""
    if "Links" not in df.columns:
        raise ValueError("CSV must include a 'Links' column.")
    df = df.copy()
    df["Model"] = df.apply(lambda r: model_hyperlink_md(r["Links"], r["Model"]), axis=1)
    return df.drop(columns=["Links"])


def datatypes_with_markdown(df: pd.DataFrame):
    """Mark the 'Model' column as markdown (so links render); all other columns stay plain strings."""
    return ["markdown" if c == "Model" else "str" for c in df.columns]
# ---------- load data ----------
BASE_CSV = "code_eval_board.csv"
INSTRUCT_CSV = "eval_instruct_lms.csv"

base_df_raw = pd.read_csv(BASE_CSV)
inst_df_raw = pd.read_csv(INSTRUCT_CSV)

base_df = make_clickable_and_drop_links(base_df_raw)
inst_df = make_clickable_and_drop_links(inst_df_raw)

base_dtypes = datatypes_with_markdown(base_df)
inst_dtypes = datatypes_with_markdown(inst_df)
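
# For a frame with columns ["Model", "PPL", "Accuracy"] (hypothetical headers),
# datatypes_with_markdown returns ["markdown", "str", "str"], so only the
# 'Model' column is rendered as markdown links inside gr.Dataframe.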
# ---------- css ----------
custom_css = """
.gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}

#base-table a, #inst-table a {
    color: #2a7ae2 !important;
    text-decoration: underline dotted !important;
    text-underline-offset: 3px;
}

#base-table a:hover, #inst-table a:hover {
    color: #1e5bbf !important;
    text-decoration: underline solid !important;
}
"""
# ---------- app ----------
with gr.Blocks(css=custom_css) as demo:
    # ---------- HEADER ----------
    gr.HTML(
        """<div id='header' style='text-align:center; margin-top:16px;'>
        <div id='title-row'
             style='display:flex; align-items:center; justify-content:center; gap:16px; flex-wrap:wrap;'>
          <img src='https://legendaryladieshub.com/wp-content/uploads/2023/12/Dike_Greek-goddess-of-justice-and-moral-order_by-LLH-300x300.jpeg'
               alt='Diké' width='80'
               style='border-radius:50%; object-fit:cover; box-shadow:0 0 8px rgba(0,0,0,0.4); background:transparent;'>
          <div style='display:flex; flex-direction:column; align-items:center; text-align:center;'>
            <h1 style='font-size:30px; margin:0; font-weight:650;'>Open Diké Leaderboard</h1>
            <p style='font-size:18px; margin:4px 0; color:#6c7a89;'>
              Bias and Fairness in Compressed LLMs
            </p>
          </div>
        </div>
        <p id='subtitle'
           style='font-size:14px; color:#8a9aad; margin-top:12px;
                  max-width:1000px; margin-left:auto; margin-right:auto;
                  line-height:1.6; text-align:justify;'>
          Inspired by the
          <a href='https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/'
             target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>
            🤗 Open LLM Leaderboard
          </a> and the
          <a href='https://huggingface.co/spaces/optimum/llm-perf-leaderboard'
             target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>
            Optimum Leaderboard 🏋️
          </a>, we compare the performance of compressed LLMs across
          <b>fairness</b>, <b>toxicity</b>, <b>ethics</b>, and <b>safety</b> benchmarks.
          The leaderboard is released as part of the
          <a href='https://www.anr-dike.fr/' target='_blank'
             style='color:#5a8dee; text-decoration:none; font-weight:500;'>⚖️ Diké Project</a>.
        </p>
        </div>"""
    )
    # ---------- TABS ----------
    with gr.Tabs():
        # TAB 1: Base LLMs
        with gr.TabItem("🟢 Base LLMs Evaluation"):
            with gr.Row():
                base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)

            def base_search_fn(q):
                # An empty query restores the full table; regex=False keeps
                # special characters (e.g. '(' or '+') from raising.
                if not q or not q.strip():
                    return base_df
                mask = base_df["Model"].str.contains(q, case=False, regex=False)
                return base_df[mask]

            base_table = gr.Dataframe(
                value=base_df,
                datatype=base_dtypes,
                interactive=False,
                sortable=True,
                elem_id="base-table",
            )
            base_search.submit(base_search_fn, base_search, base_table)
        # TAB 2: Instruction-tuned LLMs
        with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
            with gr.Row():
                inst_search = gr.Textbox(placeholder="🔍 Search instruction-tuned models...", show_label=False)

            def inst_search_fn(q):
                if not q or not q.strip():
                    return inst_df
                mask = inst_df["Model"].str.contains(q, case=False, regex=False)
                return inst_df[mask]

            inst_table = gr.Dataframe(
                value=inst_df,
                datatype=inst_dtypes,
                interactive=False,
                sortable=True,
                elem_id="inst-table",
            )
            inst_search.submit(inst_search_fn, inst_search, inst_table)
        # TAB 3: About
        with gr.TabItem("📘 About"):
            gr.HTML("""
            <div style='max-width:900px; margin:0 auto; text-align:justify; color:#4b5563; line-height:1.6;'>
              <h3 style='text-align:center;'>📊 Benchmarks and Metrics Overview</h3>
              <p>
                The Diké Leaderboard evaluates the impact of <b>quantization</b> and <b>compression</b>
                on the <b>bias, fairness, ethics, and safety</b> of large language models (LLMs).
                Each benchmark measures a specific social or ethical aspect of model behavior.
              </p>
              <ul style='list-style-type: " "; padding-left: 1em;'>
                <li><b>Perplexity (PPL)</b> - General measure of model fluency and language-modeling quality,
                  evaluated on <a href='https://huggingface.co/datasets/Salesforce/wikitext' target='_blank' style='color:#2563eb;'>WikiText-2</a>.
                  Lower values indicate better language modeling performance.</li>
                <li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via multiple-choice story completion.
                  Dataset: <a href='https://huggingface.co/datasets/Rowan/hellaswag' target='_blank' style='color:#2563eb;'>HellaSwag</a>.
                  Metric: Accuracy.</li>
                <li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated
                  question-answering contexts across 11 protected categories
                  (<a href='https://github.com/nyu-mll/BBQ' target='_blank' style='color:#2563eb;'>BBQ dataset</a>).
                  Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>
                <li><b>CrowS-Pairs</b> - Minimal pairs contrasting stereotyped and anti-stereotyped sentences
                  (<a href='https://huggingface.co/datasets/nyu-mll/crows_pairs' target='_blank' style='color:#2563eb;'>CrowS-Pairs dataset</a>).
                  Metric: % of pairs where the model prefers the stereotyped sentence.</li>
                <li><b>HolisticBias</b> - Sentiment prompts spanning 13 demographic axes
                  (<a href='https://huggingface.co/datasets/fairnlp/holistic-bias' target='_blank' style='color:#2563eb;'>HolisticBias dataset</a>).
                  Metric: Sentiment skew across identity descriptors.</li>
                <li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender, race, and disability
                  (<a href='https://huggingface.co/datasets/copenlu/sofa' target='_blank' style='color:#2563eb;'>SoFA dataset</a>).
                  Metric: Variance of log-perplexity across identity groups.</li>
                <li><b>StereoSet</b> - Triplets (stereotype, anti-stereotype, unrelated)
                  across gender, race, religion, and profession
                  (<a href='https://github.com/moinnadeem/StereoSet' target='_blank' style='color:#2563eb;'>StereoSet dataset</a>).
                  Metrics: Stereotype Score, Language Modeling Score.</li>
                <li><b>ETHICS</b> - Morality judgments across five ethical principles;
                  we use the <i>Commonsense Morality</i> subset
                  (<a href='https://huggingface.co/datasets/hendrycks/ethics' target='_blank' style='color:#2563eb;'>ETHICS dataset</a>).
                  Metric: Accuracy.</li>
                <li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection
                  (<a href='https://huggingface.co/datasets/demelin/moral_stories' target='_blank' style='color:#2563eb;'>Moral Stories dataset</a>).
                  Metrics: Moral preference accuracy, Refusal rate.</li>
                <li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics evaluation
                  (<a href='https://huggingface.co/datasets/LabHC/histoires_morales' target='_blank' style='color:#2563eb;'>Histoires Morales dataset</a>).
                  Metrics: Accuracy, Refusal rate.</li>
                <li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts
                  (<a href='https://huggingface.co/datasets/allenai/real-toxicity-prompts' target='_blank' style='color:#2563eb;'>RealToxicityPrompts</a>).
                  Metric: Average toxicity probability.</li>
                <li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or unethical prompts
                  (<a href='https://huggingface.co/datasets/walledai/HarmBench' target='_blank' style='color:#2563eb;'>HarmBench</a>).
                  Metric: Unsafe response rate.</li>
              </ul>
              <p style='margin-top:1.5em;'>
                All evaluations are implemented via the
                <a href='https://github.com/EleutherAI/lm-evaluation-harness'
                   target='_blank' style='color:#5a8dee;'>LM Evaluation Harness</a>
                and follow consistent zero-shot protocols.
              </p>
            </div>
            """)
    # ---------- footer notes ----------
    gr.HTML(
        """
        <div style='text-align:center; margin-top:30px; font-size:14px; color:#777;'>
            <b>Notes</b><br>
            • Click column headers to sort ascending/descending<br>
            • Model names are clickable links to Hugging Face pages<br><br>
            Part of the <a href='https://www.anr-dike.fr/' target='_blank' style='color:#5a8dee;'>⚖️ Diké Project</a>.
        </div>
        """
    )

demo.launch(server_name="0.0.0.0", server_port=7860)
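
# To serve the app locally (a sketch; assumes this file is saved as app.py,
# the conventional entry point for a Gradio Space):
#   pip install gradio pandas
#   python app.py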