update results

- app.py +12 -12
- src/display/about.py +24 -9
app.py
CHANGED

@@ -45,7 +45,7 @@ show_columns_overall = ['R', 'Model', 'type', 'open?','#P(B)', 'SeaExam-pub', 'S
 TYPES_overall = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number']

 # Load the data from the csv file
-csv_path = f'{EVAL_RESULTS_PATH}/
+csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20241122.csv'
 # csv_path = f'eval-results/SeaExam_results_20241030.csv'
 df = pd.read_csv(csv_path, skiprows=1, header=0)
 # df_m3exam, df_mmlu, df_avg = load_data(csv_path)
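For readers following the data flow in this hunk, here is a minimal, hypothetical sketch of what the `load_data` helper called in `app.py` could look like. The real implementation is not part of this diff: the `skiprows=1` read mirrors the line above, while the column-selection logic and every column name other than `Model` and the `SeaExam-*`/`SeaBench-*` prefixes are assumptions.

```python
# Hypothetical sketch of a load_data helper like the one app.py calls.
# The skiprows=1 read mirrors app.py; the column selection below is an assumption.
import pandas as pd

def load_data(csv_path: str):
    # The skipped first row presumably carries an extra grouping/banner header.
    df = pd.read_csv(csv_path, skiprows=1, header=0)
    seaexam_cols = [c for c in df.columns if c.startswith("SeaExam")]
    seabench_cols = [c for c in df.columns if c.startswith("SeaBench")]
    df_seaexam = df[["Model"] + seaexam_cols].copy()
    df_seabench = df[["Model"] + seabench_cols].copy()
    df_overall = df.copy()  # the overall tab keeps every column
    return df_seaexam, df_seabench, df_overall

# Usage, as in app.py:
# df_seaexam, df_seabench, df_overall = load_data(csv_path)
```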
@@ -54,7 +54,7 @@ df_seaexam, df_seabench, df_overall = load_data(csv_path)
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
-    gr.HTML(SUB_TITLE)
+    # gr.HTML(SUB_TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
@@ -125,18 +125,18 @@ with demo:

         with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-
-
-
-
-
-
-
-
+            with gr.Row():
+                with gr.Accordion("π Citation", open=False):
+                    citation_button = gr.Textbox(
+                        value=CITATION_BUTTON_TEXT,
+                        label=CITATION_BUTTON_LABEL,
+                        lines=20,
+                        elem_id="citation-button",
+                        show_copy_button=True,
+                    )
             gr.Markdown(CONTACT_TEXT, elem_classes="markdown-text")

-demo.launch()
+demo.launch(share=True)

 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
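The citation block added in this hunk follows a common Gradio pattern: a collapsed accordion holding a read-only textbox with a copy button. A self-contained sketch is below; the label and BibTeX string are placeholders rather than the Space's actual constants.

```python
# Minimal, self-contained sketch of the citation accordion pattern added above.
# The label and BibTeX text are placeholders, not the Space's real constants.
import gradio as gr

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{placeholder2024, title={...}, year={2024}}"""

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Accordion("Citation", open=False):  # collapsed by default
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,  # renders a copy-to-clipboard icon
            )

demo.launch()
```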
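`restart_space` is referenced at the bottom of `app.py` but not defined in this diff. On Hugging Face Spaces it is typically a small wrapper around `HfApi.restart_space`, scheduled with APScheduler as sketched below; the repo id and token handling here are assumptions, not values from this Space.

```python
# Hedged sketch of the restart_space job scheduled at the end of app.py.
# The repo id and the token lookup are placeholders.
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

REPO_ID = "your-org/your-space"  # placeholder Space id
API = HfApi(token=os.environ.get("HF_TOKEN"))

def restart_space():
    # Restarting the Space makes it reload freshly uploaded result files.
    API.restart_space(repo_id=REPO_ID)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()
```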
src/display/about.py
CHANGED

@@ -16,10 +16,11 @@ class Tasks(Enum):


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">π SeaExam and SeaBench Leaderboard</h1>"""
+# TITLE = """<h1 align="center" id="space-title">π SeaExam and SeaBench Leaderboard</h1>"""
+TITLE = """<h1 align="left" id="space-title">π LLM Leaderboard for SEA</h1>"""

 # subtitle
-SUB_TITLE = """<h2 align="
+SUB_TITLE = """<h2 align="left" id="space-title">What is the best LLM for Southeast Asian Languagesβ</h1>"""

 # What does your leaderboard evaluate?
 # INTRODUCTION_TEXT = """
@@ -34,6 +35,14 @@ INTRODUCTION_TEXT = """
 This leaderboard evaluates Large Language Models (LLMs) on Southeast Asian (SEA) languages through two comprehensive benchmarks: SeaExam and SeaBench. SeaExam assesses world knowledge and reasoning capabilities through exam-style questions, while SeaBench evaluates instruction-following abilities and multi-turn conversational skills. For detailed methodology and results, please refer to the "π About" tab.
 """

+INTRODUCTION_TEXT = """
+This leaderboard evaluates Large Language Models (LLMs) on Southeast Asian (SEA) languages through two comprehensive benchmarks: SeaExam and SeaBench:
+* SeaExam assesses world knowledge and reasoning capabilities through exam-style questions [[data (public)](https://huggingface.co/datasets/SeaLLMs/SeaExam)] [[code](https://github.com/DAMO-NLP-SG/SeaExam)]
+* SeaBench evaluates instruction-following abilities and multi-turn conversational skills. [[data (public)](https://huggingface.co/datasets/SeaLLMs/SeaBench)] [[code](https://github.com/DAMO-NLP-SG/SeaBench?tab=readme-ov-file)]
+
+Note: "pub" denotes public dataset, and "prv" denotes private dataset.
+For more details, please refer to the "π About" tab.
+"""
 # For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "π About" tab.

 # Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
@@ -46,7 +55,7 @@ Even though large language models (LLMs) have shown impressive performance on va


 ## Datasets
-The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam) and SeaBench dataset
+The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam) and [SeaBench dataset](https://huggingface.co/datasets/SeaLLMs/SeaBench).
 - **SeaExam**: a benchmark sourced from real and official human exam questions in multiple-choice format.
 - **SeaBench**: a manually created benchmark for evaluating the model's ability to follow instructions and engage in multi-turn conversations. The questions are in open-ended format.

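Both benchmark datasets linked in this hunk are hosted on the Hugging Face Hub, so the public portions can be pulled with the `datasets` library. A hedged loading sketch follows; split and config names are not specified in this diff, so check the dataset cards.

```python
# Hedged sketch: loading the public benchmark data referenced above.
# Split/config names are not given in this diff; consult the dataset cards.
from datasets import load_dataset

seaexam = load_dataset("SeaLLMs/SeaExam")    # public SeaExam questions
seabench = load_dataset("SeaLLMs/SeaBench")  # public SeaBench questions
print(seaexam)   # inspect the available splits and columns
print(seabench)
```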
@@ -59,7 +68,7 @@ The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/
 - **SeaBench**:
 We evaluate the responses of the models with GPT-4o-2024-08-06. Each response is scored on a scale of 1-10.

-## 
+## Results
 How to interpret the leaderboard?
 * Each numerical value represents the accuracy (%) for SeaExam and the score for SeaBench.
 * The "π Overall" shows the average results across the three languages for the SeaExam public dataset (SeaExam-pub), SeaExam private dataset (SeaExam-prv), SeaBench public dataset (SeaBench-pub), and SeaBench private dataset (SeaBench-prv). This leaderboard is ranked by SeaExam-prv.
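The hunk above states that each "Overall" number is the average over the three languages and that the table is ranked by SeaExam-prv. A toy pandas sketch of that aggregation is below; the per-language column names and the sample scores are placeholders, not real leaderboard data.

```python
# Toy sketch of the aggregation described above: average per-language scores,
# then rank by SeaExam-prv. Column names and numbers are placeholders only.
import pandas as pd

per_lang = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "SeaExam-prv-lang1": [61.2, 55.0],
        "SeaExam-prv-lang2": [58.4, 52.1],
        "SeaExam-prv-lang3": [63.0, 57.9],
    }
)

lang_cols = ["SeaExam-prv-lang1", "SeaExam-prv-lang2", "SeaExam-prv-lang3"]
per_lang["SeaExam-prv"] = per_lang[lang_cols].mean(axis=1).round(2)

# Rank the leaderboard by the SeaExam-prv average, best first.
overall = per_lang[["Model", "SeaExam-prv"]].sort_values("SeaExam-prv", ascending=False)
print(overall)
```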
@@ -69,13 +78,13 @@ How to interpret the leaderboard?
 * "open?" column indicates whether the model is open-source or proprietary.

 ## Reproducibility
-To reproduce our results, use the script in [
-```python
-python scripts/main.py --model $model_name_or_path
-```
-
+To reproduce our results, use the script in [SeaExam](https://github.com/DAMO-NLP-SG/SeaExam/tree/main) and [SeaBench](https://github.com/DAMO-NLP-SG/SeaBench). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
 """

+# ```python
+# python scripts/main.py --model $model_name_or_path
+# ```
+
 # You can find the detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/SeaLLMs/SeaExam-results

 EVALUATION_QUEUE_TEXT = """
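The reproducibility note added in this hunk says the repo script downloads the model and tokenizer and then runs the evaluation. The snippet below sketches only that first step with `transformers`; it is not the SeaExam/SeaBench script itself, and the model name and question text are placeholders.

```python
# Hedged sketch of the "download the model and tokenizer" step mentioned above.
# This is not the SeaExam/SeaBench evaluation script; see the linked repositories.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "your-org/your-model"  # placeholder, the repo script takes this via --model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto")

# Generate an answer for one illustrative benchmark-style question.
question = "Example multiple-choice question goes here."
inputs = tokenizer(question, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```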
@@ -110,6 +119,12 @@ If everything is done, check you can launch the EleutherAIHarness on your model

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@article{damonlp2024sealeaderboard,
+  author = {Chaoqun Liu, Wenxuan Zhang, Jiahao Ying, Mahani Aljunied, Anh Tuan Luu, Lidong Bing},
+  title = {SeaExam and SeaBench: Benchmarking LLMs with Local Multilingual Questions in Southeast Asia},
+  year = {2024},
+  url = {},
+}
 """

 CONTACT_TEXT = f"""