more work
- data/history.csv +29 -1
- data/leaderboard.csv +7 -7
- data/models.jsonl +2 -2
- models.jsonl +2 -3
- src/app.py +137 -10
- src/data_manager.py +1 -1
- src/judge.py +29 -2
- src/ui.py +2 -2
data/history.csv
CHANGED
@@ -1 +1,29 @@
-timestamp,input,output,judge1_id,judge1_name,judge1_evaluation,judge2_id,judge2_name,judge2_evaluation,winner_id,user_ip
+timestamp,input,output,judge1_id,judge1_name,judge1_evaluation,judge2_id,judge2_name,judge2_evaluation,winner_id,user_ip,judge1_time,judge2_time
+2025-04-24T17:05:59.863028,"Does Neptune And Uranus Have Solid Ground? Credit: Universe Today Neptune and Uranus are gas giants and do not have solid ground. They are mostly made up of hydrogen, helium, and methane. It has been discovered that a mysterious dark storm has formed on Neptune. Hubble has photographed four other mysterious dark vortexes since 1993, the most recent of which was captured on Wednesday.","""solid ground"" can be found on Earth, Jupiter and Mars but not on Neptune and the Sun.",deepseek-v3,DeepSeek V3,"### Evaluation:
+
+1. **Factual Support**: 7/10
+- The text explicitly states that Neptune and Uranus are gas giants and do not have solid ground, which directly supports the claim regarding Neptune.
+- However, the text does not mention Earth, Jupiter, Mars, or the Sun, so there is no direct support for those parts of the claim.
+
+2. **Logical Connection**: 6/10
+- The logical connection is strong for Neptune, as the text clearly contradicts the idea of solid ground there.
+- For the other celestial bodies (Earth, Jupiter, Mars, Sun), the absence of information in the text means the claim cannot be logically validated or refuted based on this text alone.
+
+3. **Claim Accuracy**: 6/10
+- The claim is accurate regarding Neptune (no solid ground), which aligns with the text.
+- The claim about Earth, Jupiter, Mars, and the Sun may or may not be accurate, but the text does not provide evidence either way.
+
+### Summary:
+The claim is **partially supported** by the text. The assertion that Neptune lacks solid ground is correct and backed by the text. However, the text does not address the other celestial bodies mentioned (Earth, Jupiter, Mars, Sun), so those parts of the claim remain unverified in this context. A more complete evaluation would require additional sources discussing those planets and the Sun.",meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,"**Evaluation:**
+
+1. **Factual Support:** 8/10
+The text provides factual support for the claim that Neptune and Uranus do not have solid ground. It states that they are gas giants and are mostly made up of hydrogen, helium, and methane. However, the text does not explicitly mention the Sun, so it does not provide factual support for the claim regarding the Sun.
+
+2. **Logical Connection:** 9/10
+The text provides a logical connection to the claim by stating that Neptune and Uranus are gas giants, which implies that they do not have solid ground. The mention of other planets like Earth, Jupiter, and Mars having solid ground also provides a logical connection to the claim.
+
+3. **Claim Accuracy:** 7/10
+The claim is partially accurate. It is correct that Neptune and Uranus do not have solid ground. However, the claim is not accurate regarding the Sun, as the text does not provide any information about the Sun's composition or the presence of solid ground.
+
+**Summary:**
+The claim is partially supported by the text, but it is not entirely accurate. The text provides factual support for the claim regarding Neptune and Uranus, but it does not provide factual support for the claim regarding the Sun. The logical connection between the text and the claim is strong, but the claim's accuracy is limited by the omission of information about the Sun.",deepseek-v3,,23.25821089744568,2.207097053527832
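The history file now carries two extra columns, judge1_time and judge2_time, holding per-judge latency in seconds. A minimal sketch of reading the extended schema back for analysis, assuming only pandas and the data/history.csv path shown above:

```python
import pandas as pd

# Read the arena history written by save_to_history in src/app.py.
history = pd.read_csv("data/history.csv")

# The new columns record how long each judge took, in seconds.
print(history[["judge1_name", "judge1_time", "judge2_name", "judge2_time", "winner_id"]])

# Example: average latency per judge slot across all recorded matchups.
print(history[["judge1_time", "judge2_time"]].mean())
```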
data/leaderboard.csv
CHANGED
@@ -3,29 +3,29 @@ gemma-2-27b-it,Gemma 2 27B,1749.8091372785384,25.0,0.0,25.0,Google,Open Source
 claude-3-opus-latest,Claude 3 Opus,1531.9661669788793,2.0,0.0,2.0,Anthropic,Proprietary
 mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
 qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
-meta-llama-3.
+meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1516.0,1.0,0.0,1.0,Meta,Open Source
+claude-3-sonnet-20240229,Claude 3 Sonnet,1515.263693206478,1.0,0.0,1.0,Anthropic,Proprietary
+gpt-4.1,GPT-4.1,1502.1692789932397,1.0,1.0,2.0,OpenAI,Proprietary
 claude-3-haiku-20240307,Claude 3 Haiku,1501.6053648908744,3.0,3.0,6.0,Anthropic,Proprietary
 gpt-4-turbo,GPT-4 Turbo,1500.736306793522,1.0,1.0,2.0,OpenAI,Proprietary
-deepseek-v3,DeepSeek V3,1500.
+deepseek-v3,DeepSeek V3,1500.6365920113653,1.0,1.0,2.0,DeepSeek,Open Source
 judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
 o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
-meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
 meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
 judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
 judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
-judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
 deepseek-r1,DeepSeek R1,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
-claude-3-sonnet-20240229,Claude 3 Sonnet,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
 claude-3-5-haiku-latest,Claude 3.5 Haiku,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
 atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
 qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
 gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
 judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
+judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
 meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1499.263693206478,1.0,1.0,2.0,Meta,Open Source
 meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1499.2598341210926,2.0,2.0,4.0,Meta,Open Source
 mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1487.3632548757455,0.0,2.0,2.0,Mistral AI,Open Source
-gpt-4.1,GPT-4.1,1484.7701398146428,0.0,1.0,1.0,OpenAI,Proprietary
 claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1484.0,0.0,1.0,1.0,Anthropic,Proprietary
-
+meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1481.194128995395,1.0,2.0,3.0,Meta,Open Source
+gpt-4o,GPT-4o,1468.736306793522,0.0,2.0,2.0,OpenAI,Proprietary
 qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1412.6552679185854,21.0,25.0,46.0,Alibaba,Open Source
 gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
data/models.jsonl
CHANGED
@@ -26,5 +26,5 @@
 {"id": "qwen-2.5-7b-instruct-turbo", "name": "Qwen 2.5 7B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "provider": "together", "parameters": "7B"}
 
 {"id": "atla-selene", "name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene", "provider": "together", "parameters": "N/A"}
-{"id": "deepseek-v3", "name": "DeepSeek V3", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-
-{"id": "deepseek-r1", "name": "DeepSeek R1", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-
+{"id": "deepseek-v3", "name": "DeepSeek V3", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-ai/DeepSeek-V3", "provider": "together", "parameters": "671B"}
+{"id": "deepseek-r1", "name": "DeepSeek R1", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-ai/DeepSeek-R1", "provider": "together", "parameters": "671B"}
models.jsonl
CHANGED
@@ -25,6 +25,5 @@
 {"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
 {"id": "qwen-2.5-7b-instruct-turbo", "name": "Qwen 2.5 7B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "provider": "together", "parameters": "7B"}
 
-{"id": "
-{"id": "deepseek-
-{"id": "deepseek-r1", "name": "DeepSeek R1", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-r1", "provider": "together", "parameters": "671B"}
+{"id": "deepseek-v3", "name": "DeepSeek V3", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-ai/DeepSeek-V3", "provider": "together", "parameters": "671B"}
+{"id": "deepseek-r1", "name": "DeepSeek R1", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-ai/DeepSeek-R1", "provider": "together", "parameters": "671B"}
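Both copies of the model registry (data/models.jsonl and the root models.jsonl) now point the DeepSeek entries at fully qualified Together model names. A minimal sketch of reading the JSONL registry, using plain json parsing rather than the app's own loader in src/data_manager.py:

```python
import json

# Parse one JSON object per non-empty line (JSONL).
with open("data/models.jsonl") as f:
    models = [json.loads(line) for line in f if line.strip()]

# After this change the DeepSeek rows resolve to e.g. "deepseek-ai/DeepSeek-V3".
for model in models:
    if model["id"].startswith("deepseek"):
        print(model["id"], "->", model["api_model"], model["parameters"])
```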
src/app.py
CHANGED
@@ -1,8 +1,11 @@
+import datetime
 from typing import Any, Dict, Optional, Tuple
 
 import gradio as gr
+import pandas as pd
 from loguru import logger
 
+from src.config import HISTORY_PATH
 from src.data_manager import get_random_example, load_models
 from src.judge import JudgeManager
 from src.ui import UI
@@ -221,8 +224,11 @@ def get_evaluation1(
         )
         logger.info("Completed evaluation 1")
 
+        # Format evaluation with time
+        display_eval = f"Evaluation time: {eval1['elapsed_time']:.2f} seconds\n\n{eval1['display_evaluation']}\n\n"
+
         # Make the selection button visible once the evaluation is ready
-        return
+        return display_eval, gr.update(visible=True)
     except Exception as e:
         logger.error(f"Error getting evaluation 1: {e}")
         return f"Error: {str(e)}", gr.update(visible=False)
@@ -261,9 +267,12 @@ def get_evaluation2(
         )
         logger.info("Completed evaluation 2")
 
+        # Format evaluation with time
+        display_eval = f"Evaluation time: {eval2['elapsed_time']:.2f} seconds\n\n{eval2['display_evaluation']}\n"
+
         # Make the selection button visible once the evaluation is ready and show additional buttons
         return (
-
+            display_eval,
             gr.update(visible=True),
             gr.update(visible=True),
         )
@@ -299,6 +308,54 @@ def format_inputs_for_evaluation(
     return input_text, output_text
 
 
+def save_to_history(
+    input_text: str,
+    output_text: str,
+    judge1_id: str,
+    judge1_name: str,
+    judge1_evaluation: str,
+    judge1_time: float,
+    judge2_id: str,
+    judge2_name: str,
+    judge2_evaluation: str,
+    judge2_time: float,
+    winner_id: str,
+) -> None:
+    """Save the evaluation results to history CSV file."""
+    try:
+        # Create a new row for the history
+        history_row = {
+            "timestamp": datetime.datetime.now().isoformat(),
+            "input": input_text,
+            "output": output_text,
+            "judge1_id": judge1_id,
+            "judge1_name": judge1_name,
+            "judge1_evaluation": judge1_evaluation,
+            "judge1_time": judge1_time,
+            "judge2_id": judge2_id,
+            "judge2_name": judge2_name,
+            "judge2_evaluation": judge2_evaluation,
+            "judge2_time": judge2_time,
+            "winner_id": winner_id,
+        }
+
+        # Try to load existing history
+        try:
+            history_df = pd.read_csv(HISTORY_PATH)
+        except (FileNotFoundError, pd.errors.EmptyDataError):
+            # Create a new history dataframe if file doesn't exist or is empty
+            history_df = pd.DataFrame(columns=list(history_row.keys()))
+
+        # Append the new row
+        history_df = pd.concat([history_df, pd.DataFrame([history_row])], ignore_index=True)
+
+        # Save to CSV
+        history_df.to_csv(HISTORY_PATH, index=False)
+        logger.info("Saved evaluation to history")
+    except Exception as e:
+        logger.error(f"Error saving to history: {e}")
+
+
 def select_winner(choice: str, judge_manager: JudgeManager) -> str:
     """Select a winner from the evaluations."""
     global eval1, eval2, current_test_type
@@ -307,12 +364,20 @@ def select_winner(choice: str, judge_manager: JudgeManager) -> str:
         if not eval1 or not eval2:
             return "Error: No evaluations available"
 
+        # Get the input and output text that was evaluated
+        input_text, output_text = "", ""
+        if "input_text" in eval1 and "output_text" in eval1:
+            input_text = eval1.get("input_text", "")
+            output_text = eval1.get("output_text", "")
+
         if choice == "Evaluation 1":
             winner_eval = eval1
             loser_eval = eval2
+            winner_id = eval1["judge"]["id"]
         else:
            winner_eval = eval2
            loser_eval = eval1
+            winner_id = eval2["judge"]["id"]
 
         # Update leaderboard
         updated_board = judge_manager.update_leaderboard(
@@ -321,13 +386,29 @@ def select_winner(choice: str, judge_manager: JudgeManager) -> str:
             result_type="win",
         )
 
+        # Save to history
+        save_to_history(
+            input_text=input_text,
+            output_text=output_text,
+            judge1_id=eval1["judge"]["id"],
+            judge1_name=eval1["judge"]["name"],
+            judge1_evaluation=eval1["anonymous_evaluation"],
+            judge1_time=eval1["elapsed_time"],
+            judge2_id=eval2["judge"]["id"],
+            judge2_name=eval2["judge"]["name"],
+            judge2_evaluation=eval2["anonymous_evaluation"],
+            judge2_time=eval2["elapsed_time"],
+            winner_id=winner_id,
+        )
+
         # Construct result message with revealed judges' names
         result_message = f"You selected: {choice}\n\n"
-        result_message += f"Evaluation 1 was by: {eval1['judge']['name']}\n"
-        result_message +=
+        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
+        result_message += (
+            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
+        )
 
         # Get the winner's new ELO score
-        winner_id = winner_eval["judge"]["id"]
         winner_mask = updated_board["judge_id"] == winner_id
         winner_elo = updated_board[winner_mask]["elo_score"].values[0]
 
@@ -349,6 +430,12 @@ def handle_both_correct(judge_manager: JudgeManager) -> str:
         if not eval1 or not eval2:
             return "Error: No evaluations available"
 
+        # Get the input and output text that was evaluated
+        input_text, output_text = "", ""
+        if "input_text" in eval1 and "output_text" in eval1:
+            input_text = eval1.get("input_text", "")
+            output_text = eval1.get("output_text", "")
+
         # Update leaderboard for both judges
         updated_board = judge_manager.update_leaderboard(
             eval1["judge"]["id"],
@@ -356,10 +443,27 @@ def handle_both_correct(judge_manager: JudgeManager) -> str:
             result_type="both_correct",
         )
 
+        # Save to history with both as winners
+        save_to_history(
+            input_text=input_text,
+            output_text=output_text,
+            judge1_id=eval1["judge"]["id"],
+            judge1_name=eval1["judge"]["name"],
+            judge1_evaluation=eval1["anonymous_evaluation"],
+            judge1_time=eval1["elapsed_time"],
+            judge2_id=eval2["judge"]["id"],
+            judge2_name=eval2["judge"]["name"],
+            judge2_evaluation=eval2["anonymous_evaluation"],
+            judge2_time=eval2["elapsed_time"],
+            winner_id="both",
+        )
+
         # Construct result message with revealed judges' names
         result_message = "You selected: Both Correct\n\n"
-        result_message += f"Evaluation 1 was by: {eval1['judge']['name']}\n"
-        result_message +=
+        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
+        result_message += (
+            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
+        )
 
         # Get the new ELO scores
         judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
@@ -371,7 +475,7 @@ def handle_both_correct(judge_manager: JudgeManager) -> str:
         result_message += "\nBoth judges performed well!\n"
         result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
         result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
-        result_message += "Test Type: {current_test_type}\n"
+        result_message += f"Test Type: {current_test_type}\n"
 
         return result_message
     except Exception as e:
@@ -387,6 +491,12 @@ def handle_both_incorrect(judge_manager: JudgeManager) -> str:
         if not eval1 or not eval2:
             return "Error: No evaluations available"
 
+        # Get the input and output text that was evaluated
+        input_text, output_text = "", ""
+        if "input_text" in eval1 and "output_text" in eval1:
+            input_text = eval1.get("input_text", "")
+            output_text = eval1.get("output_text", "")
+
         # Update leaderboard for both judges
         updated_board = judge_manager.update_leaderboard(
             eval1["judge"]["id"],
@@ -394,10 +504,27 @@ def handle_both_incorrect(judge_manager: JudgeManager) -> str:
             result_type="both_incorrect",
         )
 
+        # Save to history with neither as winner
+        save_to_history(
+            input_text=input_text,
+            output_text=output_text,
+            judge1_id=eval1["judge"]["id"],
+            judge1_name=eval1["judge"]["name"],
+            judge1_evaluation=eval1["anonymous_evaluation"],
+            judge1_time=eval1["elapsed_time"],
+            judge2_id=eval2["judge"]["id"],
+            judge2_name=eval2["judge"]["name"],
+            judge2_evaluation=eval2["anonymous_evaluation"],
+            judge2_time=eval2["elapsed_time"],
+            winner_id="none",
+        )
+
         # Construct result message with revealed judges' names
         result_message = "You selected: Both Incorrect\n\n"
-        result_message += f"Evaluation 1 was by: {eval1['judge']['name']}\n"
-        result_message +=
+        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
+        result_message += (
+            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
+        )
 
         # Get the new ELO scores
         judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
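The new save_to_history helper appends one row per vote using a read-concat-rewrite cycle on the CSV. A standalone sketch of that pattern, writing to a throwaway history_demo.csv instead of src.config.HISTORY_PATH and using made-up row values:

```python
import datetime

import pandas as pd

HISTORY_PATH = "history_demo.csv"  # placeholder; the app uses src.config.HISTORY_PATH

row = {
    "timestamp": datetime.datetime.now().isoformat(),
    "input": "example input",
    "output": "example output",
    "judge1_id": "judge1",
    "judge1_name": "EvalGPT",
    "judge1_evaluation": "Quality: 7/10",
    "judge1_time": 1.23,
    "judge2_id": "judge2",
    "judge2_name": "CritiqueBot",
    "judge2_evaluation": "Quality: 8/10",
    "judge2_time": 2.34,
    "winner_id": "judge2",
}

# Load the existing history, or start an empty frame with the same columns.
try:
    history = pd.read_csv(HISTORY_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
    history = pd.DataFrame(columns=list(row.keys()))

# Append the new row and rewrite the whole file, as save_to_history does.
history = pd.concat([history, pd.DataFrame([row])], ignore_index=True)
history.to_csv(HISTORY_PATH, index=False)
```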
src/data_manager.py
CHANGED
@@ -127,7 +127,7 @@ def get_random_example(test_type: str) -> Dict[str, str]:
 
     # Map fields based on test type
     if test_type == "grounding":
-        result["text"] = example.get("
+        result["text"] = example.get("doc", "")
         result["claim"] = example.get("claim", "")
     elif test_type in ["prompt injections", "safety"]:
         result["text"] = example.get("text", "")
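For the grounding test type the example's source document now comes from its "doc" field. A tiny illustration of the mapping, with a made-up example record:

```python
# Hypothetical grounding record; only the "doc" and "claim" keys come from the diff above.
example = {
    "doc": "Neptune and Uranus are gas giants and do not have solid ground.",
    "claim": "Neptune has solid ground.",
}

result = {}
# Same field mapping as get_random_example for test_type == "grounding".
result["text"] = example.get("doc", "")
result["claim"] = example.get("claim", "")
print(result)
```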
src/judge.py
CHANGED
@@ -1,6 +1,9 @@
 import random
+import time
 from typing import Any, Dict, List, Tuple
 
+# Add litellm configuration to handle unsupported parameters
+import litellm
 import pandas as pd
 from litellm import completion
 from loguru import logger
@@ -8,6 +11,8 @@ from together import Together
 
 from src.config import K_FACTOR, LEADERBOARD_PATH
 
+litellm.drop_params = True
+
 
 class JudgeManager:
     """Manages judge evaluations and judge data"""
@@ -79,18 +84,28 @@ class JudgeManager:
     ) -> Dict[str, Any]:
         """Get an evaluation from a judge"""
         try:
+            # Start timing
+            start_time = time.time()
+
             # Create appropriate system prompt based on test type
             system_prompt = self._get_system_prompt(test_type)
 
             # Format user message with input and output
             user_message = self._create_user_message(input_text, output_text, test_type)
 
+            # Set temperature based on model
+            temperature = 0.2
+            # O-series models only support temperature=1
+            if judge["provider"].lower() == "openai" and "o3" in judge["api_model"]:
+                temperature = 1.0
+                logger.info(f"Using temperature=1.0 for O-series model {judge['api_model']}")
+
             # Get evaluation from the API
             if judge["provider"].lower() in ["openai", "anthropic"]:
                 api_response = completion(
                     model=judge["api_model"],
                     messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
-                    temperature=
+                    temperature=temperature,
                     max_tokens=500,
                 )
                 evaluation = api_response.choices[0].message.content
@@ -98,7 +113,7 @@ class JudgeManager:
                 api_response = self.together_client.chat.completions.create(
                     model=judge["api_model"],
                     messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
-                    temperature=
+                    temperature=temperature,
                     max_tokens=500,
                 )
                 evaluation = api_response.choices[0].message.content
@@ -112,18 +127,27 @@ class JudgeManager:
             # Store the full evaluation with judge name for revealing later
             full_eval = f"Evaluation by {judge['name']} (ID: {judge['id']}):\n\n" f"{evaluation}"
 
+            # Calculate elapsed time
+            elapsed_time = time.time() - start_time
+
             return {
                 "judge": judge,
                 "evaluation": full_eval,
                 "display_evaluation": anonymous_eval,
                 "anonymous_evaluation": anonymous_eval,
                 "revealed_evaluation": full_eval,
+                "elapsed_time": elapsed_time,
+                "input_text": input_text,
+                "output_text": output_text,
             }
 
         except Exception as e:
             # Handle API errors gracefully
             logger.error(f"Error getting evaluation from {judge['name']}: {str(e)}")
 
+            # End timing for error case
+            elapsed_time = time.time() - start_time
+
             # Create a fallback evaluation
             metrics = ["Quality: 7/10", "Relevance: 8/10", "Precision: 7/10"]
             comment = f"[Fallback evaluation due to API error: {str(e)}]"
@@ -139,6 +163,9 @@ class JudgeManager:
                 "anonymous_evaluation": anonymous_eval,
                 "revealed_evaluation": full_eval,
                 "error": str(e),
+                "elapsed_time": elapsed_time,
+                "input_text": input_text,
+                "output_text": output_text,
             }
 
     def _create_user_message(self, input_text: str, output_text: str, test_type: str) -> str:
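The judge change wraps each API call in a simple start/stop timer and returns the elapsed seconds alongside the evaluation (litellm.drop_params = True additionally tells litellm to drop request parameters a provider does not support rather than raising). A standalone sketch of the timing pattern, with a stand-in function in place of the real API request:

```python
import time


def fake_judge_call() -> str:
    """Stand-in for the real judge API request; sleeps briefly to simulate latency."""
    time.sleep(0.1)
    return "Quality: 7/10"


# Record a start time, run the call, and attach the elapsed seconds to the
# result, mirroring the elapsed_time field added in src/judge.py.
start_time = time.time()
evaluation = fake_judge_call()
elapsed_time = time.time() - start_time

result = {"evaluation": evaluation, "elapsed_time": elapsed_time}
print(f"Evaluation time: {result['elapsed_time']:.2f} seconds")
```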
src/ui.py
CHANGED
@@ -99,7 +99,7 @@ class UI:
                 label="Test Type",
                 info="Select the type of test to evaluate",
             )
-            refresh_button = gr.Button("
+            refresh_button = gr.Button("Load from a dataset")
 
             # Create different input layouts based on test type
             with gr.Row():
@@ -120,7 +120,7 @@ class UI:
             input_text = gr.Textbox(label="Input", lines=4, visible=False)
             output_text = gr.Textbox(label="Output", lines=6, visible=False)
 
-            submit_button = gr.Button("
+            submit_button = gr.Button("Evaluate")
             status_message = gr.Markdown(visible=False)
 
             with gr.Row():