dror44 committed on
Commit 3df66f9 · 1 Parent(s): 94407ab
Files changed (8)
  1. data/history.csv +29 -1
  2. data/leaderboard.csv +7 -7
  3. data/models.jsonl +2 -2
  4. models.jsonl +2 -3
  5. src/app.py +137 -10
  6. src/data_manager.py +1 -1
  7. src/judge.py +29 -2
  8. src/ui.py +2 -2
data/history.csv CHANGED
@@ -1 +1,29 @@
1
- timestamp,input,output,judge1_id,judge1_name,judge1_evaluation,judge2_id,judge2_name,judge2_evaluation,winner_id,user_ip
 
1
+ timestamp,input,output,judge1_id,judge1_name,judge1_evaluation,judge2_id,judge2_name,judge2_evaluation,winner_id,user_ip,judge1_time,judge2_time
2
+ 2025-04-24T17:05:59.863028,"Does Neptune And Uranus Have Solid Ground? Credit: Universe Today Neptune and Uranus are gas giants and do not have solid ground. They are mostly made up of hydrogen, helium, and methane. It has been discovered that a mysterious dark storm has formed on Neptune. Hubble has photographed four other mysterious dark vortexes since 1993, the most recent of which was captured on Wednesday.","""solid ground"" can be found on Earth, Jupiter and Mars but not on Neptune and the Sun.",deepseek-v3,DeepSeek V3,"### Evaluation:
3
+
4
+ 1. **Factual Support**: 7/10
5
+ - The text explicitly states that Neptune and Uranus are gas giants and do not have solid ground, which directly supports the claim regarding Neptune.
6
+ - However, the text does not mention Earth, Jupiter, Mars, or the Sun, so there is no direct support for those parts of the claim.
7
+
8
+ 2. **Logical Connection**: 6/10
9
+ - The logical connection is strong for Neptune, as the text clearly contradicts the idea of solid ground there.
10
+ - For the other celestial bodies (Earth, Jupiter, Mars, Sun), the absence of information in the text means the claim cannot be logically validated or refuted based on this text alone.
11
+
12
+ 3. **Claim Accuracy**: 6/10
13
+ - The claim is accurate regarding Neptune (no solid ground), which aligns with the text.
14
+ - The claim about Earth, Jupiter, Mars, and the Sun may or may not be accurate, but the text does not provide evidence either way.
15
+
16
+ ### Summary:
17
+ The claim is **partially supported** by the text. The assertion that Neptune lacks solid ground is correct and backed by the text. However, the text does not address the other celestial bodies mentioned (Earth, Jupiter, Mars, Sun), so those parts of the claim remain unverified in this context. A more complete evaluation would require additional sources discussing those planets and the Sun.",meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,"**Evaluation:**
18
+
19
+ 1. **Factual Support:** 8/10
20
+ The text provides factual support for the claim that Neptune and Uranus do not have solid ground. It states that they are gas giants and are mostly made up of hydrogen, helium, and methane. However, the text does not explicitly mention the Sun, so it does not provide factual support for the claim regarding the Sun.
21
+
22
+ 2. **Logical Connection:** 9/10
23
+ The text provides a logical connection to the claim by stating that Neptune and Uranus are gas giants, which implies that they do not have solid ground. The mention of other planets like Earth, Jupiter, and Mars having solid ground also provides a logical connection to the claim.
24
+
25
+ 3. **Claim Accuracy:** 7/10
26
+ The claim is partially accurate. It is correct that Neptune and Uranus do not have solid ground. However, the claim is not accurate regarding the Sun, as the text does not provide any information about the Sun's composition or the presence of solid ground.
27
+
28
+ **Summary:**
29
+ The claim is partially supported by the text, but it is not entirely accurate. The text provides factual support for the claim regarding Neptune and Uranus, but it does not provide factual support for the claim regarding the Sun. The logical connection between the text and the claim is strong, but the claim's accuracy is limited by the omission of information about the Sun.",deepseek-v3,,23.25821089744568,2.207097053527832
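The two new judge1_time and judge2_time columns record per-evaluation latency in seconds. A minimal sketch of how they could be read back, assuming pandas and the column layout shown above:

# Sketch only: load data/history.csv and summarise the new latency columns.
import pandas as pd

history = pd.read_csv("data/history.csv")
print(history[["judge1_time", "judge2_time"]].mean())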
data/leaderboard.csv CHANGED
@@ -3,29 +3,29 @@ gemma-2-27b-it,Gemma 2 27B,1749.8091372785384,25.0,0.0,25.0,Google,Open Source
3
  claude-3-opus-latest,Claude 3 Opus,1531.9661669788793,2.0,0.0,2.0,Anthropic,Proprietary
4
  mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
5
  qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
6
- meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1515.2298601853572,1.0,0.0,1.0,Meta,Open Source
7
  claude-3-haiku-20240307,Claude 3 Haiku,1501.6053648908744,3.0,3.0,6.0,Anthropic,Proprietary
8
  gpt-4-turbo,GPT-4 Turbo,1500.736306793522,1.0,1.0,2.0,OpenAI,Proprietary
9
- deepseek-v3,DeepSeek V3,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
10
  judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
11
  o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
12
- meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
13
  meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
14
  judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
15
  judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
16
- judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
17
  deepseek-r1,DeepSeek R1,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
18
- claude-3-sonnet-20240229,Claude 3 Sonnet,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
19
  claude-3-5-haiku-latest,Claude 3.5 Haiku,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
20
  atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
21
  qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
22
  gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
23
  judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
24
  meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1499.263693206478,1.0,1.0,2.0,Meta,Open Source
25
  meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1499.2598341210926,2.0,2.0,4.0,Meta,Open Source
26
  mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1487.3632548757455,0.0,2.0,2.0,Mistral AI,Open Source
27
- gpt-4.1,GPT-4.1,1484.7701398146428,0.0,1.0,1.0,OpenAI,Proprietary
28
  claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1484.0,0.0,1.0,1.0,Anthropic,Proprietary
29
- gpt-4o,GPT-4o,1484.0,0.0,1.0,1.0,OpenAI,Proprietary
30
  qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1412.6552679185854,21.0,25.0,46.0,Alibaba,Open Source
31
  gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
 
3
  claude-3-opus-latest,Claude 3 Opus,1531.9661669788793,2.0,0.0,2.0,Anthropic,Proprietary
4
  mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
5
  qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
6
+ meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1516.0,1.0,0.0,1.0,Meta,Open Source
7
+ claude-3-sonnet-20240229,Claude 3 Sonnet,1515.263693206478,1.0,0.0,1.0,Anthropic,Proprietary
8
+ gpt-4.1,GPT-4.1,1502.1692789932397,1.0,1.0,2.0,OpenAI,Proprietary
9
  claude-3-haiku-20240307,Claude 3 Haiku,1501.6053648908744,3.0,3.0,6.0,Anthropic,Proprietary
10
  gpt-4-turbo,GPT-4 Turbo,1500.736306793522,1.0,1.0,2.0,OpenAI,Proprietary
11
+ deepseek-v3,DeepSeek V3,1500.6365920113653,1.0,1.0,2.0,DeepSeek,Open Source
12
  judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
13
  o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
14
  meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
15
  judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
16
  judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
17
  deepseek-r1,DeepSeek R1,1500.0,0.0,0.0,0.0,DeepSeek,Open Source
18
  claude-3-5-haiku-latest,Claude 3.5 Haiku,1500.0,0.0,0.0,0.0,Anthropic,Proprietary
19
  atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
20
  qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
21
  gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
22
  judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
23
+ judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
24
  meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1499.263693206478,1.0,1.0,2.0,Meta,Open Source
25
  meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1499.2598341210926,2.0,2.0,4.0,Meta,Open Source
26
  mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1487.3632548757455,0.0,2.0,2.0,Mistral AI,Open Source
27
  claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1484.0,0.0,1.0,1.0,Anthropic,Proprietary
28
+ meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1481.194128995395,1.0,2.0,3.0,Meta,Open Source
29
+ gpt-4o,GPT-4o,1468.736306793522,0.0,2.0,2.0,OpenAI,Proprietary
30
  qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1412.6552679185854,21.0,25.0,46.0,Alibaba,Open Source
31
  gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
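The elo_score movements above are consistent with a standard Elo update: judges with a single win sit at 1516.0 and judges with a single loss at 1484.0, which is what a K factor of 32 gives for two 1500-rated opponents. The update_leaderboard implementation itself is not part of this commit, so the snippet below is only a sketch under that assumption:

# Sketch of a standard Elo step; K_FACTOR = 32 is inferred from the 1516.0 / 1484.0
# entries above, the real value comes from src/config.py.
K_FACTOR = 32

def elo_update(winner_elo, loser_elo, k=K_FACTOR):
    expected_win = 1.0 / (1.0 + 10 ** ((loser_elo - winner_elo) / 400))
    return winner_elo + k * (1.0 - expected_win), loser_elo - k * (1.0 - expected_win)

print(elo_update(1500.0, 1500.0))  # (1516.0, 1484.0)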
data/models.jsonl CHANGED
@@ -26,5 +26,5 @@
26
  {"id": "qwen-2.5-7b-instruct-turbo", "name": "Qwen 2.5 7B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "provider": "together", "parameters": "7B"}
27
 
28
  {"id": "atla-selene", "name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene", "provider": "together", "parameters": "N/A"}
29
- {"id": "deepseek-v3", "name": "DeepSeek V3", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-v3", "provider": "together", "parameters": "671B"}
30
- {"id": "deepseek-r1", "name": "DeepSeek R1", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-r1", "provider": "together", "parameters": "671B"}
 
26
  {"id": "qwen-2.5-7b-instruct-turbo", "name": "Qwen 2.5 7B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "provider": "together", "parameters": "7B"}
27
 
28
  {"id": "atla-selene", "name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene", "provider": "together", "parameters": "N/A"}
29
+ {"id": "deepseek-v3", "name": "DeepSeek V3", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-ai/DeepSeek-V3", "provider": "together", "parameters": "671B"}
30
+ {"id": "deepseek-r1", "name": "DeepSeek R1", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-ai/DeepSeek-R1", "provider": "together", "parameters": "671B"}
models.jsonl CHANGED
@@ -25,6 +25,5 @@
25
  {"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
26
  {"id": "qwen-2.5-7b-instruct-turbo", "name": "Qwen 2.5 7B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "provider": "together", "parameters": "7B"}
27
 
28
- {"id": "atla-selene", "name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene", "provider": "together", "parameters": "N/A"}
29
- {"id": "deepseek-v3", "name": "DeepSeek V3", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-v3", "provider": "together", "parameters": "671B"}
30
- {"id": "deepseek-r1", "name": "DeepSeek R1", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-r1", "provider": "together", "parameters": "671B"}
 
25
  {"id": "qwen-2.5-72b-instruct-turbo", "name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "provider": "together", "parameters": "72B"}
26
  {"id": "qwen-2.5-7b-instruct-turbo", "name": "Qwen 2.5 7B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "provider": "together", "parameters": "7B"}
27
 
28
+ {"id": "deepseek-v3", "name": "DeepSeek V3", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-ai/DeepSeek-V3", "provider": "together", "parameters": "671B"}
29
+ {"id": "deepseek-r1", "name": "DeepSeek R1", "organization": "DeepSeek", "license": "Open Source", "api_model": "deepseek-ai/DeepSeek-R1", "provider": "together", "parameters": "671B"}
 
src/app.py CHANGED
@@ -1,8 +1,11 @@
1
  from typing import Any, Dict, Optional, Tuple
2
 
3
  import gradio as gr
4
  from loguru import logger
5
 
6
  from src.data_manager import get_random_example, load_models
7
  from src.judge import JudgeManager
8
  from src.ui import UI
@@ -221,8 +224,11 @@ def get_evaluation1(
221
  )
222
  logger.info("Completed evaluation 1")
223
 
224
  # Make the selection button visible once the evaluation is ready
225
- return eval1["display_evaluation"], gr.update(visible=True)
226
  except Exception as e:
227
  logger.error(f"Error getting evaluation 1: {e}")
228
  return f"Error: {str(e)}", gr.update(visible=False)
@@ -261,9 +267,12 @@ def get_evaluation2(
261
  )
262
  logger.info("Completed evaluation 2")
263
 
264
  # Make the selection button visible once the evaluation is ready and show additional buttons
265
  return (
266
- eval2["display_evaluation"],
267
  gr.update(visible=True),
268
  gr.update(visible=True),
269
  )
@@ -299,6 +308,54 @@ def format_inputs_for_evaluation(
299
  return input_text, output_text
300
 
301
 
302
  def select_winner(choice: str, judge_manager: JudgeManager) -> str:
303
  """Select a winner from the evaluations."""
304
  global eval1, eval2, current_test_type
@@ -307,12 +364,20 @@ def select_winner(choice: str, judge_manager: JudgeManager) -> str:
307
  if not eval1 or not eval2:
308
  return "Error: No evaluations available"
309
 
310
  if choice == "Evaluation 1":
311
  winner_eval = eval1
312
  loser_eval = eval2
313
  else:
314
  winner_eval = eval2
315
  loser_eval = eval1
316
 
317
  # Update leaderboard
318
  updated_board = judge_manager.update_leaderboard(
@@ -321,13 +386,29 @@ def select_winner(choice: str, judge_manager: JudgeManager) -> str:
321
  result_type="win",
322
  )
323
 
324
  # Construct result message with revealed judges' names
325
  result_message = f"You selected: {choice}\n\n"
326
- result_message += f"Evaluation 1 was by: {eval1['judge']['name']}\n"
327
- result_message += f"Evaluation 2 was by: {eval2['judge']['name']}\n\n"
328
 
329
  # Get the winner's new ELO score
330
- winner_id = winner_eval["judge"]["id"]
331
  winner_mask = updated_board["judge_id"] == winner_id
332
  winner_elo = updated_board[winner_mask]["elo_score"].values[0]
333
 
@@ -349,6 +430,12 @@ def handle_both_correct(judge_manager: JudgeManager) -> str:
349
  if not eval1 or not eval2:
350
  return "Error: No evaluations available"
351
 
352
  # Update leaderboard for both judges
353
  updated_board = judge_manager.update_leaderboard(
354
  eval1["judge"]["id"],
@@ -356,10 +443,27 @@ def handle_both_correct(judge_manager: JudgeManager) -> str:
356
  result_type="both_correct",
357
  )
358
 
359
  # Construct result message with revealed judges' names
360
  result_message = "You selected: Both Correct\n\n"
361
- result_message += f"Evaluation 1 was by: {eval1['judge']['name']}\n"
362
- result_message += f"Evaluation 2 was by: {eval2['judge']['name']}\n\n"
363
 
364
  # Get the new ELO scores
365
  judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
@@ -371,7 +475,7 @@ def handle_both_correct(judge_manager: JudgeManager) -> str:
371
  result_message += "\nBoth judges performed well!\n"
372
  result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
373
  result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
374
- result_message += "Test Type: {current_test_type}\n"
375
 
376
  return result_message
377
  except Exception as e:
@@ -387,6 +491,12 @@ def handle_both_incorrect(judge_manager: JudgeManager) -> str:
387
  if not eval1 or not eval2:
388
  return "Error: No evaluations available"
389
 
390
  # Update leaderboard for both judges
391
  updated_board = judge_manager.update_leaderboard(
392
  eval1["judge"]["id"],
@@ -394,10 +504,27 @@ def handle_both_incorrect(judge_manager: JudgeManager) -> str:
394
  result_type="both_incorrect",
395
  )
396
 
397
  # Construct result message with revealed judges' names
398
  result_message = "You selected: Both Incorrect\n\n"
399
- result_message += f"Evaluation 1 was by: {eval1['judge']['name']}\n"
400
- result_message += f"Evaluation 2 was by: {eval2['judge']['name']}\n\n"
401
 
402
  # Get the new ELO scores
403
  judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
 
1
+ import datetime
2
  from typing import Any, Dict, Optional, Tuple
3
 
4
  import gradio as gr
5
+ import pandas as pd
6
  from loguru import logger
7
 
8
+ from src.config import HISTORY_PATH
9
  from src.data_manager import get_random_example, load_models
10
  from src.judge import JudgeManager
11
  from src.ui import UI
 
224
  )
225
  logger.info("Completed evaluation 1")
226
 
227
+ # Format evaluation with time
228
+ display_eval = f"Evaluation time: {eval1['elapsed_time']:.2f} seconds\n\n{eval1['display_evaluation']}\n\n"
229
+
230
  # Make the selection button visible once the evaluation is ready
231
+ return display_eval, gr.update(visible=True)
232
  except Exception as e:
233
  logger.error(f"Error getting evaluation 1: {e}")
234
  return f"Error: {str(e)}", gr.update(visible=False)
 
267
  )
268
  logger.info("Completed evaluation 2")
269
 
270
+ # Format evaluation with time
271
+ display_eval = f"Evaluation time: {eval2['elapsed_time']:.2f} seconds\n\n{eval2['display_evaluation']}\n"
272
+
273
  # Make the selection button visible once the evaluation is ready and show additional buttons
274
  return (
275
+ display_eval,
276
  gr.update(visible=True),
277
  gr.update(visible=True),
278
  )
 
308
  return input_text, output_text
309
 
310
 
311
+ def save_to_history(
312
+ input_text: str,
313
+ output_text: str,
314
+ judge1_id: str,
315
+ judge1_name: str,
316
+ judge1_evaluation: str,
317
+ judge1_time: float,
318
+ judge2_id: str,
319
+ judge2_name: str,
320
+ judge2_evaluation: str,
321
+ judge2_time: float,
322
+ winner_id: str,
323
+ ) -> None:
324
+ """Save the evaluation results to history CSV file."""
325
+ try:
326
+ # Create a new row for the history
327
+ history_row = {
328
+ "timestamp": datetime.datetime.now().isoformat(),
329
+ "input": input_text,
330
+ "output": output_text,
331
+ "judge1_id": judge1_id,
332
+ "judge1_name": judge1_name,
333
+ "judge1_evaluation": judge1_evaluation,
334
+ "judge1_time": judge1_time,
335
+ "judge2_id": judge2_id,
336
+ "judge2_name": judge2_name,
337
+ "judge2_evaluation": judge2_evaluation,
338
+ "judge2_time": judge2_time,
339
+ "winner_id": winner_id,
340
+ }
341
+
342
+ # Try to load existing history
343
+ try:
344
+ history_df = pd.read_csv(HISTORY_PATH)
345
+ except (FileNotFoundError, pd.errors.EmptyDataError):
346
+ # Create a new history dataframe if file doesn't exist or is empty
347
+ history_df = pd.DataFrame(columns=list(history_row.keys()))
348
+
349
+ # Append the new row
350
+ history_df = pd.concat([history_df, pd.DataFrame([history_row])], ignore_index=True)
351
+
352
+ # Save to CSV
353
+ history_df.to_csv(HISTORY_PATH, index=False)
354
+ logger.info("Saved evaluation to history")
355
+ except Exception as e:
356
+ logger.error(f"Error saving to history: {e}")
357
+
358
+
359
  def select_winner(choice: str, judge_manager: JudgeManager) -> str:
360
  """Select a winner from the evaluations."""
361
  global eval1, eval2, current_test_type
 
364
  if not eval1 or not eval2:
365
  return "Error: No evaluations available"
366
 
367
+ # Get the input and output text that was evaluated
368
+ input_text, output_text = "", ""
369
+ if "input_text" in eval1 and "output_text" in eval1:
370
+ input_text = eval1.get("input_text", "")
371
+ output_text = eval1.get("output_text", "")
372
+
373
  if choice == "Evaluation 1":
374
  winner_eval = eval1
375
  loser_eval = eval2
376
+ winner_id = eval1["judge"]["id"]
377
  else:
378
  winner_eval = eval2
379
  loser_eval = eval1
380
+ winner_id = eval2["judge"]["id"]
381
 
382
  # Update leaderboard
383
  updated_board = judge_manager.update_leaderboard(
 
386
  result_type="win",
387
  )
388
 
389
+ # Save to history
390
+ save_to_history(
391
+ input_text=input_text,
392
+ output_text=output_text,
393
+ judge1_id=eval1["judge"]["id"],
394
+ judge1_name=eval1["judge"]["name"],
395
+ judge1_evaluation=eval1["anonymous_evaluation"],
396
+ judge1_time=eval1["elapsed_time"],
397
+ judge2_id=eval2["judge"]["id"],
398
+ judge2_name=eval2["judge"]["name"],
399
+ judge2_evaluation=eval2["anonymous_evaluation"],
400
+ judge2_time=eval2["elapsed_time"],
401
+ winner_id=winner_id,
402
+ )
403
+
404
  # Construct result message with revealed judges' names
405
  result_message = f"You selected: {choice}\n\n"
406
+ result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
407
+ result_message += (
408
+ f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
409
+ )
410
 
411
  # Get the winner's new ELO score
 
412
  winner_mask = updated_board["judge_id"] == winner_id
413
  winner_elo = updated_board[winner_mask]["elo_score"].values[0]
414
 
 
430
  if not eval1 or not eval2:
431
  return "Error: No evaluations available"
432
 
433
+ # Get the input and output text that was evaluated
434
+ input_text, output_text = "", ""
435
+ if "input_text" in eval1 and "output_text" in eval1:
436
+ input_text = eval1.get("input_text", "")
437
+ output_text = eval1.get("output_text", "")
438
+
439
  # Update leaderboard for both judges
440
  updated_board = judge_manager.update_leaderboard(
441
  eval1["judge"]["id"],
 
443
  result_type="both_correct",
444
  )
445
 
446
+ # Save to history with both as winners
447
+ save_to_history(
448
+ input_text=input_text,
449
+ output_text=output_text,
450
+ judge1_id=eval1["judge"]["id"],
451
+ judge1_name=eval1["judge"]["name"],
452
+ judge1_evaluation=eval1["anonymous_evaluation"],
453
+ judge1_time=eval1["elapsed_time"],
454
+ judge2_id=eval2["judge"]["id"],
455
+ judge2_name=eval2["judge"]["name"],
456
+ judge2_evaluation=eval2["anonymous_evaluation"],
457
+ judge2_time=eval2["elapsed_time"],
458
+ winner_id="both",
459
+ )
460
+
461
  # Construct result message with revealed judges' names
462
  result_message = "You selected: Both Correct\n\n"
463
+ result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
464
+ result_message += (
465
+ f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
466
+ )
467
 
468
  # Get the new ELO scores
469
  judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
 
475
  result_message += "\nBoth judges performed well!\n"
476
  result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
477
  result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
478
+ result_message += f"Test Type: {current_test_type}\n"
479
 
480
  return result_message
481
  except Exception as e:
 
491
  if not eval1 or not eval2:
492
  return "Error: No evaluations available"
493
 
494
+ # Get the input and output text that was evaluated
495
+ input_text, output_text = "", ""
496
+ if "input_text" in eval1 and "output_text" in eval1:
497
+ input_text = eval1.get("input_text", "")
498
+ output_text = eval1.get("output_text", "")
499
+
500
  # Update leaderboard for both judges
501
  updated_board = judge_manager.update_leaderboard(
502
  eval1["judge"]["id"],
 
504
  result_type="both_incorrect",
505
  )
506
 
507
+ # Save to history with neither as winner
508
+ save_to_history(
509
+ input_text=input_text,
510
+ output_text=output_text,
511
+ judge1_id=eval1["judge"]["id"],
512
+ judge1_name=eval1["judge"]["name"],
513
+ judge1_evaluation=eval1["anonymous_evaluation"],
514
+ judge1_time=eval1["elapsed_time"],
515
+ judge2_id=eval2["judge"]["id"],
516
+ judge2_name=eval2["judge"]["name"],
517
+ judge2_evaluation=eval2["anonymous_evaluation"],
518
+ judge2_time=eval2["elapsed_time"],
519
+ winner_id="none",
520
+ )
521
+
522
  # Construct result message with revealed judges' names
523
  result_message = "You selected: Both Incorrect\n\n"
524
+ result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
525
+ result_message += (
526
+ f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
527
+ )
528
 
529
  # Get the new ELO scores
530
  judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
src/data_manager.py CHANGED
@@ -127,7 +127,7 @@ def get_random_example(test_type: str) -> Dict[str, str]:
127
 
128
  # Map fields based on test type
129
  if test_type == "grounding":
130
- result["text"] = example.get("text", "")
131
  result["claim"] = example.get("claim", "")
132
  elif test_type in ["prompt injections", "safety"]:
133
  result["text"] = example.get("text", "")
 
127
 
128
  # Map fields based on test type
129
  if test_type == "grounding":
130
+ result["text"] = example.get("doc", "")
131
  result["claim"] = example.get("claim", "")
132
  elif test_type in ["prompt injections", "safety"]:
133
  result["text"] = example.get("text", "")
src/judge.py CHANGED
@@ -1,6 +1,9 @@
1
  import random
2
  from typing import Any, Dict, List, Tuple
3
 
4
  import pandas as pd
5
  from litellm import completion
6
  from loguru import logger
@@ -8,6 +11,8 @@ from together import Together
8
 
9
  from src.config import K_FACTOR, LEADERBOARD_PATH
10
 
11
 
12
  class JudgeManager:
13
  """Manages judge evaluations and judge data"""
@@ -79,18 +84,28 @@ class JudgeManager:
79
  ) -> Dict[str, Any]:
80
  """Get an evaluation from a judge"""
81
  try:
82
  # Create appropriate system prompt based on test type
83
  system_prompt = self._get_system_prompt(test_type)
84
 
85
  # Format user message with input and output
86
  user_message = self._create_user_message(input_text, output_text, test_type)
87
 
88
  # Get evaluation from the API
89
  if judge["provider"].lower() in ["openai", "anthropic"]:
90
  api_response = completion(
91
  model=judge["api_model"],
92
  messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
93
- temperature=0.2,
94
  max_tokens=500,
95
  )
96
  evaluation = api_response.choices[0].message.content
@@ -98,7 +113,7 @@ class JudgeManager:
98
  api_response = self.together_client.chat.completions.create(
99
  model=judge["api_model"],
100
  messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
101
- temperature=0.2,
102
  max_tokens=500,
103
  )
104
  evaluation = api_response.choices[0].message.content
@@ -112,18 +127,27 @@ class JudgeManager:
112
  # Store the full evaluation with judge name for revealing later
113
  full_eval = f"Evaluation by {judge['name']} (ID: {judge['id']}):\n\n" f"{evaluation}"
114
 
115
  return {
116
  "judge": judge,
117
  "evaluation": full_eval,
118
  "display_evaluation": anonymous_eval,
119
  "anonymous_evaluation": anonymous_eval,
120
  "revealed_evaluation": full_eval,
121
  }
122
 
123
  except Exception as e:
124
  # Handle API errors gracefully
125
  logger.error(f"Error getting evaluation from {judge['name']}: {str(e)}")
126
 
127
  # Create a fallback evaluation
128
  metrics = ["Quality: 7/10", "Relevance: 8/10", "Precision: 7/10"]
129
  comment = f"[Fallback evaluation due to API error: {str(e)}]"
@@ -139,6 +163,9 @@ class JudgeManager:
139
  "anonymous_evaluation": anonymous_eval,
140
  "revealed_evaluation": full_eval,
141
  "error": str(e),
142
  }
143
 
144
  def _create_user_message(self, input_text: str, output_text: str, test_type: str) -> str:
 
1
  import random
2
+ import time
3
  from typing import Any, Dict, List, Tuple
4
 
5
+ # Add litellm configuration to handle unsupported parameters
6
+ import litellm
7
  import pandas as pd
8
  from litellm import completion
9
  from loguru import logger
 
11
 
12
  from src.config import K_FACTOR, LEADERBOARD_PATH
13
 
14
+ litellm.drop_params = True
15
+
16
 
17
  class JudgeManager:
18
  """Manages judge evaluations and judge data"""
 
84
  ) -> Dict[str, Any]:
85
  """Get an evaluation from a judge"""
86
  try:
87
+ # Start timing
88
+ start_time = time.time()
89
+
90
  # Create appropriate system prompt based on test type
91
  system_prompt = self._get_system_prompt(test_type)
92
 
93
  # Format user message with input and output
94
  user_message = self._create_user_message(input_text, output_text, test_type)
95
 
96
+ # Set temperature based on model
97
+ temperature = 0.2
98
+ # O-series models only support temperature=1
99
+ if judge["provider"].lower() == "openai" and "o3" in judge["api_model"]:
100
+ temperature = 1.0
101
+ logger.info(f"Using temperature=1.0 for O-series model {judge['api_model']}")
102
+
103
  # Get evaluation from the API
104
  if judge["provider"].lower() in ["openai", "anthropic"]:
105
  api_response = completion(
106
  model=judge["api_model"],
107
  messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
108
+ temperature=temperature,
109
  max_tokens=500,
110
  )
111
  evaluation = api_response.choices[0].message.content
 
113
  api_response = self.together_client.chat.completions.create(
114
  model=judge["api_model"],
115
  messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
116
+ temperature=temperature,
117
  max_tokens=500,
118
  )
119
  evaluation = api_response.choices[0].message.content
 
127
  # Store the full evaluation with judge name for revealing later
128
  full_eval = f"Evaluation by {judge['name']} (ID: {judge['id']}):\n\n" f"{evaluation}"
129
 
130
+ # Calculate elapsed time
131
+ elapsed_time = time.time() - start_time
132
+
133
  return {
134
  "judge": judge,
135
  "evaluation": full_eval,
136
  "display_evaluation": anonymous_eval,
137
  "anonymous_evaluation": anonymous_eval,
138
  "revealed_evaluation": full_eval,
139
+ "elapsed_time": elapsed_time,
140
+ "input_text": input_text,
141
+ "output_text": output_text,
142
  }
143
 
144
  except Exception as e:
145
  # Handle API errors gracefully
146
  logger.error(f"Error getting evaluation from {judge['name']}: {str(e)}")
147
 
148
+ # End timing for error case
149
+ elapsed_time = time.time() - start_time
150
+
151
  # Create a fallback evaluation
152
  metrics = ["Quality: 7/10", "Relevance: 8/10", "Precision: 7/10"]
153
  comment = f"[Fallback evaluation due to API error: {str(e)}]"
 
163
  "anonymous_evaluation": anonymous_eval,
164
  "revealed_evaluation": full_eval,
165
  "error": str(e),
166
+ "elapsed_time": elapsed_time,
167
+ "input_text": input_text,
168
+ "output_text": output_text,
169
  }
170
 
171
  def _create_user_message(self, input_text: str, output_text: str, test_type: str) -> str:
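The litellm.drop_params = True setting added above tells litellm to drop request parameters a model does not accept instead of raising an error, and the explicit temperature=1.0 override for o3 models is kept as a second guard. A rough sketch of that behaviour, with an illustrative model name and prompt:

# Sketch: with drop_params enabled, a temperature the model rejects (as o-series
# models do) is dropped from the request rather than failing the call.
import litellm
from litellm import completion

litellm.drop_params = True

response = completion(
    model="o3-mini",
    messages=[{"role": "user", "content": "Say hello."}],
    temperature=0.2,  # unsupported for o-series models; dropped, not fatal
)
print(response.choices[0].message.content)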
src/ui.py CHANGED
@@ -99,7 +99,7 @@ class UI:
99
  label="Test Type",
100
  info="Select the type of test to evaluate",
101
  )
102
- refresh_button = gr.Button("Get Random Example")
103
 
104
  # Create different input layouts based on test type
105
  with gr.Row():
@@ -120,7 +120,7 @@ class UI:
120
  input_text = gr.Textbox(label="Input", lines=4, visible=False)
121
  output_text = gr.Textbox(label="Output", lines=6, visible=False)
122
 
123
- submit_button = gr.Button("Get Judge Evaluations")
124
  status_message = gr.Markdown(visible=False)
125
 
126
  with gr.Row():
 
99
  label="Test Type",
100
  info="Select the type of test to evaluate",
101
  )
102
+ refresh_button = gr.Button("Load from a dataset")
103
 
104
  # Create different input layouts based on test type
105
  with gr.Row():
 
120
  input_text = gr.Textbox(label="Input", lines=4, visible=False)
121
  output_text = gr.Textbox(label="Output", lines=6, visible=False)
122
 
123
+ submit_button = gr.Button("Evaluate")
124
  status_message = gr.Markdown(visible=False)
125
 
126
  with gr.Row():