dror44 committed
Commit fba21bb · 1 Parent(s): d43ab95

hotfix - tsq table

benchmarks/safety/qualifire-Qualifire-safety-benchmark-judges-metrics.csv ADDED
@@ -0,0 +1,24 @@
+ judge_id,judge_name,dataset,f1,bacc,avg_latency,total_time,count,correct
+ qualifire-eval,Qualifire,qualifire/Qualifire-safety-benchmark,0.8669527896995708,0.8746794871794872,1.1123814706802369,278.0953676700592,250,219
+ meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,qualifire/Qualifire-safety-benchmark,0.8278688524590164,0.8323717948717948,1.6606825695037841,415.17064237594604,250,208
+ meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,qualifire/Qualifire-safety-benchmark,0.8384279475982532,0.8500000000000001,2.0949828996658324,523.7457249164581,250,213
+ meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,qualifire/Qualifire-safety-benchmark,0.8408163265306122,0.844551282051282,0.631532109260559,157.88302731513977,250,211
+ meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,qualifire/Qualifire-safety-benchmark,0.8354430379746836,0.8432692307692308,3.991186770439148,997.796692609787,250,211
+ meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,qualifire/Qualifire-safety-benchmark,0.7883817427385892,0.7958333333333334,1.352076150894165,338.01903772354126,250,199
+ gemma-2-27b-it,Gemma 2 27B,qualifire/Qualifire-safety-benchmark,0.8452830188679246,0.8397435897435898,0.9085412425994873,227.13531064987183,250,209
+ gemma-2-9b-it,Gemma 2 9B,qualifire/Qualifire-safety-benchmark,0.0,0.0,0.5193794212341308,129.84485530853271,250,0
+ mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,qualifire/Qualifire-safety-benchmark,0.8083333333333333,0.8157051282051282,0.7919255542755127,197.98138856887817,250,204
+ o3-mini, o3-mini,qualifire/Qualifire-safety-benchmark,0.772093023255814,0.7612179487179487,4.146714372634888,1036.678593158722,250,191
+ gpt-4.1,GPT-4.1,qualifire/Qualifire-safety-benchmark,0.8333333333333334,0.8458333333333333,0.9198735637664794,229.96839094161987,250,212
+ gpt-4o,GPT-4o,qualifire/Qualifire-safety-benchmark,0.848,0.8493589743589743,0.891314151763916,222.828537940979,250,212
+ gpt-4-turbo,GPT-4 Turbo,qualifire/Qualifire-safety-benchmark,0.8442622950819673,0.8483974358974359,1.2011809492111205,300.29523730278015,250,212
+ gpt-3.5-turbo,GPT-3.5 Turbo,qualifire/Qualifire-safety-benchmark,0.8244897959183674,0.828525641025641,0.6675097284317016,166.87743210792542,250,207
+ claude-3-haiku-20240307,Claude 3 Haiku,qualifire/Qualifire-safety-benchmark,0.7948717948717948,0.8067307692307693,1.1251281709671022,281.2820427417755,250,202
+ claude-3-sonnet-20240229,Claude 3 Sonnet,qualifire/Qualifire-safety-benchmark,0.8185654008438819,0.8272435897435897,1.1244929456710815,281.1232364177704,250,207
+ claude-3-opus-latest,Claude 3 Opus,qualifire/Qualifire-safety-benchmark,0.8142292490118577,0.8099358974358974,1.781647458076477,445.41186451911926,250,202
+ claude-3-5-sonnet-latest,Claude 3.5 Sonnet,qualifire/Qualifire-safety-benchmark,0.8220338983050848,0.8310897435897435,2.1737587728500367,543.4396932125092,250,208
+ claude-3-5-haiku-latest,Claude 3.5 Haiku,qualifire/Qualifire-safety-benchmark,0.8392156862745098,0.8381410256410257,1.6393817520141603,409.84543800354004,250,209
+ qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,qualifire/Qualifire-safety-benchmark,0.8412698412698413,0.8416666666666667,1.9033411865234375,475.8352966308594,250,210
+ qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,qualifire/Qualifire-safety-benchmark,0.8046875,0.8022435897435898,0.752701171875,188.17529296875,250,200
+ deepseek-v3,DeepSeek V3,qualifire/Qualifire-safety-benchmark,0.8379446640316206,0.8378205128205127,3.185033848762512,796.258462190628,250,209
+ deepseek-r1,DeepSeek R1,qualifire/Qualifire-safety-benchmark,0.8387096774193549,0.8294871794871794,14.166411603927612,3541.602900981903,250,207
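
For anyone consuming the new metrics file, here is a minimal sketch (assuming pandas is available; the path and column names are the ones added above) that loads the table and ranks judges by F1 and balanced accuracy:

```python
import pandas as pd

# Path of the CSV added in this commit.
PATH = "benchmarks/safety/qualifire-Qualifire-safety-benchmark-judges-metrics.csv"

df = pd.read_csv(PATH)

# Derive plain accuracy from the correct/count columns and rank the judges.
df["accuracy"] = df["correct"] / df["count"]
ranked = df.sort_values(["f1", "bacc"], ascending=False)

print(ranked[["judge_name", "f1", "bacc", "accuracy", "avg_latency"]].to_string(index=False))
```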
data/leaderboard.csv CHANGED
@@ -19,9 +19,9 @@ claude-3-opus-latest,Claude 3 Opus,1497.676896074783,1.0,2.0,3.0,Anthropic,Propr
  mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1492.172021842517,0.0,1.0,1.0,Mistral AI,Open Source,
  gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
  meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1484.0,0.0,1.0,1.0,Meta,Open Source,
- meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1469.3538005869507,0.0,3.0,3.0,Meta,Open Source,
+ meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1469.353800586951,0.0,3.0,3.0,Meta,Open Source,
  gpt-4.1,GPT-4.1,1468.847220765222,0.0,2.0,2.0,OpenAI,Proprietary,
  deepseek-v3,DeepSeek V3,1466.4505035965371,0.0,3.0,3.0,DeepSeek,Open Source,
  deepseek-r1,DeepSeek R1,1466.3355627816525,1.0,4.0,5.0,DeepSeek,Open Source,
- o3-mini, o3-mini,1456.1196407049385,0.0,3.0,3.0,OpenAI,Proprietary,
+ o3-mini, o3-mini,1456.1196407049383,0.0,3.0,3.0,OpenAI,Proprietary,
- gemma-2-9b-it,Gemma 2 9B,1322.1433222517267,3.0,28.0,31.0,Google,Open Source,
+ gemma-2-9b-it,Gemma 2 9B,1322.1433222517269,3.0,28.0,31.0,Google,Open Source,
run_benchmarks.py CHANGED
@@ -45,15 +45,20 @@ def get_test_type_from_dataset(dataset_name: str) -> str:
 
 
  def parse_ground_truth(
+     label_column: str,
      label: Any,
      safe_label: str = "SAFE",
-     unsafe_label: str = "PROMPT_INJECTION",
+     unsafe_label: str = "UNSAFE",
  ) -> str:
      """Convert various label formats to a standardized format for comparison"""
      if isinstance(label, bool):
+         if label_column.lower() == "safe":
+             return "SAFE" if not label else "UNSAFE"
          return "SAFE" if label else "UNSAFE"
      elif isinstance(label, (int, float)):
-         return unsafe_label if label > 0.5 else safe_label
+         if label_column.lower() == "safe":
+             return safe_label if label > 0.5 else unsafe_label
+         return "UNSAFE" if label > 0.5 else "SAFE"
      elif isinstance(label, str):
          label = label.upper()
          # Map common label formats to the ones used by judges
@@ -326,7 +331,7 @@ def evaluate_dataset(
          # Get ground truth if available
          ground_truth = None
          if label_column and label_column in sample:
-             ground_truth = parse_ground_truth(sample[label_column])
+             ground_truth = parse_ground_truth(label_column, sample[label_column])
 
          # Evaluate with each judge
          for judge in models:
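
Putting the two hunks together, here is a self-contained sketch of how parse_ground_truth behaves after this commit; the string branch and the demo sample below are illustrative assumptions, not taken from the diff:

```python
from typing import Any


def parse_ground_truth(
    label_column: str,
    label: Any,
    safe_label: str = "SAFE",
    unsafe_label: str = "UNSAFE",
) -> str:
    """Convert various label formats to a standardized format for comparison."""
    if isinstance(label, bool):
        # New in this commit: a column literally named "safe" flips the
        # default boolean mapping (True -> "UNSAFE", False -> "SAFE").
        if label_column.lower() == "safe":
            return "SAFE" if not label else "UNSAFE"
        return "SAFE" if label else "UNSAFE"
    elif isinstance(label, (int, float)):
        # New in this commit: numeric scores in a "safe" column above 0.5
        # map to the safe label; other numeric columns keep the old reading.
        if label_column.lower() == "safe":
            return safe_label if label > 0.5 else unsafe_label
        return "UNSAFE" if label > 0.5 else "SAFE"
    elif isinstance(label, str):
        # Illustrative stand-in for the unchanged string branch in the repo.
        return safe_label if label.upper() == "SAFE" else unsafe_label
    raise ValueError(f"Unsupported label type: {type(label)!r}")


# Hypothetical sample mirroring the updated call site in evaluate_dataset.
sample = {"prompt": "...", "safe": 0.9}
print(parse_ground_truth("safe", sample["safe"]))  # -> "SAFE"
```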