hotfix - tsq table
benchmarks/safety/qualifire-Qualifire-safety-benchmark-judges-metrics.csv
ADDED

@@ -0,0 +1,24 @@
+judge_id,judge_name,dataset,f1,bacc,avg_latency,total_time,count,correct
+qualifire-eval,Qualifire,qualifire/Qualifire-safety-benchmark,0.8669527896995708,0.8746794871794872,1.1123814706802369,278.0953676700592,250,219
+meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,qualifire/Qualifire-safety-benchmark,0.8278688524590164,0.8323717948717948,1.6606825695037841,415.17064237594604,250,208
+meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,qualifire/Qualifire-safety-benchmark,0.8384279475982532,0.8500000000000001,2.0949828996658324,523.7457249164581,250,213
+meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,qualifire/Qualifire-safety-benchmark,0.8408163265306122,0.844551282051282,0.631532109260559,157.88302731513977,250,211
+meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,qualifire/Qualifire-safety-benchmark,0.8354430379746836,0.8432692307692308,3.991186770439148,997.796692609787,250,211
+meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,qualifire/Qualifire-safety-benchmark,0.7883817427385892,0.7958333333333334,1.352076150894165,338.01903772354126,250,199
+gemma-2-27b-it,Gemma 2 27B,qualifire/Qualifire-safety-benchmark,0.8452830188679246,0.8397435897435898,0.9085412425994873,227.13531064987183,250,209
+gemma-2-9b-it,Gemma 2 9B,qualifire/Qualifire-safety-benchmark,0.0,0.0,0.5193794212341308,129.84485530853271,250,0
+mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,qualifire/Qualifire-safety-benchmark,0.8083333333333333,0.8157051282051282,0.7919255542755127,197.98138856887817,250,204
+o3-mini, o3-mini,qualifire/Qualifire-safety-benchmark,0.772093023255814,0.7612179487179487,4.146714372634888,1036.678593158722,250,191
+gpt-4.1,GPT-4.1,qualifire/Qualifire-safety-benchmark,0.8333333333333334,0.8458333333333333,0.9198735637664794,229.96839094161987,250,212
+gpt-4o,GPT-4o,qualifire/Qualifire-safety-benchmark,0.848,0.8493589743589743,0.891314151763916,222.828537940979,250,212
+gpt-4-turbo,GPT-4 Turbo,qualifire/Qualifire-safety-benchmark,0.8442622950819673,0.8483974358974359,1.2011809492111205,300.29523730278015,250,212
+gpt-3.5-turbo,GPT-3.5 Turbo,qualifire/Qualifire-safety-benchmark,0.8244897959183674,0.828525641025641,0.6675097284317016,166.87743210792542,250,207
+claude-3-haiku-20240307,Claude 3 Haiku,qualifire/Qualifire-safety-benchmark,0.7948717948717948,0.8067307692307693,1.1251281709671022,281.2820427417755,250,202
+claude-3-sonnet-20240229,Claude 3 Sonnet,qualifire/Qualifire-safety-benchmark,0.8185654008438819,0.8272435897435897,1.1244929456710815,281.1232364177704,250,207
+claude-3-opus-latest,Claude 3 Opus,qualifire/Qualifire-safety-benchmark,0.8142292490118577,0.8099358974358974,1.781647458076477,445.41186451911926,250,202
+claude-3-5-sonnet-latest,Claude 3.5 Sonnet,qualifire/Qualifire-safety-benchmark,0.8220338983050848,0.8310897435897435,2.1737587728500367,543.4396932125092,250,208
+claude-3-5-haiku-latest,Claude 3.5 Haiku,qualifire/Qualifire-safety-benchmark,0.8392156862745098,0.8381410256410257,1.6393817520141603,409.84543800354004,250,209
+qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,qualifire/Qualifire-safety-benchmark,0.8412698412698413,0.8416666666666667,1.9033411865234375,475.8352966308594,250,210
+qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,qualifire/Qualifire-safety-benchmark,0.8046875,0.8022435897435898,0.752701171875,188.17529296875,250,200
+deepseek-v3,DeepSeek V3,qualifire/Qualifire-safety-benchmark,0.8379446640316206,0.8378205128205127,3.185033848762512,796.258462190628,250,209
+deepseek-r1,DeepSeek R1,qualifire/Qualifire-safety-benchmark,0.8387096774193549,0.8294871794871794,14.166411603927612,3541.602900981903,250,207
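For each judge, the new table records F1 and balanced accuracy (bacc) on the 250-sample qualifire/Qualifire-safety-benchmark run, the average per-call latency, the total wall-clock time (apparently in seconds, since total_time ≈ avg_latency × count), and the raw correct/count tallies. Below is a minimal pandas sketch, not part of the commit, for loading and ranking this table; the derived accuracy column is added purely for illustration.

import pandas as pd

METRICS_CSV = "benchmarks/safety/qualifire-Qualifire-safety-benchmark-judges-metrics.csv"

df = pd.read_csv(METRICS_CSV)

# Plain accuracy can be recomputed from the raw tallies as a sanity check.
df["accuracy"] = df["correct"] / df["count"]

# Rank judges by F1, breaking ties on balanced accuracy, with latency alongside.
ranked = df.sort_values(["f1", "bacc"], ascending=False)
print(ranked[["judge_name", "f1", "bacc", "accuracy", "avg_latency"]].to_string(index=False))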
data/leaderboard.csv
CHANGED

@@ -19,9 +19,9 @@ claude-3-opus-latest,Claude 3 Opus,1497.676896074783,1.0,2.0,3.0,Anthropic,Propr
 mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1492.172021842517,0.0,1.0,1.0,Mistral AI,Open Source,
 gemma-2-27b-it,Gemma 2 27B,1484.736306793522,0.0,1.0,1.0,Google,Open Source,
 meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1484.0,0.0,1.0,1.0,Meta,Open Source,
-meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1469.
+meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1469.353800586951,0.0,3.0,3.0,Meta,Open Source,
 gpt-4.1,GPT-4.1,1468.847220765222,0.0,2.0,2.0,OpenAI,Proprietary,
 deepseek-v3,DeepSeek V3,1466.4505035965371,0.0,3.0,3.0,DeepSeek,Open Source,
 deepseek-r1,DeepSeek R1,1466.3355627816525,1.0,4.0,5.0,DeepSeek,Open Source,
-o3-mini, o3-mini,1456.
-gemma-2-9b-it,Gemma 2 9B,1322.
+o3-mini, o3-mini,1456.1196407049383,0.0,3.0,3.0,OpenAI,Proprietary,
+gemma-2-9b-it,Gemma 2 9B,1322.1433222517269,3.0,28.0,31.0,Google,Open Source,
run_benchmarks.py
CHANGED

@@ -45,15 +45,20 @@ def get_test_type_from_dataset(dataset_name: str) -> str:
 
 
 def parse_ground_truth(
+    label_column: str,
     label: Any,
     safe_label: str = "SAFE",
-    unsafe_label: str = "
+    unsafe_label: str = "UNSAFE",
 ) -> str:
     """Convert various label formats to a standardized format for comparison"""
     if isinstance(label, bool):
+        if label_column.lower() == "safe":
+            return "SAFE" if not label else "UNSAFE"
         return "SAFE" if label else "UNSAFE"
     elif isinstance(label, (int, float)):
-
+        if label_column.lower() == "safe":
+            return safe_label if label > 0.5 else unsafe_label
+        return "UNSAFE" if label > 0.5 else "SAFE"
     elif isinstance(label, str):
         label = label.upper()
         # Map common label formats to the ones used by judges

@@ -326,7 +331,7 @@ def evaluate_dataset(
         # Get ground truth if available
         ground_truth = None
         if label_column and label_column in sample:
-            ground_truth = parse_ground_truth(sample[label_column])
+            ground_truth = parse_ground_truth(label_column, sample[label_column])
 
         # Evaluate with each judge
         for judge in models:
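With this change, parse_ground_truth receives the dataset's label column name as its first argument and special-cases a column literally named "safe" for both boolean and numeric labels, and the call site in evaluate_dataset now passes label_column through. Below is a small sanity-check sketch, not part of the commit, that exercises exactly the branches shown in the hunks above; it assumes run_benchmarks.py imports cleanly from the repository root, and "label" stands in for any column not named "safe".

from run_benchmarks import parse_ground_truth

# Boolean labels in a column literally named "safe" are inverted by the new branch:
# True comes back as "UNSAFE" and False as "SAFE".
assert parse_ground_truth("safe", True) == "UNSAFE"
assert parse_ground_truth("safe", False) == "SAFE"

# Boolean labels in any other column keep the original mapping.
assert parse_ground_truth("label", True) == "SAFE"

# Numeric labels: in a "safe" column a score above 0.5 maps to safe_label ("SAFE"),
# while in any other column a high score still maps to "UNSAFE".
assert parse_ground_truth("safe", 0.9) == "SAFE"
assert parse_ground_truth("label", 0.9) == "UNSAFE"

print("parse_ground_truth matches the branches in this hotfix")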