"""
Helion-2.5-Rnd Evaluation Script
Comprehensive benchmark evaluation across multiple datasets
"""
| |
|
import argparse
import json
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
| |
|
# Root logging setup: INFO level, timestamped records tagged with the logger
# name and severity.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by everything in this script.
logger = logging.getLogger(__name__)
| |
|
| |
|
class HelionEvaluator:
    """Evaluation framework for Helion model.

    Loads a causal language model and its tokenizer once, then runs benchmark
    loops (MMLU, GSM8K, HumanEval, TruthfulQA) on top of a single-prompt
    ``generate`` helper.
    """

    # Label reported by ``evaluate_all`` when no ``model_path`` was recorded
    # (for example when the instance was created without ``__init__``).
    _DEFAULT_MODEL_NAME = "DeepXR/Helion-2.5-Rnd"

    # Signed decimal number with optional thousands separators, e.g. "-1,234.5".
    _NUMBER = r"-?\d+(?:,\d+)*(?:\.\d+)?"
    # GSM8K reference answers end with "#### <number>".
    _GSM8K_REFERENCE_RE = re.compile(r"####\s*(" + _NUMBER + r")")
    # Explicit final-answer statements in a (lower-cased) model response.
    _GSM8K_STATED_RE = re.compile(
        r"(?:answer is|####)[\s:*$]*(" + _NUMBER + r")"
    )
    # Results of equations in a model response, e.g. "= 42" or "= $42".
    _GSM8K_EQUALS_RE = re.compile(r"=\s*\$?\s*(" + _NUMBER + r")")
    # A single letter at the start of a response that is not the first letter
    # of a longer word ("B", "b)", "(C)" match; "Because" does not).
    _LEADING_LETTER_RE = re.compile(r"\s*\(?([A-Za-z])(?![A-Za-z0-9])")

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        batch_size: int = 1,
        max_length: int = 2048
    ):
        """
        Initialize evaluator

        Args:
            model_path: Path to model or HuggingFace model ID
            device: Fallback device for input tensors. The model itself is
                placed by ``device_map="auto"``; inputs follow the model's
                device when it exposes one.
            batch_size: Batch size for evaluation (stored; ``generate``
                processes one prompt per call)
            max_length: Maximum prompt length in tokens; longer prompts are
                truncated by the tokenizer
        """
        logger.info(f"Loading model from {model_path}")

        # Remembered so reports are labelled with the model actually evaluated.
        self.model_path = model_path

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # NOTE(security): trust_remote_code=True executes Python code shipped
        # with the checkpoint; only point this at sources you trust.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True
        )

        self.device = device
        self.batch_size = batch_size
        self.max_length = max_length

        logger.info("Model loaded successfully")

    # ------------------------------------------------------------------
    # Pure helpers (no model access)
    # ------------------------------------------------------------------

    @staticmethod
    def _limit_dataset(dataset, num_samples: Optional[int]):
        """Return the first ``num_samples`` rows, or everything if falsy."""
        if num_samples:
            return dataset.select(range(min(num_samples, len(dataset))))
        return dataset

    @staticmethod
    def _accuracy_report(benchmark: str, correct: int, total: int) -> Dict:
        """Build the result dict shared by the accuracy-based benchmarks."""
        return {
            "benchmark": benchmark,
            "accuracy": correct / total if total > 0 else 0,
            "correct": correct,
            "total": total
        }

    @staticmethod
    def _format_mmlu_prompt(question: str, choices: List[str]) -> str:
        """Render a multiple-choice question with lettered options."""
        options = "".join(
            f"{chr(65 + index)}. {choice}\n"
            for index, choice in enumerate(choices)
        )
        return f"Question: {question}\n\nChoices:\n{options}\nAnswer: "

    @classmethod
    def _extract_choice_letter(cls, response: str, num_choices: int = 4) -> str:
        """Return the option letter a response starts with, or ``""``.

        Only a standalone leading letter counts, so a response such as
        "Because ..." is not mistaken for option "B". Letters beyond the
        available options are rejected.
        """
        match = cls._LEADING_LETTER_RE.match(response or "")
        if match is None:
            return ""
        letter = match.group(1).upper()
        valid_letters = "".join(chr(65 + index) for index in range(num_choices))
        return letter if letter in valid_letters else ""

    @classmethod
    def _extract_gsm8k_prediction(cls, response: str) -> Optional[str]:
        """Return the final numeric answer in a response, commas removed.

        An explicit statement ("answer is N", "#### N") wins over equation
        results, and within each kind the last occurrence is used because
        earlier ones are intermediate steps. Returns ``None`` if no number
        follows any marker.
        """
        text = response.lower()
        for pattern in (cls._GSM8K_STATED_RE, cls._GSM8K_EQUALS_RE):
            found = pattern.findall(text)
            if found:
                return found[-1].replace(",", "")
        return None

    @staticmethod
    def _numbers_match(predicted: str, expected: str) -> bool:
        """Compare two numeric strings by value ("18.0" equals "18")."""
        try:
            return float(predicted) == float(expected)
        except ValueError:
            # Not parseable as numbers: fall back to exact text comparison.
            return predicted == expected

    # ------------------------------------------------------------------
    # Generation
    # ------------------------------------------------------------------

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 512,
        temperature: float = 0.0,
        **kwargs
    ) -> str:
        """Generate text from prompt

        Args:
            prompt: Text to condition on (truncated to ``self.max_length``)
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature; values <= 0 select greedy
                decoding
            **kwargs: Extra arguments forwarded to ``model.generate``; they
                take precedence over the defaults set here

        Returns:
            The decoded continuation only (prompt tokens removed), stripped
            of surrounding whitespace.
        """
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length
        )

        # The model is placed by device_map="auto", which may not be the
        # device the caller named, so inputs follow the model when possible.
        model_device = getattr(self.model, "device", None)
        inputs = inputs.to(
            model_device if model_device is not None else self.device
        )

        # Tokenizers without a pad token would otherwise pass None here.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        sampling = temperature > 0
        generation_args = {
            "max_new_tokens": max_new_tokens,
            # Greedy decoding ignores temperature; pass the neutral value.
            "temperature": temperature if sampling else 1.0,
            "do_sample": sampling,
            "pad_token_id": pad_token_id,
        }
        # Merge instead of passing both, so an explicit kwarg overrides a
        # default rather than raising a duplicate-keyword TypeError.
        generation_args.update(kwargs)

        with torch.no_grad():
            outputs = self.model.generate(**inputs, **generation_args)

        prompt_length = inputs['input_ids'].shape[1]
        response = self.tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        )

        return response.strip()

    # ------------------------------------------------------------------
    # Benchmarks
    # ------------------------------------------------------------------

    def evaluate_mmlu(self, num_samples: Optional[int] = None) -> Dict:
        """Evaluate on MMLU benchmark

        Args:
            num_samples: Evaluate only the first N test examples; ``None``
                (or 0) evaluates all of them

        Returns:
            Dict with ``benchmark``, ``accuracy``, ``correct`` and ``total``.
        """
        logger.info("Evaluating on MMLU...")

        dataset = load_dataset("cais/mmlu", "all", split="test")
        dataset = self._limit_dataset(dataset, num_samples)

        correct = 0
        total = 0

        for example in tqdm(dataset, desc="MMLU"):
            choices = example["choices"]
            prompt = self._format_mmlu_prompt(example["question"], choices)

            response = self.generate(prompt, max_new_tokens=10, temperature=0.0)

            predicted = self._extract_choice_letter(response, len(choices))
            if predicted == chr(65 + example["answer"]):
                correct += 1
            total += 1

        return self._accuracy_report("MMLU", correct, total)

    def evaluate_gsm8k(self, num_samples: Optional[int] = None) -> Dict:
        """Evaluate on GSM8K mathematical reasoning

        Examples whose reference answer has no ``#### <number>`` marker are
        skipped and not counted in ``total``.

        Args:
            num_samples: Evaluate only the first N test examples; ``None``
                (or 0) evaluates all of them

        Returns:
            Dict with ``benchmark``, ``accuracy``, ``correct`` and ``total``.
        """
        logger.info("Evaluating on GSM8K...")

        dataset = load_dataset("gsm8k", "main", split="test")
        dataset = self._limit_dataset(dataset, num_samples)

        correct = 0
        total = 0

        for example in tqdm(dataset, desc="GSM8K"):
            question = example["question"]

            reference = self._GSM8K_REFERENCE_RE.search(example["answer"])
            if reference is None:
                continue
            expected = reference.group(1).replace(',', '')

            prompt = f"Question: {question}\n\nLet's solve this step by step:\n"
            response = self.generate(prompt, max_new_tokens=512, temperature=0.0)

            predicted = self._extract_gsm8k_prediction(response)
            if predicted is not None and self._numbers_match(predicted, expected):
                correct += 1

            total += 1

        return self._accuracy_report("GSM8K", correct, total)

    def evaluate_humaneval(self, num_samples: Optional[int] = None) -> Dict:
        """Evaluate on HumanEval code generation

        Only generates completions; nothing is executed or scored here.

        Args:
            num_samples: Generate only for the first N problems; ``None``
                (or 0) covers all of them

        Returns:
            Dict with ``benchmark``, ``samples_generated``, the generated
            ``completions`` (task id, prompt + completion, test code) and a
            ``note``; or a dict with ``benchmark`` and ``error`` when the
            dataset cannot be loaded.
        """
        logger.info("Evaluating on HumanEval...")

        try:
            dataset = load_dataset("openai_humaneval", split="test")
        except Exception as exc:
            # A bare except would also swallow KeyboardInterrupt/SystemExit.
            logger.warning(f"HumanEval dataset not available: {exc}")
            return {"benchmark": "HumanEval", "error": "Dataset not available"}

        dataset = self._limit_dataset(dataset, num_samples)

        results = []

        for example in tqdm(dataset, desc="HumanEval"):
            prompt = example["prompt"]

            full_prompt = f"Complete the following Python function:\n\n{prompt}"
            response = self.generate(
                full_prompt,
                max_new_tokens=512,
                temperature=0.0
            )

            results.append({
                "task_id": example["task_id"],
                "completion": prompt + response,
                "test": example["test"]
            })

        return {
            "benchmark": "HumanEval",
            "samples_generated": len(results),
            # Returned so the completions can be scored by an external
            # execution harness instead of being thrown away.
            "completions": results,
            "note": "Full evaluation requires code execution framework"
        }

    def evaluate_truthfulqa(self, num_samples: Optional[int] = None) -> Dict:
        """Evaluate on TruthfulQA

        Only collects model answers next to the reference answers; no
        automatic truthfulness score is computed.

        Args:
            num_samples: Answer only the first N questions; ``None`` (or 0)
                covers all of them

        Returns:
            Dict with ``benchmark``, ``samples_evaluated``, the collected
            ``responses`` and a ``note``.
        """
        logger.info("Evaluating on TruthfulQA...")

        dataset = load_dataset("truthful_qa", "generation", split="validation")
        dataset = self._limit_dataset(dataset, num_samples)

        responses = []

        for example in tqdm(dataset, desc="TruthfulQA"):
            question = example["question"]

            prompt = f"Question: {question}\n\nProvide a truthful and accurate answer:\nAnswer: "

            response = self.generate(prompt, max_new_tokens=256, temperature=0.0)

            responses.append({
                "question": question,
                "response": response,
                "best_answer": example["best_answer"],
                "correct_answers": example["correct_answers"],
                "incorrect_answers": example["incorrect_answers"]
            })

        return {
            "benchmark": "TruthfulQA",
            "samples_evaluated": len(responses),
            # Returned because the note below asks for manual review, which
            # needs the answers themselves.
            "responses": responses,
            "note": "Manual review required for truthfulness assessment"
        }

    def evaluate_all(
        self,
        output_file: Optional[str] = None,
        num_samples: Optional[int] = None
    ) -> Dict:
        """Run all evaluations

        A failure in one benchmark is logged and recorded as
        ``{"error": ...}`` for that benchmark; the remaining ones still run.

        Args:
            output_file: If given, results are written there as JSON (parent
                directories are created)
            num_samples: Per-benchmark sample limit passed to each evaluation

        Returns:
            Dict with ``model`` and a ``benchmarks`` mapping of results.
        """
        logger.info("Starting comprehensive evaluation...")

        results = {
            # Label the report with the evaluated model, not a fixed name.
            "model": getattr(self, "model_path", self._DEFAULT_MODEL_NAME),
            "benchmarks": {}
        }

        runners = (
            ("mmlu", "MMLU", self.evaluate_mmlu),
            ("gsm8k", "GSM8K", self.evaluate_gsm8k),
            ("humaneval", "HumanEval", self.evaluate_humaneval),
            ("truthfulqa", "TruthfulQA", self.evaluate_truthfulqa),
        )
        for key, label, runner in runners:
            try:
                results["benchmarks"][key] = runner(num_samples)
            except Exception as e:
                logger.error(f"{label} evaluation failed: {e}")
                results["benchmarks"][key] = {"error": str(e)}

        if output_file:
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2)

            logger.info(f"Results saved to {output_path}")

        logger.info("\n" + "="*50)
        logger.info("EVALUATION SUMMARY")
        logger.info("="*50)

        for benchmark, result in results["benchmarks"].items():
            if "accuracy" in result:
                logger.info(f"{benchmark.upper()}: {result['accuracy']:.2%}")
            elif "error" in result:
                logger.info(f"{benchmark.upper()}: ERROR - {result['error']}")
            else:
                logger.info(f"{benchmark.upper()}: {result.get('note', 'Completed')}")

        return results
| |
|
| |
|
def main():
    """Main evaluation entry point

    Parses the command line, loads the model, and runs either every benchmark
    (``--benchmarks all``, the default) or only the selected ones. Results are
    written as JSON to ``--output``.
    """
    parser = argparse.ArgumentParser(description="Evaluate Helion model")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Model path or HuggingFace ID"
    )
    parser.add_argument(
        "--benchmarks",
        type=str,
        nargs="+",
        default=["all"],
        choices=["all", "mmlu", "gsm8k", "humaneval", "truthfulqa"],
        help="Benchmarks to run"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="evaluation_results.json",
        help="Output file for results"
    )
    parser.add_argument(
        "--num-samples",
        type=int,
        default=None,
        help="Number of samples to evaluate (for quick testing)"
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device to use"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        help="Batch size"
    )

    args = parser.parse_args()

    evaluator = HelionEvaluator(
        model_path=args.model,
        device=args.device,
        batch_size=args.batch_size
    )

    if "all" in args.benchmarks:
        # evaluate_all writes the output file and logs the summary itself.
        evaluator.evaluate_all(
            output_file=args.output,
            num_samples=args.num_samples
        )
        return

    results = {"model": args.model, "benchmarks": {}}

    # Fixed execution order, independent of the order given on the CLI.
    for name in ("mmlu", "gsm8k", "humaneval", "truthfulqa"):
        if name not in args.benchmarks:
            continue
        runner = getattr(evaluator, f"evaluate_{name}")
        try:
            results["benchmarks"][name] = runner(args.num_samples)
        except Exception as e:
            # Record the failure and carry on, as evaluate_all does, so one
            # broken benchmark does not discard the results already collected.
            logger.error(f"{name.upper()} evaluation failed: {e}")
            results["benchmarks"][name] = {"error": str(e)}

    # Create missing parent directories, as evaluate_all does for its output.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    logger.info(f"Results saved to {args.output}")
| |
|
| |
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()