#!/usr/bin/env python3
"""
Script to populate the Hugging Face dataset with mock data
"""

from datasets import Dataset
from datetime import datetime, timedelta
import random
import os

# Configuration
DATASET_ID = "daniehua/dsr1-fp4-sgl-isl8192osl1024"
# Access token taken from the HF_TOKEN environment variable. An unset or empty
# variable yields None, which leaves authentication to the locally cached
# login token (e.g. from `huggingface-cli login`).
HF_TOKEN = os.environ.get("HF_TOKEN") or None

# Mock rows for the dataset, one dict per generated entry
mock_data = []

teams = ["Official Test1", "Official Test2"]
# Generated timestamps start one week before the time the script runs
base_date = datetime.now() - timedelta(days=7)

for step in range(2):
    team_name = random.choice(teams)
    # Consecutive entries are spaced half a day apart
    stamp = (base_date + timedelta(days=step / 2)).strftime("%Y-%m-%d %H:%M:%S")

    conc = random.choice([4, 8, 16, 32, 64])
    # Four independent integer draws, consumed in this fixed order
    mi355x_e2e, mi355x_throughput, b200_e2e, b200_throughput = (
        random.randint(1000, 2000) for _ in range(4)
    )
    # Three independent floats in [0, 1) for the quality metrics
    bits_per_byte, byte_perplexity, word_perplexity = (
        random.random() for _ in range(3)
    )

    mock_data.append(
        {
            "team_name": team_name,
            "timestamp": stamp,
            "conc": conc,
            "mi355x_e2e": mi355x_e2e,
            "mi355x_throughput": mi355x_throughput,
            "b200_e2e": b200_e2e,
            "b200_throughput": b200_throughput,
            # Ratios are MI355X relative to B200
            "e2e_ratio": mi355x_e2e / b200_e2e,
            "throughput_ratio": mi355x_throughput / b200_throughput,
            "bits_per_byte": bits_per_byte,
            "byte_perplexity": byte_perplexity,
            "word_perplexity": word_perplexity,
        }
    )

# Order rows by MI355X throughput, highest first
mock_data.sort(key=lambda row: row["mi355x_throughput"], reverse=True)

# Create dataset and push to hub
print(f"Creating dataset with {len(mock_data)} entries...")
# Build the Dataset from the list of per-entry dicts collected in mock_data
dataset = Dataset.from_list(mock_data)

print(f"Pushing to Hugging Face Hub: {DATASET_ID}")
# One call handles both cases: HF_TOKEN is passed through unchanged. A value
# of None is presumed to fall back to the locally cached login token (e.g.
# from `huggingface-cli login`) -- verify against the installed library.
dataset.push_to_hub(DATASET_ID, token=HF_TOKEN)

# Reached only if the push above did not raise
print("Dataset populated successfully!")
print("\nSample entries:")