dsr1-fp4-sgl-isl8192osl1024 / populate_datasets.py
daniehua's picture
Rename populate_dataset.py to populate_datasets.py
68b042b verified
#!/usr/bin/env python3
"""
Script to populate the Hugging Face dataset with mock data
"""
from datasets import Dataset
from datetime import datetime, timedelta
import random
import os
# Configuration
# Hub repository (namespace/name) that receives the generated rows.
DATASET_ID = "daniehua/dsr1-fp4-sgl-isl8192osl1024"
# Access token taken from the HF_TOKEN environment variable. An unset or empty
# variable yields None, which leaves authentication to the cached login token.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
# Generate mock data
# Each entry is one synthetic row: a randomly chosen team, a timestamp, a
# concurrency level, random integer metrics for MI355X and B200, their
# MI355X/B200 ratios, and three random floats in [0.0, 1.0).
mock_data = []
teams = ["Official Test1", "Official Test2"]
# Start a week back so every generated timestamp lies in the past.
base_date = datetime.now() - timedelta(days=7)
for i in range(2):
    team = random.choice(teams)
    # Consecutive entries are spaced half a day apart.
    timestamp = (base_date + timedelta(days=i / 2)).strftime("%Y-%m-%d %H:%M:%S")
    conc = random.choice([4, 8, 16, 32, 64])
    # Lower bound of 1000 keeps the ratio denominators below non-zero.
    mi355x_e2e = random.randint(1000, 2000)
    mi355x_throughput = random.randint(1000, 2000)
    b200_e2e = random.randint(1000, 2000)
    b200_throughput = random.randint(1000, 2000)
    e2e_ratio = mi355x_e2e / b200_e2e
    throughput_ratio = mi355x_throughput / b200_throughput
    bits_per_byte = random.random()
    byte_perplexity = random.random()
    word_perplexity = random.random()
    entry = {
        "team_name": team,
        "timestamp": timestamp,
        "conc": conc,
        "mi355x_e2e": mi355x_e2e,
        "mi355x_throughput": mi355x_throughput,
        "b200_e2e": b200_e2e,
        "b200_throughput": b200_throughput,
        "e2e_ratio": e2e_ratio,
        "throughput_ratio": throughput_ratio,
        "bits_per_byte": bits_per_byte,
        "byte_perplexity": byte_perplexity,
        "word_perplexity": word_perplexity,
    }
    mock_data.append(entry)
# Order entries by MI355X throughput, highest first
mock_data.sort(key=lambda entry: entry["mi355x_throughput"], reverse=True)
# Create dataset and push to hub
print(f"Creating dataset with {len(mock_data)} entries...")
dataset = Dataset.from_list(mock_data)
print(f"Pushing to Hugging Face Hub: {DATASET_ID}")
# HF_TOKEN is passed through unconditionally; when it is None the upload is
# expected to use the cached token from `huggingface-cli login`.
dataset.push_to_hub(DATASET_ID, token=HF_TOKEN)
print("Dataset populated successfully!")
print("\nSample entries:")