#!/usr/bin/env python3
"""
Demo script for RND1 generation.
"""

import argparse
import os
import random
import sys
from typing import Optional

import numpy as np
import torch
from transformers import AutoTokenizer

# Add RND1 module to path for local testing
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))


def set_seed(seed: int):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
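    # Note: this seeds the Python, NumPy, and torch RNGs; full GPU determinism may
    # additionally require torch.use_deterministic_algorithms(True), which this
    # demo does not enable.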


def demo_completion(
    model_path: str,
    checkpoint_path: Optional[str] = None,
    device: str = "cuda:0",
    use_bfloat16: bool = True,
    show_visualization: bool = True,
    num_steps: int = 64,
    max_new_tokens: int = 256,
    custom_prompt: Optional[str] = None,
    temperature: float = 1.0,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    mask_token_id: int = 151669,
    seed: int = 12345,
    moe_backend: str = "hf",
    mode: str = "task",
):
| """ | |
| Demonstrate text completion using RND1. | |
| Args: | |
| model_path: Path to base model or HuggingFace model ID | |
| checkpoint_path: Path to custom checkpoint (if any) | |
| device: Device to run on (e.g., cuda:0, cpu) | |
| use_bfloat16: Whether to use bfloat16 precision | |
| show_visualization: Whether to show live visualization (requires rich) | |
| num_steps: Number of diffusion steps | |
| max_new_tokens: Maximum number of tokens to generate | |
| custom_prompt: Custom prompt to use instead of default examples | |
| temperature: Temperature for sampling (0.0 = greedy) | |
| top_k: Top-k filtering for sampling (None = disabled) | |
| top_p: Top-p (nucleus) filtering for sampling (None = disabled) | |
| mask_token_id: Token ID for mask token | |
| seed: Random seed for reproducibility | |
| moe_backend: MoE backend to use ('hf' or 'flashinfer') | |
| mode: Generation mode ('task' for Q&A format, 'completion' for continuation) | |
| """ | |
    set_seed(seed)

    from rnd.configuration_rnd import RND1Config
    from rnd.modeling_rnd import RND1LM

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    dtype = torch.bfloat16 if use_bfloat16 else torch.float32
    print(f"Using dtype: {dtype}")

    if moe_backend == "hf":
        print("\n⚠️ Note: HuggingFace backend is slower. Consider using --moe_backend flashinfer or sglang for better performance.\n")

    # Load from checkpoint if provided, otherwise from model_path
    load_path = checkpoint_path if checkpoint_path else model_path
    print(f"Loading model from {load_path}...")

    # Load config and set RND1-specific settings
    cfg = RND1Config.from_pretrained(load_path)
    cfg.model_type = "rnd1"
    cfg.attn_implementation = "sdpa"
    cfg.moe_backend = moe_backend

    # Load model with RND1LM
    model = RND1LM.from_pretrained(
        load_path,
        config=cfg,
        torch_dtype=dtype,
        device_map="auto" if device == "cuda:0" else device,
        trust_remote_code=True,
        use_safetensors=True,
        low_cpu_mem_usage=True,
    )
    print("Model loaded")
    model = model.eval()

    if custom_prompt:
        prompts = [custom_prompt]
    else:
        # Default prompts based on mode
        if mode == "task":
            prompts = ["Write a Python function that finds the longest common subsequence of two strings. Include comments explaining the algorithm."]
        else:
            prompts = ["The key to understanding quantum computing lies in"]
    greedy = (temperature == 1.0)
    generator = torch.Generator(device=device if device != "auto" else "cuda")
    generator.manual_seed(seed)

    for i, user_prompt in enumerate(prompts):
        print(f"\n{'='*60}")
        print(f"Mode: {mode.upper()}")
        print(f"Prompt {i+1}: {user_prompt[:100]}...")
        print(f"{'='*60}\n")

        if mode == "task":
            # Task mode: Add "Question: " prefix if not already present
            if not user_prompt.strip().startswith("Question:"):
                prompt = f"Question: {user_prompt}\n"
            else:
                prompt = user_prompt
        else:
            # Completion mode: Use prompt as-is for continuation
            prompt = user_prompt

        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs.input_ids.to(device if device != "auto" else "cuda")
        attention_mask = inputs.attention_mask.to(device if device != "auto" else "cuda") if 'attention_mask' in inputs else None
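        # Note: attention_mask is prepared here but is not passed to the
        # generate calls below; the demo relies on input_ids alone.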
| print("Generation parameters:") | |
| print(f" Prompt length: {input_ids.shape[1]} tokens") | |
| print(f" Max new tokens: {max_new_tokens}") | |
| print(f" Total sequence: {input_ids.shape[1] + max_new_tokens} tokens") | |
| print(f" Diffusion steps: {num_steps}") | |
| print(f" Temperature: {temperature}") | |
| print(f" Greedy: {greedy}") | |
| if top_k: | |
| print(f" Top-k: {top_k}") | |
| if top_p: | |
| print(f" Top-p: {top_p}") | |
| print() | |
        # Create explicit generation config that takes priority over model defaults
        from rnd.generation_config import RND1GenerationConfig

        gen_config = RND1GenerationConfig(
            max_new_tokens=max_new_tokens,
            num_diffusion_steps=num_steps,
            mask_token_id=mask_token_id,
            temperature=temperature if not greedy else 1.0,
            top_k=top_k,
            top_p=top_p,
            greedy=greedy,
            eos_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 151645,
            pad_token_id=tokenizer.pad_token_id,
            bos_token_id=tokenizer.bos_token_id,
        )
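        # When greedy decoding is active, the temperature passed to the config is
        # pinned to 1.0; top_k and top_p are forwarded unchanged (they may be None).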

        with torch.no_grad():
            if show_visualization and hasattr(model, 'generate_with_visualization'):
                # Use method with visualization support (requires tokenizer)
                output = model.generate_with_visualization(
                    tokenizer=tokenizer,
                    inputs=input_ids,
                    generation_config=gen_config,
                    generator=generator,
                )
            else:
                # Use standard generate method with explicit config
                output = model.generate(
                    inputs=input_ids,
                    generation_config=gen_config,
                    generator=generator,
                )
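
        # The output is the full sequence (prompt + completion); strip the
        # prompt tokens before decoding.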
        generated_tokens = output[0][len(input_ids[0]):]
        generation = tokenizer.decode(
            generated_tokens.tolist(),
            skip_special_tokens=True,
        )

        print("\nGenerated response:")
        print(generation)
        print(f"\n(Generation completed in {num_steps} diffusion steps)")


def main():
    parser = argparse.ArgumentParser(
        description="RND1 diffusion model demo with live visualization",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Model configuration
    model_group = parser.add_argument_group('Model Configuration')
    model_group.add_argument(
        "--model_path",
        type=str,
        default="radicalnumerics/RND1-Base-0910",
        help="Path to model or HuggingFace model ID",
    )
    model_group.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        help="Path to custom checkpoint file or directory",
    )
    model_group.add_argument(
        "--device",
        type=str,
        default="cuda:0",
        help="Device to run on (e.g., cuda:0, cpu)",
    )
    model_group.add_argument(
        "--fp32",
        action="store_true",
        help="Use FP32 precision instead of BF16",
    )

    # Generation configuration
    gen_group = parser.add_argument_group('Generation Settings')
    gen_group.add_argument(
        "--num_steps",
        type=int,
        default=256,
        help="Number of diffusion steps",
    )
    gen_group.add_argument(
        "--max_new_tokens",
        type=int,
        default=256,
        help="Maximum number of tokens to generate",
    )
    gen_group.add_argument(
        "--prompt",
        type=str,
        default=None,
        help="Custom prompt to use for generation",
    )
    gen_group.add_argument(
        "--mode",
        type=str,
        default="task",
        choices=["task", "completion"],
        help="Generation mode: 'task' (Q&A format for instructions) or 'completion' (text continuation)",
    )
    gen_group.add_argument(
        "--mask_token_id",
        type=int,
        default=151669,
        help="Token ID for mask token",
    )

    # Sampling configuration
    sampling_group = parser.add_argument_group('Sampling Parameters')
    sampling_group.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="Temperature for sampling (1.0 = greedy/deterministic)",
    )
    sampling_group.add_argument(
        "--top_k",
        type=int,
        default=None,
        help="Top-k filtering: keep only the k most likely tokens",
    )
    sampling_group.add_argument(
        "--top_p",
        type=float,
        default=None,
        help="Top-p (nucleus) filtering: keep tokens with cumulative probability <= p",
    )

    # Visualization
    viz_group = parser.add_argument_group('Visualization')
    viz_group.add_argument(
        "--no_viz",
        action="store_true",
        help="Disable the live visualization during generation (visualization requires the rich library)",
    )

    # Other settings
    other_group = parser.add_argument_group('Other Settings')
    other_group.add_argument(
        "--seed",
        type=int,
        default=12345,
        help="Random seed for reproducibility",
    )

    moe_backend_group = parser.add_argument_group('MoE Backend')
    moe_backend_group.add_argument(
        "--moe_backend",
        type=str,
        default="hf",
        choices=["hf", "flashinfer", "sglang"],
        help="MoE backend to use for sparse mixture of experts layers",
    )

    args = parser.parse_args()

    if args.temperature < 0:
        parser.error("Temperature must be non-negative")
    if args.top_k is not None and args.top_k <= 0:
        parser.error("Top-k must be positive")
    if args.top_p is not None and (args.top_p <= 0 or args.top_p > 1):
        parser.error("Top-p must be between 0 and 1")

    print("\n" + "=" * 60)
    print("RND1 Diffusion Language Model Demo")
    print("=" * 60)
    print("Configuration:")
    print(f"  Model: {args.model_path}")
    if args.checkpoint:
        print(f"  Checkpoint: {args.checkpoint}")
    print(f"  Device: {args.device}")
    print(f"  Precision: {'FP32' if args.fp32 else 'BF16'}")
    print(f"  Mode: {args.mode.upper()} ({'Q&A format for instructions' if args.mode == 'task' else 'Text continuation'})")
    print(f"  Random seed: {args.seed}")
    print(f"  Diffusion steps: {args.num_steps}")
    print(f"  Max new tokens: {args.max_new_tokens}")
    print("  Algorithm: Entropy-based selection")
    print(f"  Temperature: {args.temperature}")
    if args.top_k:
        print(f"  Top-k: {args.top_k}")
    if args.top_p:
        print(f"  Top-p: {args.top_p}")
    print(f"  MoE Backend: {args.moe_backend}")
    print(f"  Visualization: {'Enabled' if not args.no_viz else 'Disabled'}")
    print("=" * 60 + "\n")

    demo_completion(
        model_path=args.model_path,
        checkpoint_path=args.checkpoint,
        device=args.device,
        use_bfloat16=not args.fp32,
        show_visualization=not args.no_viz,
        num_steps=args.num_steps,
        max_new_tokens=args.max_new_tokens,
        custom_prompt=args.prompt,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        mask_token_id=args.mask_token_id,
        seed=args.seed,
        moe_backend=args.moe_backend,
        mode=args.mode,
    )


if __name__ == "__main__":
    main()