# IMPORTANT: spaces must be imported first to avoid CUDA initialization issues
import spaces

# Standard library imports
import os

# Third-party imports (non-CUDA)
import numpy as np
from PIL import Image
import gradio as gr

# CUDA-related imports (must come after spaces)
import torch
from diffusers import WanPipeline, AutoencoderKLWan
from diffusers.utils import export_to_video

# Model configuration
MODEL_ID = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"

# Global pipeline variable
pipe = None
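
# The pipeline is loaded lazily: the first request pays the one-time load cost,
# and later requests reuse the cached object through the module-level `pipe`.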
def initialize_pipeline():
    """Initialize the Wan2.2 pipeline"""
    global pipe
    if pipe is None:
        print("Loading Wan2.2-TI2V-5B model...")
        # The VAE is kept in float32, while the transformer runs in bfloat16
        vae = AutoencoderKLWan.from_pretrained(
            MODEL_ID,
            subfolder="vae",
            torch_dtype=torch.float32
        )
        pipe = WanPipeline.from_pretrained(
            MODEL_ID,
            vae=vae,
            torch_dtype=dtype
        )
        pipe.to(device)
        print("Model loaded successfully!")
    return pipe
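
# Note: if the Space ever runs into GPU memory limits, diffusers' built-in
# offloading is an alternative to pipe.to(device), at some speed cost:
#     pipe.enable_model_cpu_offload()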
# Allocate GPU for 3 minutes (max allowed for Pro)
@spaces.GPU(duration=180)
def generate_video(
    prompt: str,
    image: Image.Image | None = None,
    width: int = 1280,
    height: int = 704,
    num_frames: int = 73,
    num_inference_steps: int = 35,
    guidance_scale: float = 5.0,
    seed: int = -1
):
| """ | |
| Generate video from text prompt and optional image | |
| Args: | |
| prompt: Text description of the video to generate | |
| image: Optional input image for image-to-video generation | |
| width: Video width (default: 1280) | |
| height: Video height (default: 704) | |
| num_frames: Number of frames to generate (default: 73 for 3 seconds at 24fps) | |
| num_inference_steps: Number of denoising steps (default: 35 for faster generation) | |
| guidance_scale: Guidance scale for generation (default: 5.0) | |
| seed: Random seed for reproducibility (-1 for random) | |
| """ | |
    try:
        # Initialize pipeline
        pipeline = initialize_pipeline()

        # Set seed for reproducibility
        if seed == -1:
            seed = torch.randint(0, 2**32 - 1, (1,)).item()
        generator = torch.Generator(device=device).manual_seed(seed)

        # Prepare generation parameters
        gen_params = {
            "prompt": prompt,
            "height": height,
            "width": width,
            "num_frames": num_frames,
            "guidance_scale": guidance_scale,
            "num_inference_steps": num_inference_steps,
            "generator": generator,
        }

        # Add image if provided (for image-to-video)
        if image is not None:
            gen_params["image"] = image

        # Generate video
        print(f"Generating video with prompt: {prompt}")
        print(f"Parameters: {width}x{height}, {num_frames} frames, seed: {seed}")
        output = pipeline(**gen_params).frames[0]

        # Export to video file
        output_path = "output.mp4"
        export_to_video(output, output_path, fps=24)
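        # Note: a fixed filename can collide if requests ever run concurrently;
        # a unique temp file is a safer sketch:
        #     import tempfile
        #     output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name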
        return output_path, f"Video generated successfully! Seed used: {seed}"

    except Exception as e:
        error_msg = f"Error generating video: {str(e)}"
        print(error_msg)
        return None, error_msg

# Create Gradio interface
with gr.Blocks(title="Wan2.2 Video Generation") as demo:
    gr.Markdown(
        """
        # Wan2.2 Video Generation

        Generate high-quality videos from text prompts or images using the Wan2.2-TI2V-5B model.
        This model supports both **Text-to-Video** and **Image-to-Video** generation at 720P/24fps.

        **Note:** Generation takes 2-3 minutes. Settings are optimized for the Zero GPU 3-minute limit.
        """
    )

    with gr.Row():
        with gr.Column():
            # Input controls
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Describe the video you want to generate...",
                lines=3,
                value="Two anthropomorphic cats in comfy boxing gear fight on stage"
            )
            image_input = gr.Image(
                label="Input Image (Optional - for Image-to-Video)",
                type="pil",
                sources=["upload"]
            )

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    width_input = gr.Slider(
                        label="Width",
                        minimum=512,
                        maximum=1920,
                        step=64,
                        value=1280
                    )
                    height_input = gr.Slider(
                        label="Height",
                        minimum=512,
                        maximum=1080,
                        step=64,
                        value=704
                    )
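                # Note: Wan-family pipelines generally expect num_frames of the
                # form 4k + 1 (temporal VAE stride of 4); starting at 25 with a
                # step of 24 keeps every slider value (25, 49, 73, ...) valid
                # and roughly aligned to whole seconds at 24fps.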
                num_frames_input = gr.Slider(
                    label="Number of Frames (more frames = longer video)",
                    minimum=25,
                    maximum=145,
                    step=24,
                    value=73,
                    info="73 frames ≈ 3 seconds at 24fps (optimized for Zero GPU limits)"
                )
                num_steps_input = gr.Slider(
                    label="Inference Steps (more steps = better quality, slower)",
                    minimum=20,
                    maximum=60,
                    step=5,
                    value=35
                )
                guidance_scale_input = gr.Slider(
                    label="Guidance Scale (higher = closer to prompt)",
                    minimum=1.0,
                    maximum=15.0,
                    step=0.5,
                    value=5.0
                )
                seed_input = gr.Number(
                    label="Seed (-1 for random)",
                    value=-1,
                    precision=0
                )

            generate_btn = gr.Button("Generate Video", variant="primary", size="lg")

        with gr.Column():
            # Output
            video_output = gr.Video(
                label="Generated Video",
                autoplay=True
            )
            status_output = gr.Textbox(
                label="Status",
                lines=2
            )

    # Examples
    gr.Examples(
        examples=[
            ["Two anthropomorphic cats in comfy boxing gear fight on stage",
             None, 1280, 704, 73, 35, 5.0, 42],
            ["A serene underwater scene with colorful coral reefs and tropical fish swimming gracefully",
             None, 1280, 704, 73, 35, 5.0, 123],
            ["A bustling futuristic city at night with neon lights and flying cars",
             None, 1280, 704, 73, 35, 5.0, 456],
            ["A peaceful mountain landscape with snow-capped peaks and a flowing river",
             None, 1280, 704, 73, 35, 5.0, 789],
        ],
        inputs=[
            prompt_input,
            image_input,
            width_input,
            height_input,
            num_frames_input,
            num_steps_input,
            guidance_scale_input,
            seed_input
        ],
        outputs=[video_output, status_output],
        fn=generate_video,
        cache_examples=False,
    )
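    # cache_examples is left False above: caching would run every example at
    # build time, and each generation takes 2-3 minutes of GPU time.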

    # Connect generate button
    generate_btn.click(
        fn=generate_video,
        inputs=[
            prompt_input,
            image_input,
            width_input,
            height_input,
            num_frames_input,
            num_steps_input,
            guidance_scale_input,
            seed_input
        ],
        outputs=[video_output, status_output]
    )
    gr.Markdown(
        """
        ## Tips for Best Results:
        - Use detailed, descriptive prompts
        - For image-to-video: Upload a clear image that matches your prompt
        - Higher inference steps = better quality but slower generation
        - Adjust guidance scale to balance creativity vs. prompt adherence
        - Use the same seed to reproduce results
        - Keep generation under 3 minutes to fit Zero GPU limits

        ## Model Information:
        - Model: Wan2.2-TI2V-5B (5B parameters)
        - Resolution: 720P (1280x704 or custom)
        - Frame Rate: 24 fps
        - Default Duration: 3 seconds (optimized for Zero GPU)
        - Generation Time: ~2-3 minutes on Zero GPU (with optimized settings)
        """
    )

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()
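
# To run locally (assumes a CUDA GPU with enough VRAM for the 5B model):
#     python app.py
# On Hugging Face Spaces, the @spaces.GPU decorator handles GPU allocation.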