# Copyright Alibaba Inc. All Rights Reserved.

import argparse
import os
import subprocess
from datetime import datetime
from pathlib import Path

import gradio as gr
import librosa
import torch
from PIL import Image
from transformers import Wav2Vec2Model, Wav2Vec2Processor

# Running inside a Hugging Face Space, so the heavy imports are stubbed out.
# from diffsynth import ModelManager, WanVideoPipeline
# from model import FantasyTalkingAudioConditionModel
# from utils import get_audio_features, resize_image_by_longest_edge, save_video

pipe, fantasytalking, wav2vec_processor, wav2vec = None, None, None, None
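
# Hedged sketch (illustrative, not part of the original demo): lazily fill the
# Wav2Vec2 globals above once the checkpoint directory exists. It uses only the
# transformers classes already imported; the diffsynth/model pieces stay stubbed.
def load_wav2vec_if_available(model_dir: str = "./models/wav2vec2-base-960h"):
    global wav2vec_processor, wav2vec
    if wav2vec is None and os.path.isdir(model_dir):
        wav2vec_processor = Wav2Vec2Processor.from_pretrained(model_dir)
        wav2vec = Wav2Vec2Model.from_pretrained(model_dir)
    return wav2vec_processor, wav2vec
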

# Simplified inference entry point for the demo
def generate_video(
    image_path,
    audio_path,
    prompt,
    prompt_cfg_scale,
    audio_cfg_scale,
    audio_weight,
    image_size,
    max_num_frames,
    inference_steps,
    seed,
):
    """
    Simplified video generation function for demo purposes.
    A real deployment needs the full models loaded first.
    """
    # Create the output directory
    output_dir = Path("./output")
    output_dir.mkdir(parents=True, exist_ok=True)

    # The actual inference code goes here. Until the models are deployed,
    # surface a notice instead of returning a video (a bare string would
    # fail the gr.Video output).
    raise gr.Error("The model is still being prepared; please wait for the full deployment.")

def create_args(
    image_path: str,
    audio_path: str,
    prompt: str,
    output_dir: str,
    audio_weight: float,
    prompt_cfg_scale: float,
    audio_cfg_scale: float,
    image_size: int,
    max_num_frames: int,
    inference_steps: int,
    seed: int,
) -> argparse.Namespace:
    """Build the CLI-style argument namespace consumed by the inference code."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--wan_model_dir", type=str, default="./models/Wan2.1-I2V-14B-720P")
    parser.add_argument("--fantasytalking_model_path", type=str, default="./models/fantasytalking_model.ckpt")
    parser.add_argument("--wav2vec_model_dir", type=str, default="./models/wav2vec2-base-960h")
    parser.add_argument("--image_path", type=str, default=image_path)
    parser.add_argument("--audio_path", type=str, default=audio_path)
    parser.add_argument("--prompt", type=str, default=prompt)
    parser.add_argument("--output_dir", type=str, default=output_dir)
    parser.add_argument("--image_size", type=int, default=image_size)
    parser.add_argument("--audio_scale", type=float, default=audio_weight)
    parser.add_argument("--prompt_cfg_scale", type=float, default=prompt_cfg_scale)
    parser.add_argument("--audio_cfg_scale", type=float, default=audio_cfg_scale)
    parser.add_argument("--max_num_frames", type=int, default=max_num_frames)
    parser.add_argument("--num_inference_steps", type=int, default=inference_steps)
    parser.add_argument("--seed", type=int, default=seed)
    parser.add_argument("--fps", type=int, default=24)
    # Number of DiT parameters kept resident on the GPU; optimized for 16GB GPUs.
    parser.add_argument("--num_persistent_param_in_dit", type=int, default=3_000_000_000)
    return parser.parse_args([])
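
# Hedged sketch: how the pieces above could combine to produce audio features
# for the full model (the stubbed `get_audio_features` presumably does
# something similar; this version is illustrative only).
def extract_audio_features(audio_path: str) -> torch.Tensor:
    processor, model = load_wav2vec_if_available()
    if model is None:
        raise RuntimeError("Download facebook/wav2vec2-base-960h to ./models first.")
    audio, sr = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        # (1, seq_len, 768) hidden states from the base wav2vec2 encoder
        return model(inputs.input_values).last_hidden_state
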

# Build the Gradio UI
with gr.Blocks(title="FantasyTalking Video Generation") as demo:
    gr.Markdown(
        """
        # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis

        <div align="center">
            <strong>Mengchao Wang<sup>1*</sup> Qiang Wang<sup>1*</sup> Fan Jiang<sup>1†</sup>
            Yaqi Fan<sup>2</sup> Yunpeng Zhang<sup>1,2</sup> YongGang Qi<sup>2‡</sup>
            Kun Zhao<sup>1</sup> Mu Xu<sup>1</sup></strong>
        </div>
        <div align="center">
            <strong><sup>1</sup>AMAP, Alibaba Group&emsp;<sup>2</sup>Beijing University of Posts and Telecommunications</strong>
        </div>
        <div style="display:flex;justify-content:center;column-gap:4px;">
            <a href="https://github.com/Fantasy-AMAP/fantasy-talking">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a>
            <a href="https://arxiv.org/abs/2504.04842">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
        </div>

        ## Note
        This demo build is still being prepared; full functionality requires downloading large model files (~40GB+).
        See the [GitHub repository](https://github.com/Fantasy-AMAP/fantasy-talking) for complete installation and usage instructions.
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Input Image", type="filepath")
            audio_input = gr.Audio(label="Input Audio", type="filepath")
            prompt_input = gr.Text(label="Prompt", value="A woman is talking.")
            with gr.Row():
                prompt_cfg_scale = gr.Slider(
                    minimum=1.0,
                    maximum=9.0,
                    value=5.0,
                    step=0.5,
                    label="Prompt CFG Scale",
                )
                audio_cfg_scale = gr.Slider(
                    minimum=1.0,
                    maximum=9.0,
                    value=5.0,
                    step=0.5,
                    label="Audio CFG Scale",
                )
                audio_weight = gr.Slider(
                    minimum=0.1,
                    maximum=3.0,
                    value=1.0,
                    step=0.1,
                    label="Audio Weight",
                )
            with gr.Row():
                image_size = gr.Number(
                    value=512, label="Max Width/Height (longest edge)", precision=0
                )
                max_num_frames = gr.Number(
                    value=81, label="Max Frames", precision=0
                )
                inference_steps = gr.Slider(
                    minimum=1, maximum=50, value=20, step=1, label="Inference Steps"
                )
            with gr.Row():
                seed = gr.Number(value=1247, label="Random Seed", precision=0)
            process_btn = gr.Button("Generate Video")
        with gr.Column():
            video_output = gr.Video(label="Output Video")
    gr.Markdown(
        """
        ## Usage
        1. **Upload an image**: choose a portrait photo
        2. **Upload audio**: choose the matching audio file
        3. **Set parameters**: adjust the generation settings as needed
        4. **Generate**: click the button to start

        ## Model Requirements
        - **Base model**: Wan2.1-I2V-14B-720P (~20GB)
        - **Audio encoder**: Wav2Vec2 (~1GB)
        - **FantasyTalking model**: dedicated weight file (~2GB)
        - **VRAM**: at least 5GB (in low-memory mode)
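
        As an illustration, the low-memory mode corresponds to the
        `num_persistent_param_in_dit` knob defined in this demo's `create_args`;
        lowering it is assumed to keep fewer DiT weights resident on the GPU,
        trading speed for VRAM:

        ```python
        from app import create_args  # this demo file (app.py)

        args = create_args(
            "./assets/images/woman.png", "./assets/audios/woman.wav",
            "A woman is talking.", "./output",
            audio_weight=1.0, prompt_cfg_scale=5.0, audio_cfg_scale=5.0,
            image_size=512, max_num_frames=81, inference_steps=20, seed=1247,
        )
        args.num_persistent_param_in_dit = 0  # assumed low-memory setting (~5GB VRAM)
        ```
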
        ## Local Deployment
        ```bash
        # 1. Clone the repository
        git clone https://github.com/Fantasy-AMAP/fantasy-talking.git
        cd fantasy-talking

        # 2. Install dependencies
        pip install -r requirements.txt
        pip install flash_attn  # optional, speeds up attention computation

        # 3. Download the models
        huggingface-cli download Wan-AI/Wan2.1-I2V-14B-720P --local-dir ./models/Wan2.1-I2V-14B-720P
        huggingface-cli download facebook/wav2vec2-base-960h --local-dir ./models/wav2vec2-base-960h
        huggingface-cli download acvlab/FantasyTalking fantasytalking_model.ckpt --local-dir ./models

        # 4. Run inference
        python infer.py --image_path ./assets/images/woman.png --audio_path ./assets/audios/woman.wav

        # 5. Launch the web UI
        python app.py
        ```
        """
    )

    process_btn.click(
        fn=generate_video,
        inputs=[
            image_input,
            audio_input,
            prompt_input,
            prompt_cfg_scale,
            audio_cfg_scale,
            audio_weight,
            image_size,
            max_num_frames,
            inference_steps,
            seed,
        ],
        outputs=video_output,
    )

if __name__ == "__main__":
    demo.launch(inbrowser=True, share=True)