# Copyright Alibaba Inc. All Rights Reserved.

import argparse
import os
import subprocess
from datetime import datetime
from pathlib import Path

import gradio as gr
import librosa
import torch
from PIL import Image
from transformers import Wav2Vec2Model, Wav2Vec2Processor

# Running inside a Hugging Face Space, so the heavy imports are stubbed out.
# from diffsynth import ModelManager, WanVideoPipeline
# from model import FantasyTalkingAudioConditionModel
# from utils import get_audio_features, resize_image_by_longest_edge, save_video

pipe, fantasytalking, wav2vec_processor, wav2vec = None, None, None, None
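
# Hedged sketch (illustrative, not part of the original demo): lazily fill the
# Wav2Vec2 globals above once the checkpoint directory exists. It uses only the
# transformers classes already imported; the diffsynth/model pieces stay stubbed.
def load_wav2vec_if_available(model_dir: str = "./models/wav2vec2-base-960h"):
    global wav2vec_processor, wav2vec
    if wav2vec is None and os.path.isdir(model_dir):
        wav2vec_processor = Wav2Vec2Processor.from_pretrained(model_dir)
        wav2vec = Wav2Vec2Model.from_pretrained(model_dir)
    return wav2vec_processor, wav2vec
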

# Simplified inference entry point for the demo
def generate_video(
    image_path,
    audio_path,
    prompt,
    prompt_cfg_scale,
    audio_cfg_scale,
    audio_weight,
    image_size,
    max_num_frames,
    inference_steps,
    seed,
):
    """
    Simplified video generation function for demo purposes.
    A real deployment needs the full models loaded first.
    """
    # Create the output directory
    output_dir = Path("./output")
    output_dir.mkdir(parents=True, exist_ok=True)

    # The actual inference code goes here. Until the models are deployed,
    # surface a notice instead of returning a video (a bare string would
    # fail the gr.Video output).
    raise gr.Error("The model is still being prepared; please wait for the full deployment.")

def create_args(
    image_path: str,
    audio_path: str,
    prompt: str,
    output_dir: str,
    audio_weight: float,
    prompt_cfg_scale: float,
    audio_cfg_scale: float,
    image_size: int,
    max_num_frames: int,
    inference_steps: int,
    seed: int,
) -> argparse.Namespace:
    """Build the CLI-style argument namespace consumed by the inference code."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--wan_model_dir", type=str, default="./models/Wan2.1-I2V-14B-720P")
    parser.add_argument("--fantasytalking_model_path", type=str, default="./models/fantasytalking_model.ckpt")
    parser.add_argument("--wav2vec_model_dir", type=str, default="./models/wav2vec2-base-960h")
    parser.add_argument("--image_path", type=str, default=image_path)
    parser.add_argument("--audio_path", type=str, default=audio_path)
    parser.add_argument("--prompt", type=str, default=prompt)
    parser.add_argument("--output_dir", type=str, default=output_dir)
    parser.add_argument("--image_size", type=int, default=image_size)
    parser.add_argument("--audio_scale", type=float, default=audio_weight)
    parser.add_argument("--prompt_cfg_scale", type=float, default=prompt_cfg_scale)
    parser.add_argument("--audio_cfg_scale", type=float, default=audio_cfg_scale)
    parser.add_argument("--max_num_frames", type=int, default=max_num_frames)
    parser.add_argument("--num_inference_steps", type=int, default=inference_steps)
    parser.add_argument("--seed", type=int, default=seed)
    parser.add_argument("--fps", type=int, default=24)
    # Number of DiT parameters kept resident on the GPU; optimized for 16GB GPUs.
    parser.add_argument("--num_persistent_param_in_dit", type=int, default=3_000_000_000)
    return parser.parse_args([])
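
# Hedged sketch: how the pieces above could combine to produce audio features
# for the full model (the stubbed `get_audio_features` presumably does
# something similar; this version is illustrative only).
def extract_audio_features(audio_path: str) -> torch.Tensor:
    processor, model = load_wav2vec_if_available()
    if model is None:
        raise RuntimeError("Download facebook/wav2vec2-base-960h to ./models first.")
    audio, sr = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        # (1, seq_len, 768) hidden states from the base wav2vec2 encoder
        return model(inputs.input_values).last_hidden_state
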

# Build the Gradio UI
with gr.Blocks(title="FantasyTalking Video Generation") as demo:
    gr.Markdown(
        """
        # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis

        <div align="center">
            <strong>Mengchao Wang<sup>1*</sup> Qiang Wang<sup>1*</sup> Fan Jiang<sup>1†</sup>
            Yaqi Fan<sup>2</sup> Yunpeng Zhang<sup>1,2</sup> YongGang Qi<sup>2‡</sup>
            Kun Zhao<sup>1</sup> Mu Xu<sup>1</sup></strong>
        </div>
        <div align="center">
            <strong><sup>1</sup>AMAP, Alibaba Group&emsp;<sup>2</sup>Beijing University of Posts and Telecommunications</strong>
        </div>
        <div style="display:flex;justify-content:center;column-gap:4px;">
            <a href="https://github.com/Fantasy-AMAP/fantasy-talking">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a>
            <a href="https://arxiv.org/abs/2504.04842">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
        </div>

        ## Note
        This demo build is still being prepared; full functionality requires downloading large model files (~40GB+).
        See the [GitHub repository](https://github.com/Fantasy-AMAP/fantasy-talking) for complete installation and usage instructions.
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Input Image", type="filepath")
            audio_input = gr.Audio(label="Input Audio", type="filepath")
            prompt_input = gr.Text(label="Prompt", value="A woman is talking.")
            with gr.Row():
                prompt_cfg_scale = gr.Slider(
                    minimum=1.0,
                    maximum=9.0,
                    value=5.0,
                    step=0.5,
                    label="Prompt CFG Scale",
                )
                audio_cfg_scale = gr.Slider(
                    minimum=1.0,
                    maximum=9.0,
                    value=5.0,
                    step=0.5,
                    label="Audio CFG Scale",
                )
                audio_weight = gr.Slider(
                    minimum=0.1,
                    maximum=3.0,
                    value=1.0,
                    step=0.1,
                    label="Audio Weight",
                )
            with gr.Row():
                image_size = gr.Number(
                    value=512, label="Max Width/Height (longest edge)", precision=0
                )
                max_num_frames = gr.Number(
                    value=81, label="Max Frames", precision=0
                )
                inference_steps = gr.Slider(
                    minimum=1, maximum=50, value=20, step=1, label="Inference Steps"
                )
            with gr.Row():
                seed = gr.Number(value=1247, label="Random Seed", precision=0)
            process_btn = gr.Button("Generate Video")
        with gr.Column():
            video_output = gr.Video(label="Output Video")
    gr.Markdown(
        """
        ## Usage
        1. **Upload an image**: choose a portrait photo
        2. **Upload audio**: choose the matching audio file
        3. **Set parameters**: adjust the generation settings as needed
        4. **Generate**: click the button to start

        ## Model Requirements
        - **Base model**: Wan2.1-I2V-14B-720P (~20GB)
        - **Audio encoder**: Wav2Vec2 (~1GB)
        - **FantasyTalking model**: dedicated weight file (~2GB)
        - **VRAM**: at least 5GB (in low-memory mode)
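
        As an illustration, the low-memory mode corresponds to the
        `num_persistent_param_in_dit` knob defined in this demo's `create_args`;
        lowering it is assumed to keep fewer DiT weights resident on the GPU,
        trading speed for VRAM:

        ```python
        from app import create_args  # this demo file (app.py)

        args = create_args(
            "./assets/images/woman.png", "./assets/audios/woman.wav",
            "A woman is talking.", "./output",
            audio_weight=1.0, prompt_cfg_scale=5.0, audio_cfg_scale=5.0,
            image_size=512, max_num_frames=81, inference_steps=20, seed=1247,
        )
        args.num_persistent_param_in_dit = 0  # assumed low-memory setting (~5GB VRAM)
        ```
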
        ## Local Deployment
        ```bash
        # 1. Clone the repository
        git clone https://github.com/Fantasy-AMAP/fantasy-talking.git
        cd fantasy-talking

        # 2. Install dependencies
        pip install -r requirements.txt
        pip install flash_attn  # optional, speeds up attention computation

        # 3. Download the models
        huggingface-cli download Wan-AI/Wan2.1-I2V-14B-720P --local-dir ./models/Wan2.1-I2V-14B-720P
        huggingface-cli download facebook/wav2vec2-base-960h --local-dir ./models/wav2vec2-base-960h
        huggingface-cli download acvlab/FantasyTalking fantasytalking_model.ckpt --local-dir ./models

        # 4. Run inference
        python infer.py --image_path ./assets/images/woman.png --audio_path ./assets/audios/woman.wav

        # 5. Launch the web UI
        python app.py
        ```
        """
    )

    process_btn.click(
        fn=generate_video,
        inputs=[
            image_input,
            audio_input,
            prompt_input,
            prompt_cfg_scale,
            audio_cfg_scale,
            audio_weight,
            image_size,
            max_num_frames,
            inference_steps,
            seed,
        ],
        outputs=video_output,
    )

if __name__ == "__main__":
    demo.launch(inbrowser=True, share=True)