Spaces:

jomasego
/

mcp-video-frontend

Running

jomasego

feat: Replace Anthropic with Llama 3 for video analysis

3e48648 6 months ago

10.3 kB

	#!/usr/bin/env python3
	"""
	MCP Video Analysis Client with Llama 3 Integration

	This application serves as an MCP (Model Context Protocol) client that:
	1. Connects to video analysis tools via MCP
	2. Integrates with a Llama 3 model hosted on Modal for intelligent video understanding
	3. Provides a Gradio interface for user interaction
	"""

	import os
	import json
	import logging
	from typing import Dict, Any, Optional
	import gradio as gr
	import httpx

	# Configure logging: set up the root logger at INFO level and create a
	# logger named after this module for use throughout the file.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	class MCPVideoAnalysisClient:
	    """Client that chains a Modal video-analysis backend with a Llama 3 backend.

	    Both backends are HTTP endpoints called with ``httpx``. Their URLs are read
	    from environment variables once, when the client is constructed.
	    """

	    # Timeout (seconds) applied to every backend request made by this client.
	    REQUEST_TIMEOUT_SECONDS = 300.0

	    _DEFAULT_VIDEO_ANALYSIS_ENDPOINT = (
	        "https://jomasego--video-analysis-gradio-pipeline-process-video-analysis.modal.run"
	    )

	    def __init__(self):
	        """Read the backend endpoint URLs from the environment and log them."""
	        # Modal backend for video processing. A blank value falls back to the
	        # default instead of producing an unusable empty URL.
	        self.video_analysis_endpoint = (
	            (os.getenv("MODAL_VIDEO_ANALYSIS_ENDPOINT_URL") or "").strip()
	            or self._DEFAULT_VIDEO_ANALYSIS_ENDPOINT
	        )

	        # Modal backend for Llama 3 insights. There is no default: it must be set
	        # to the deployed Llama 3 app URL,
	        # e.g. "https://jomasego--llama3-inference-service-summarize.modal.run".
	        # A blank or whitespace-only value is treated as "not set".
	        self.llama_endpoint = (os.getenv("MODAL_LLAMA3_ENDPOINT_URL") or "").strip() or None

	        logger.info("Initialized MCP Client.")
	        logger.info("Video Analysis Endpoint: %s", self.video_analysis_endpoint)
	        if self.llama_endpoint:
	            logger.info("Llama 3 Endpoint: %s", self.llama_endpoint)
	        else:
	            logger.warning("MODAL_LLAMA3_ENDPOINT_URL not set. LLM insights will be unavailable.")

	    async def _post_json(self, url: str, payload: Dict[str, Any]) -> Any:
	        """POST *payload* as JSON to *url* and return the decoded JSON response.

	        Raises whatever ``httpx`` raises for transport errors or non-2xx
	        statuses, and whatever ``response.json()`` raises for a non-JSON body.
	        """
	        async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT_SECONDS) as client:
	            response = await client.post(
	                url,
	                json=payload,
	                headers={"Content-Type": "application/json"},
	            )
	            response.raise_for_status()
	            return response.json()

	    async def analyze_video_with_modal(self, video_url: str) -> Dict[str, Any]:
	        """Call the Modal backend for comprehensive video analysis.

	        Args:
	            video_url: URL of the video, sent to the backend as ``video_url``.

	        Returns:
	            The decoded JSON response, or ``{"error": "<message>"}`` when the
	            request fails for any reason (this method never raises).
	        """
	        try:
	            logger.info("Calling video analysis backend: %s", video_url)
	            return await self._post_json(
	                self.video_analysis_endpoint, {"video_url": video_url}
	            )
	        except Exception as e:
	            # Boundary handler: report the failure as data so the UI can show it.
	            logger.exception("Error calling video analysis backend: %s", e)
	            return {"error": f"Video analysis backend error: {str(e)}"}

	    async def get_insights_from_llama3(self, analysis_data: Dict[str, Any], user_query: Optional[str] = None) -> str:
	        """Call the Llama 3 Modal backend for intelligent insights.

	        Args:
	            analysis_data: Analysis payload forwarded as ``analysis_data``.
	            user_query: Optional question forwarded as ``user_query``; a blank
	                string is sent as ``None``.

	        Returns:
	            The ``summary`` field of the response, a fallback message when that
	            field is missing, or an explanatory message when the endpoint is
	            not configured or the request fails (this method never raises).
	        """
	        endpoint = (self.llama_endpoint or "").strip()
	        if not endpoint:
	            return "Llama 3 endpoint is not configured. Cannot generate insights."

	        # Gradio passes "" for an empty textbox; treat that as "no question".
	        if isinstance(user_query, str):
	            user_query = user_query.strip() or None

	        try:
	            payload = {
	                "analysis_data": analysis_data,
	                "user_query": user_query,
	            }
	            logger.info("Calling Llama 3 Modal backend for insights.")
	            result = await self._post_json(endpoint, payload)
	            return result.get("summary", "No summary returned from Llama 3 service.")
	        except Exception as e:
	            logger.exception("Error calling Llama 3 backend: %s", e)
	            return f"Error generating Llama 3 insights: {str(e)}"

	    async def process_video_request(self, video_url: str, user_query: Optional[str] = None) -> tuple[str, str]:
	        """Process a complete video analysis request with Llama 3 enhancement.

	        Args:
	            video_url: URL of the video to analyze.
	            user_query: Optional question about the video.

	        Returns:
	            ``(insights, raw_analysis)`` where ``raw_analysis`` is the analysis
	            result as an indented JSON string. On invalid input or an unexpected
	            error the first item is a message and the second is ``""``.
	        """
	        if not video_url or not video_url.strip():
	            return "Please provide a valid video URL.", ""

	        try:
	            # Step 1: Get video analysis from Modal backend
	            logger.info("Starting video analysis for: %s", video_url)
	            video_analysis = await self.analyze_video_with_modal(video_url.strip())

	            # Step 2: Format the raw analysis for display
	            raw_analysis = json.dumps(video_analysis, indent=2)

	            # A result holding nothing but an "error" entry is the failure shape
	            # produced by analyze_video_with_modal. There is nothing for the LLM
	            # to summarize, so report the failure instead of calling Llama 3.
	            if isinstance(video_analysis, dict) and set(video_analysis) == {"error"}:
	                return str(video_analysis["error"]), raw_analysis

	            # Step 3: Enhance with Llama 3 insights
	            logger.info("Generating Llama 3 insights...")
	            llama_insights = await self.get_insights_from_llama3(video_analysis, user_query)

	            return llama_insights, raw_analysis

	        except Exception as e:
	            error_msg = f"Error processing video request: {str(e)}"
	            logger.exception(error_msg)
	            return error_msg, ""

	# Build the module-level client used by the Gradio callbacks.
	# It stays None when construction fails, which the callbacks check for.
	mcp_client = None
	try:
	    mcp_client = MCPVideoAnalysisClient()
	    logger.info("MCP Video Analysis Client initialized successfully")
	except Exception as e:
	    mcp_client = None
	    logger.error(f"Failed to initialize MCP client: {e}")

	# Gradio Interface Functions
	async def analyze_video_interface(video_url: str, user_query: str = None) -> tuple[str, str]:
	    """Gradio callback: hand the request to the module-level MCP client.

	    Returns the client's ``process_video_request`` result, or an explanatory
	    message paired with an empty string when no client is available.
	    """
	    if mcp_client:
	        return await mcp_client.process_video_request(video_url, user_query)

	    # The client failed to construct at import time.
	    return "MCP Client not initialized. Please check your environment variables.", ""

	def create_gradio_interface():
	    """Create and configure the Gradio interface.

	    Builds a ``gr.Blocks`` app with a "Video Analysis" tab (URL and question
	    inputs, Llama 3 insights, raw analysis JSON, example buttons) and an
	    "About" tab, then wires up the Analyze and Clear buttons.

	    Returns:
	        The configured ``gr.Blocks`` instance (not yet launched).
	    """
	    example_urls = [
	        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
	        "https://www.youtube.com/watch?v=jNQXAC9IVRw",
	        "https://www.youtube.com/watch?v=9bZkp7q19f0",
	    ]

	    with gr.Blocks(
	        title="MCP Video Analysis with Llama 3",
	        theme=gr.themes.Soft(),
	        css="""
	        .gradio-container {
	            max-width: 1200px !important;
	        }
	        .main-header {
	            text-align: center;
	            margin-bottom: 30px;
	        }
	        .analysis-output {
	            max-height: 600px;
	            overflow-y: auto;
	        }
	        """
	    ) as interface:

	        gr.HTML("""
	        <div class="main-header">
	            <h1>🎥 MCP Video Analysis with Llama 3 AI</h1>
	            <p>Intelligent video content analysis powered by a Modal backend and Llama 3</p>
	        </div>
	        """)

	        with gr.Tab("🔍 Video Analysis"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    video_url_input = gr.Textbox(
	                        label="Video URL",
	                        placeholder="Enter YouTube URL or direct video link...",
	                        lines=2
	                    )
	                    user_query_input = gr.Textbox(
	                        label="Specific Question (Optional)",
	                        placeholder="Ask a specific question about the video...",
	                        lines=2
	                    )

	                    with gr.Row():
	                        analyze_btn = gr.Button("🚀 Analyze Video", variant="primary", size="lg")
	                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")

	                with gr.Column(scale=2):
	                    llama_output = gr.Textbox(
	                        label="🤖 Llama 3 AI Insights",
	                        lines=20,
	                        elem_classes=["analysis-output"],
	                        interactive=False
	                    )

	            with gr.Row():
	                raw_analysis_output = gr.JSON(
	                    label="📊 Raw Analysis Data",
	                    elem_classes=["analysis-output"]
	                )

	            # Example videos
	            gr.HTML("<h3>📝 Example Videos to Try:</h3>")
	            with gr.Row():
	                for i, url in enumerate(example_urls, 1):
	                    # url=url binds the current URL to each button's callback.
	                    gr.Button(f"Example {i}", size="sm").click(
	                        lambda url=url: url, outputs=video_url_input
	                    )

	        with gr.Tab("ℹ️ About"):
	            gr.Markdown("""
	            ## About MCP Video Analysis

	            This application combines multiple AI technologies to provide comprehensive video analysis:

	            ### 🔧 Technology Stack
	            - Modal Backend: Scalable cloud compute for video processing and LLM inference
	            - Whisper: Speech-to-text transcription
	            - Computer Vision Models: Object detection, action recognition, and captioning
	            - Meta Llama 3: Advanced AI for intelligent content analysis
	            - MCP Protocol: Model Context Protocol for seamless integration

	            ### 🎯 Features
	            - Transcription: Extract spoken content from videos
	            - Visual Analysis: Identify objects, actions, and scenes
	            - Content Understanding: AI-powered insights and summaries
	            - Custom Queries: Ask specific questions about video content

	            ### 🚀 Usage
	            1. Enter a video URL (YouTube or direct link)
	            2. Optionally ask a specific question
	            3. Click "Analyze Video" to get comprehensive insights
	            4. Review both Llama 3's intelligent analysis and raw data

	            ### 🔒 Privacy & Security
	            - Video processing is handled securely in the cloud
	            - No video data is stored permanently
	            - API keys are handled securely via environment variables
	            """)

	        # Event handlers
	        async def run_analysis(video_url, user_query):
	            """Run the analysis and adapt its result for the output components."""
	            insights, raw_analysis = await analyze_video_interface(video_url, user_query)
	            # gr.JSON parses str values as JSON, and "" (returned on the
	            # early-exit paths) is not valid JSON. Send None so the panel is
	            # cleared and the message in `insights` is still shown.
	            return insights, (raw_analysis or None)

	        def clear_all():
	            """Reset both inputs and both outputs."""
	            # The last value targets the gr.JSON panel, which needs None to
	            # clear; an empty string would fail JSON parsing.
	            return "", "", "", None

	        analyze_btn.click(
	            fn=run_analysis,
	            inputs=[video_url_input, user_query_input],
	            outputs=[llama_output, raw_analysis_output],
	            show_progress=True
	        )

	        clear_btn.click(
	            fn=clear_all,
	            outputs=[video_url_input, user_query_input, llama_output, raw_analysis_output]
	        )

	    return interface

	# Script entry point: build the UI and serve it on all interfaces, port 7860.
	if __name__ == "__main__":
	    launch_settings = dict(
	        server_name="0.0.0.0",
	        server_port=7860,
	        share=False,
	        show_error=True,
	    )
	    interface = create_gradio_interface()
	    interface.launch(**launch_settings)