Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| MCP Video Analysis Client with Anthropic Integration | |
| This application serves as an MCP (Model Context Protocol) client that: | |
| 1. Connects to video analysis tools via MCP | |
| 2. Integrates with Anthropic's Claude for intelligent video understanding | |
| 3. Provides a Gradio interface for user interaction | |
| """ | |
| import os | |
| import json | |
| import asyncio | |
| import logging | |
| from typing import Dict, Any, List, Optional | |
| import gradio as gr | |
| import httpx | |
| from anthropic import Anthropic | |
# Module-wide logging: a named logger for this module, with the root logger
# configured to emit INFO-and-above records.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class MCPVideoAnalysisClient:
    """Client that pairs a Modal video-analysis backend with Anthropic's Claude.

    The client POSTs a video URL to the Modal endpoint, renders whichever
    analysis sections come back (``transcription``, ``caption``, ``actions``,
    ``objects``, ``error``) into a text summary, and asks Claude to interpret
    that summary.
    """

    # Used when MODAL_VIDEO_ANALYSIS_ENDPOINT_URL is unset or empty.
    DEFAULT_MODAL_ENDPOINT = (
        "https://jomasego--video-analysis-gradio-pipeline-process-video-analysis.modal.run"
    )
    MODAL_TIMEOUT_SECONDS = 300.0

    CLAUDE_MODEL = "claude-3-5-sonnet-20241022"
    CLAUDE_MAX_TOKENS = 2000
    CLAUDE_TEMPERATURE = 0.3

    NO_ANALYSIS_MESSAGE = "No analysis data available."
    # The raw-analysis value is a JSON document string; on error paths an empty
    # object is returned instead of "", which is not parseable as JSON.
    EMPTY_RAW_ANALYSIS = "{}"

    SYSTEM_PROMPT = (
        "You are an expert video analyst with deep knowledge of multimedia content, "
        "storytelling, and visual communication. You excel at interpreting video "
        "analysis data and providing meaningful insights.\n"
        "Your task is to analyze the provided video analysis data and give "
        "intelligent, actionable insights. Focus on:\n"
        "1. Content understanding and themes\n"
        "2. Visual storytelling elements\n"
        "3. Technical quality assessment\n"
        "4. Audience engagement potential\n"
        "5. Key moments and highlights\n"
        "6. Contextual relevance\n"
        "Be concise but thorough, and tailor your response to be useful for "
        "content creators, marketers, or researchers."
    )

    # (analysis key, section title) pairs, rendered in this order.
    _TEXT_SECTIONS = (
        ("transcription", "TRANSCRIPTION"),
        ("caption", "VIDEO CAPTION"),
    )
    _LIST_SECTIONS = (
        ("actions", "ACTION RECOGNITION"),
        ("objects", "OBJECT DETECTION"),
    )

    def __init__(self):
        """Read configuration from the environment and build the Anthropic client.

        Raises:
            ValueError: if ``ANTHROPIC_API_KEY`` is unset or empty.
        """
        self.anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
        if not self.anthropic_api_key:
            raise ValueError("ANTHROPIC_API_KEY environment variable is required")
        self.anthropic_client = Anthropic(api_key=self.anthropic_api_key)

        # `or` (rather than a getenv default) so an empty variable also falls
        # back to the default endpoint instead of yielding an unusable URL.
        self.modal_endpoint = (
            os.getenv("MODAL_VIDEO_ANALYSIS_ENDPOINT_URL") or self.DEFAULT_MODAL_ENDPOINT
        )
        logger.info(
            "Initialized MCP Video Analysis Client with Modal endpoint: %s",
            self.modal_endpoint,
        )

    @staticmethod
    def _describe_error(exc: BaseException) -> str:
        """Return ``str(exc)``, or the exception class name when that is empty."""
        return str(exc) or type(exc).__name__

    async def analyze_video_with_modal(self, video_url: str) -> Dict[str, Any]:
        """POST ``video_url`` to the Modal endpoint and return its JSON reply.

        This method does not raise: any failure (transport error, non-2xx
        status, undecodable body) is logged and reported as
        ``{"error": "Modal backend error: ..."}``.
        """
        try:
            async with httpx.AsyncClient(timeout=self.MODAL_TIMEOUT_SECONDS) as client:
                logger.info("Calling Modal backend for video analysis: %s", video_url)
                response = await client.post(
                    self.modal_endpoint,
                    json={"video_url": video_url},
                    headers={"Content-Type": "application/json"},
                )
                response.raise_for_status()
                return response.json()
        except Exception as e:  # boundary: callers expect an error dict, not a raise
            detail = self._describe_error(e)
            logger.error("Error calling Modal backend: %s", detail)
            return {"error": f"Modal backend error: {detail}"}

    @staticmethod
    def _build_user_prompt(analysis_summary: str, user_query: Optional[str] = None) -> str:
        """Build the user message for Claude from the summary and optional question.

        A missing, empty, or whitespace-only ``user_query`` selects the generic
        "comprehensive analysis" prompt.
        """
        question = user_query.strip() if isinstance(user_query, str) else ""
        intro = f"Here is the video analysis data:\n{analysis_summary}\n"
        if question:
            return (
                f"{intro}User's specific question: {question}\n"
                "Please provide a comprehensive analysis addressing the user's question "
                "while incorporating insights from all the available data."
            )
        return (
            f"{intro}Please provide a comprehensive analysis of this video, "
            "highlighting the most important insights and potential applications."
        )

    def enhance_analysis_with_claude(
        self, video_analysis: Dict[str, Any], user_query: Optional[str] = None
    ) -> str:
        """Ask Claude for insights about ``video_analysis``.

        This is a blocking call. Returns the text of the first content block of
        Claude's reply, or an ``"Error generating Claude analysis: ..."``
        message if the request fails.
        """
        analysis_summary = self._format_analysis_for_claude(video_analysis)
        user_prompt = self._build_user_prompt(analysis_summary, user_query)
        try:
            response = self.anthropic_client.messages.create(
                model=self.CLAUDE_MODEL,
                max_tokens=self.CLAUDE_MAX_TOKENS,
                temperature=self.CLAUDE_TEMPERATURE,
                system=self.SYSTEM_PROMPT,
                messages=[{"role": "user", "content": user_prompt}],
            )
            return response.content[0].text
        except Exception as e:  # boundary: surface the failure as text for the UI
            detail = self._describe_error(e)
            logger.error("Error calling Anthropic API: %s", detail)
            return f"Error generating Claude analysis: {detail}"

    @staticmethod
    def _format_text_section(title: str, value: Any) -> str:
        """Render a free-text section (transcription or caption).

        Normal text goes on its own line under the title; non-string values
        and strings starting with ``"Error"`` are kept inline after the title.
        """
        if isinstance(value, str) and not value.startswith("Error"):
            return f"**{title}:**\n{value}\n"
        return f"**{title}:** {value}\n"

    @staticmethod
    def _format_list_section(title: str, items: Any) -> str:
        """Render a detection-list section (actions or objects).

        Entries of a non-empty list are joined with ``"; "``; dict entries
        carrying an ``"error"`` key are shown as ``"Error: <message>"``.
        Anything else (empty list, non-list) is printed inline as-is.
        """
        if isinstance(items, list) and items:
            rendered = [
                f"Error: {item['error']}"
                if isinstance(item, dict) and "error" in item
                else str(item)
                for item in items
            ]
            return f"**{title}:**\n{'; '.join(rendered)}\n"
        return f"**{title}:** {items}\n"

    def _format_analysis_for_claude(self, analysis: Dict[str, Any]) -> str:
        """Format the video analysis data as a text summary for Claude.

        Sections appear in a fixed order: transcription, caption, actions,
        objects, then a top-level error. Returns ``"No analysis data
        available."`` when no known key is present or ``analysis`` is not a
        dict (e.g. the backend replied with ``null`` or a bare string).
        """
        if not isinstance(analysis, dict):
            return self.NO_ANALYSIS_MESSAGE

        sections = [
            self._format_text_section(title, analysis[key])
            for key, title in self._TEXT_SECTIONS
            if key in analysis
        ]
        sections.extend(
            self._format_list_section(title, analysis[key])
            for key, title in self._LIST_SECTIONS
            if key in analysis
        )
        if "error" in analysis:
            sections.append(f"**ANALYSIS ERROR:**\n{analysis['error']}\n")

        return "\n".join(sections) if sections else self.NO_ANALYSIS_MESSAGE

    async def process_video_request(
        self, video_url: str, user_query: Optional[str] = None
    ) -> tuple[str, str]:
        """Run the full pipeline: Modal analysis, then Claude insights.

        Returns:
            ``(claude_insights, raw_analysis_json)``. On a blank URL or an
            unexpected failure the first element is a human-readable message
            and the second is ``"{}"`` so it remains a valid JSON document.
        """
        if not video_url or not video_url.strip():
            return "Please provide a valid video URL.", self.EMPTY_RAW_ANALYSIS

        try:
            # Step 1: get the video analysis from the Modal backend.
            logger.info("Starting video analysis for: %s", video_url)
            video_analysis = await self.analyze_video_with_modal(video_url.strip())

            # Step 2: keep the raw analysis as pretty-printed JSON for display.
            raw_analysis = json.dumps(video_analysis, indent=2)

            # Step 3: the Anthropic client call is synchronous; run it in a
            # worker thread so it does not block the event loop.
            logger.info("Generating Claude insights...")
            claude_insights = await asyncio.to_thread(
                self.enhance_analysis_with_claude, video_analysis, user_query
            )
            return claude_insights, raw_analysis
        except Exception as e:  # boundary: the UI expects a message, not a raise
            error_msg = f"Error processing video request: {self._describe_error(e)}"
            logger.error(error_msg)
            return error_msg, self.EMPTY_RAW_ANALYSIS
# Build the shared client once at import time. If construction fails, the
# failure is logged and mcp_client is left as None.
try:
    mcp_client = MCPVideoAnalysisClient()
except Exception as init_error:
    logger.error(f"Failed to initialize MCP client: {init_error}")
    mcp_client = None
else:
    logger.info("MCP Video Analysis Client initialized successfully")
# Gradio Interface Functions
async def analyze_video_interface(video_url: str, user_query: Optional[str] = None) -> tuple[str, str]:
    """Gradio handler: analyze ``video_url`` and return ``(insights, raw_json)``.

    Delegates to the module-level ``mcp_client``. When that client is
    unavailable, returns an explanatory message together with ``"{}"``; the
    second value feeds a JSON component, so it must be a parseable JSON
    document rather than an empty string.
    """
    if mcp_client is None:
        return "MCP Client not initialized. Please check your environment variables.", "{}"
    return await mcp_client.process_video_request(video_url, user_query)
# Static assets for the Gradio UI, kept at module level so the layout code in
# create_gradio_interface() stays readable.
_APP_CSS = """
.gradio-container {
    max-width: 1200px !important;
}
.main-header {
    text-align: center;
    margin-bottom: 30px;
}
.analysis-output {
    max-height: 600px;
    overflow-y: auto;
}
"""

_HEADER_HTML = """
<div class="main-header">
<h1>🎥 MCP Video Analysis with Claude AI</h1>
<p>Intelligent video content analysis powered by Modal backend and Anthropic Claude</p>
</div>
"""

_EXAMPLE_VIDEO_URLS = (
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://www.youtube.com/watch?v=jNQXAC9IVRw",
    "https://www.youtube.com/watch?v=9bZkp7q19f0",
)

_ABOUT_MARKDOWN = """
## About MCP Video Analysis
This application combines multiple AI technologies to provide comprehensive video analysis:
### 🔧 Technology Stack
- **Modal Backend**: Scalable cloud compute for video processing
- **Whisper**: Speech-to-text transcription
- **Computer Vision Models**: Object detection, action recognition, and captioning
- **Anthropic Claude**: Advanced AI for intelligent content analysis
- **MCP Protocol**: Model Context Protocol for seamless integration
### 🎯 Features
- **Transcription**: Extract spoken content from videos
- **Visual Analysis**: Identify objects, actions, and scenes
- **Content Understanding**: AI-powered insights and summaries
- **Custom Queries**: Ask specific questions about video content
### 🚀 Usage
1. Enter a video URL (YouTube or direct link)
2. Optionally ask a specific question
3. Click "Analyze Video" to get comprehensive insights
4. Review both Claude's intelligent analysis and raw data
### 🔒 Privacy & Security
- Video processing is handled securely in the cloud
- No video data is stored permanently
- API keys are handled securely via environment variables
"""


def clear_all():
    """Return reset values for the URL box, question box, insights box and JSON view.

    The JSON component is cleared with ``None`` rather than ``""``: an empty
    string is not a valid JSON document.
    """
    return "", "", "", None


def create_gradio_interface():
    """Create and configure the Gradio interface.

    Builds a two-tab ``gr.Blocks`` app: a "Video Analysis" tab (URL and
    question inputs, analyze/clear buttons, Claude insights, raw JSON, and
    example-URL buttons) and an "About" tab. Returns the un-launched Blocks
    object.
    """
    with gr.Blocks(
        title="MCP Video Analysis with Claude",
        theme=gr.themes.Soft(),
        css=_APP_CSS,
    ) as interface:
        gr.HTML(_HEADER_HTML)

        with gr.Tab("🔍 Video Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    video_url_input = gr.Textbox(
                        label="Video URL",
                        placeholder="Enter YouTube URL or direct video link...",
                        lines=2,
                    )
                    user_query_input = gr.Textbox(
                        label="Specific Question (Optional)",
                        placeholder="Ask a specific question about the video...",
                        lines=2,
                    )
                    with gr.Row():
                        analyze_btn = gr.Button("🚀 Analyze Video", variant="primary", size="lg")
                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                with gr.Column(scale=2):
                    claude_output = gr.Textbox(
                        label="🤖 Claude AI Insights",
                        lines=20,
                        elem_classes=["analysis-output"],
                        interactive=False,
                    )
            with gr.Row():
                raw_analysis_output = gr.JSON(
                    label="📊 Raw Analysis Data",
                    elem_classes=["analysis-output"],
                )

            # Example videos: each button copies its URL into the URL box.
            gr.HTML("<h3>📋 Example Videos to Try:</h3>")
            with gr.Row():
                for i, url in enumerate(_EXAMPLE_VIDEO_URLS, 1):
                    # `url=url` binds the current URL; a plain closure would
                    # make every button return the last one.
                    gr.Button(f"Example {i}", size="sm").click(
                        lambda url=url: url, outputs=video_url_input
                    )

        with gr.Tab("ℹ️ About"):
            gr.Markdown(_ABOUT_MARKDOWN)

        # Event handlers
        analyze_btn.click(
            fn=analyze_video_interface,
            inputs=[video_url_input, user_query_input],
            outputs=[claude_output, raw_analysis_output],
            show_progress=True,
        )
        clear_btn.click(
            fn=clear_all,
            outputs=[video_url_input, user_query_input, claude_output, raw_analysis_output],
        )

    return interface
# Script entry point: build the UI and serve it on all interfaces, port 7860,
# without a public share link and with errors surfaced in the browser.
if __name__ == "__main__":
    interface = create_gradio_interface()
    launch_settings = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    interface.launch(**launch_settings)