Spaces:

Tonic
/

l-operator-demo

Running on Zero

App Files Files Community

Joseph Pollack committed on Aug 29

Commit

ff310d7

unverified ·

1 Parent(s): 81e328a

adds simplified interface, image loading using shutil

Browse files

Files changed (1) hide show

app.py +35 -152

app.py CHANGED Viewed

@@ -3,8 +3,8 @@ import torch
 from PIL import Image
 import json
 import os
-import base64
-import io
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from typing import List, Dict, Any
 import logging
@@ -143,36 +143,20 @@ class LOperatorDemo:
         try:
             # Handle different image formats
             pil_image = None
-            if isinstance(image, str) and image.startswith('data:image/'):
-                # Handle base64 image
-                pil_image = base64_to_pil(image)
-            elif hasattr(image, 'mode'):  # PIL Image object
                 pil_image = image
             else:
                 return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Invalid image format. Please upload a valid image."}]
             if pil_image is None:
                 return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Failed to process image. Please try again."}]
-            # Extract goal and instruction from message
-            if "Goal:" in message and "Step:" in message:
-                # Parse structured input
-                lines = message.split('\n')
-                goal = ""
-                instruction = ""
-                for line in lines:
-                    if line.startswith("Goal:"):
-                        goal = line.replace("Goal:", "").strip()
-                    elif line.startswith("Step:"):
-                        instruction = line.replace("Step:", "").strip()
-                if not goal or not instruction:
-                    return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Please provide both Goal and Step in your message."}]
-            else:
-                # Treat as general instruction
-                goal = "Complete the requested action"
-                instruction = message
             # Generate action
             response = self.generate_action(pil_image, goal, instruction)
@@ -196,48 +180,17 @@ def load_model():
         logger.error(f"Error loading model: {str(e)}")
         return f"❌ Error loading model: {str(e)}"
-def pil_to_base64(image):
-    """Convert PIL image to base64 string for Gradio examples"""
-    try:
-        # Convert to RGB if needed
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-        # Save to bytes buffer
-        buffer = io.BytesIO()
-        image.save(buffer, format="PNG")
-        buffer.seek(0)
-        # Convert to base64
-        img_str = base64.b64encode(buffer.getvalue()).decode()
-        return f"data:image/png;base64,{img_str}"
-    except Exception as e:
-        logger.error(f"Error converting image to base64: {str(e)}")
-        return None
-def base64_to_pil(base64_string):
-    """Convert base64 string to PIL image"""
-    try:
-        # Remove data URL prefix if present
-        if base64_string.startswith('data:image/'):
-            base64_string = base64_string.split(',')[1]
-        # Decode base64
-        image_data = base64.b64decode(base64_string)
-        # Create PIL image from bytes
-        image = Image.open(io.BytesIO(image_data))
-        return image
-    except Exception as e:
-        logger.error(f"Error converting base64 to PIL image: {str(e)}")
-        return None
 def load_example_episodes():
-    """Load example episodes from the extracted data - properly load images for Gradio"""
     examples = []
     try:
-        # Load episode metadata and images
         episode_dirs = ["episode_13", "episode_53", "episode_73"]
         for episode_dir in episode_dirs:
@@ -252,23 +205,18 @@ def load_example_episodes():
                     with open(metadata_path, "r") as f:
                         metadata = json.load(f)
-                    # Load the image using PIL
-                    image = Image.open(image_path)
-                    # Convert to base64 for Gradio examples
-                    base64_image = pil_to_base64(image)
-                    if base64_image:
-                        episode_num = episode_dir.split('_')[1]
-                        goal_text = metadata.get('goal', f'Episode {episode_num} example')
-                        examples.append([
-                            base64_image,  # Use base64 encoded image
-                            f"Episode {episode_num}: {goal_text[:50]}..."
-                        ])
-                        logger.info(f"Successfully loaded example for Episode {episode_num}")
-                    else:
-                        logger.warning(f"Failed to convert image to base64 for {episode_dir}")
             except Exception as e:
                 logger.warning(f"Could not load example for {episode_dir}: {str(e)}")
@@ -278,7 +226,7 @@ def load_example_episodes():
         logger.error(f"Error loading examples: {str(e)}")
         examples = []
-    logger.info(f"Loaded {len(examples)} examples with proper image loading")
     return examples
 # Create Gradio interface
@@ -339,31 +287,8 @@ def create_demo():
                     interactive=False
                 )
-                gr.Markdown("### 📱 Input")
-                image_input = gr.Image(
-                    label="Android Screenshot",
-                    type="pil",
-                    height=400,
-                    sources=["upload"]
-                )
-                gr.Markdown("### 📝 Instructions")
-                goal_input = gr.Textbox(
-                    label="Goal",
-                    placeholder="e.g., Open the Settings app and navigate to Display settings",
-                    lines=2
-                )
-                step_input = gr.Textbox(
-                    label="Step Instruction",
-                    placeholder="e.g., Tap on the Settings app icon on the home screen",
-                    lines=2
-                )
-                generate_btn = gr.Button("🎯 Generate Action", variant="secondary")
-            with gr.Column(scale=2):
-                gr.Markdown("### 💬 Chat Interface")
                 # Load examples with error handling
                 try:
                     examples = load_example_episodes()
@@ -373,50 +298,17 @@ def create_demo():
                 chat_interface = gr.ChatInterface(
                     fn=demo_instance.chat_with_model,
-                    additional_inputs=[image_input],
-                    title="L-Operator Chat",
-                    description="Chat with L-Operator using screenshots and text instructions",
                     examples=examples,
                     type="messages",
-                    cache_examples=False
                 )
-                gr.Markdown("### 🎯 Action Output")
-                action_output = gr.JSON(
-                    label="Generated Action",
-                    value={},
-                    height=200
-                )
-        # Event handlers
-        def on_generate_action(image, goal, step):
-            if not image:
-                return {"error": "Please upload an image"}
-            if not goal or not step:
-                return {"error": "Please provide both goal and step"}
-            # Handle different image formats
-            pil_image = None
-            if isinstance(image, str) and image.startswith('data:image/'):
-                # Handle base64 image
-                pil_image = base64_to_pil(image)
-            elif hasattr(image, 'mode'):  # PIL Image object
-                pil_image = image
-            else:
-                return {"error": "Invalid image format. Please upload a valid image."}
-            if pil_image is None:
-                return {"error": "Failed to process image. Please try again."}
-            response = demo_instance.generate_action(pil_image, goal, step)
-            try:
-                # Try to parse as JSON
-                parsed = json.loads(response)
-                return parsed
-            except:
-                return {"raw_response": response}
         # Update model status on page load
         def update_model_status():
@@ -431,21 +323,12 @@ def create_demo():
             else:
                 return "❌ Model failed to load. Please check logs."
-        generate_btn.click(
-            fn=on_generate_action,
-            inputs=[image_input, goal_input, step_input],
-            outputs=action_output
-        )
         # Load model and update status on page load
         demo.load(
             fn=update_model_status,
             outputs=model_status
         )
-        # Note: The chat interface will automatically handle image updates
-        # No need for manual image change handling
         gr.Markdown("""
         ---

 from PIL import Image
 import json
 import os
+import shutil
+import tempfile
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from typing import List, Dict, Any
 import logging
         try:
             # Handle different image formats
             pil_image = None
+            if hasattr(image, 'mode'):  # PIL Image object
                 pil_image = image
+            elif isinstance(image, str) and os.path.exists(image):
+                # Handle file path (from examples)
+                pil_image = Image.open(image)
             else:
                 return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Invalid image format. Please upload a valid image."}]
             if pil_image is None:
                 return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Failed to process image. Please try again."}]
+            # Use the message as the goal/instruction
+            goal = "Complete the requested action"
+            instruction = message
             # Generate action
             response = self.generate_action(pil_image, goal, instruction)
         logger.error(f"Error loading model: {str(e)}")
         return f"❌ Error loading model: {str(e)}"
 def load_example_episodes():
+    """Load example episodes using shutil to copy files to temp location"""
     examples = []
     try:
+        # Create temporary directory for examples
+        temp_dir = tempfile.mkdtemp()
+        logger.info(f"Created temporary directory for examples: {temp_dir}")
         episode_dirs = ["episode_13", "episode_53", "episode_73"]
         for episode_dir in episode_dirs:
                     with open(metadata_path, "r") as f:
                         metadata = json.load(f)
+                    # Copy image to temp directory
+                    temp_image_path = os.path.join(temp_dir, f"{episode_dir}_screenshot.png")
+                    shutil.copy2(image_path, temp_image_path)
+                    episode_num = episode_dir.split('_')[1]
+                    goal_text = metadata.get('goal', f'Episode {episode_num} example')
+                    examples.append([
+                        temp_image_path,  # Use temp file path
+                        goal_text  # Just the goal text, no additional formatting
+                    ])
+                    logger.info(f"Successfully loaded example for Episode {episode_num}")
             except Exception as e:
                 logger.warning(f"Could not load example for {episode_dir}: {str(e)}")
         logger.error(f"Error loading examples: {str(e)}")
         examples = []
+    logger.info(f"Loaded {len(examples)} examples using shutil")
     return examples
 # Create Gradio interface
                     interactive=False
                 )
+            with gr.Column(scale=3):
+                gr.Markdown("### 💬 L-Operator Chat Interface")
                 # Load examples with error handling
                 try:
                     examples = load_example_episodes()
                 chat_interface = gr.ChatInterface(
                     fn=demo_instance.chat_with_model,
+                    title="L-Operator: Android Device Control",
+                    description="Upload an Android screenshot and describe your goal. The model will generate JSON actions for device control.",
                     examples=examples,
                     type="messages",
+                    cache_examples=False,
+                    textbox=gr.Textbox(
+                        label="Goal",
+                        placeholder="e.g., Open the Settings app and navigate to Display settings",
+                        lines=2
+                    )
                 )
         # Update model status on page load
         def update_model_status():
             else:
                 return "❌ Model failed to load. Please check logs."
         # Load model and update status on page load
         demo.load(
             fn=update_model_status,
             outputs=model_status
         )
         gr.Markdown("""
         ---