Spaces:

Tonic
/

l-operator-demo

Running on Zero

App Files Files Community

Joseph Pollack committed on Aug 31

Commit

6b4a0c8

unverified ·

1 Parent(s): a0c936d

attempts to add an annotated image component with bounding boxes

Browse files

Files changed (1) hide show

app.py +160 -12

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-from PIL import Image
 import json
 import os
 from transformers import AutoProcessor, AutoModelForImageTextToText
@@ -23,6 +23,107 @@ if not HF_TOKEN:
     logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")
     logger.warning("Please set HF_TOKEN in your environment variables or Spaces secrets.")
 class LOperatorDemo:
     def __init__(self):
         self.model = None
@@ -160,16 +261,16 @@ demo_instance = LOperatorDemo()
 def process_input(image, goal, step_instructions):
     """Process the input and generate action"""
     if image is None:
-        return "❌ Please upload an Android screenshot image."
     if not goal.strip():
-        return "❌ Please provide a goal."
     if not step_instructions.strip():
-        return "❌ Please provide step instructions."
     if not demo_instance.is_loaded:
-        return "❌ Model not loaded. Please wait for it to load automatically."
     try:
         # Handle different image formats
@@ -183,10 +284,10 @@ def process_input(image, goal, step_instructions):
             # Handle Gradio file object
             pil_image = Image.open(image.name)
         else:
-            return "❌ Invalid image format. Please upload a valid image."
         if pil_image is None:
-            return "❌ Failed to process image. Please try again."
         # Convert image to RGB if needed
         if pil_image.mode != "RGB":
@@ -194,11 +295,33 @@ def process_input(image, goal, step_instructions):
         # Generate action using goal and step instructions
         response = demo_instance.generate_action(pil_image, goal, step_instructions)
-        return response
     except Exception as e:
         logger.error(f"Error processing input: {str(e)}")
-        return f"❌ Error: {str(e)}"
 def load_example_episodes():
@@ -281,6 +404,12 @@ def create_demo():
         .output-container {
             min-height: 200px;
         }
         """
     ) as demo:
@@ -303,7 +432,7 @@ def create_demo():
         The model generates JSON actions in the following format:
         ```json
         {
-          "action_type": "tap",
           "x": 540,
           "y": 1200,
           "text": "Settings",
@@ -312,6 +441,8 @@ def create_demo():
         }
         ```
         ---
         """)
@@ -342,6 +473,16 @@ def create_demo():
                 process_btn = gr.Button("🚀 Generate Action", variant="primary", size="lg")
             with gr.Column(scale=1):
                 gr.Markdown("### 📊 Generated Action")
                 output_text = gr.Textbox(
                     label="JSON Action Output",
@@ -350,12 +491,16 @@ def create_demo():
                     interactive=False,
                     elem_classes=["output-container"]
                 )
         # Connect the process button
         process_btn.click(
             fn=process_input,
             inputs=[image_input, goal_input, step_instructions_input],
-            outputs=output_text
         )
         # Load examples
@@ -395,6 +540,9 @@ def create_demo():
                                     fn=lambda img, g, s: (img, g, s),
                                     inputs=[example_image, example_goal, example_step_instruction],
                                     outputs=[image_input, goal_input, step_instructions_input]
                                 )
         except Exception as e:
             logger.warning(f"Failed to load examples: {str(e)}")

 import gradio as gr
 import torch
+from PIL import Image, ImageDraw, ImageFont
 import json
 import os
 from transformers import AutoProcessor, AutoModelForImageTextToText
     logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")
     logger.warning("Please set HF_TOKEN in your environment variables or Spaces secrets.")
+def create_annotated_image(image: Image.Image, x: int, y: int, action_type: str = "click") -> Image.Image:
+    """Return a copy of ``image`` with the action point (x, y) highlighted.
+
+    The copy gets a red 120px box centred on the point (clipped to the image
+    bounds), corner ticks, a yellow crosshair with a centre dot, and a text
+    label of the form ``"<ACTION_TYPE>: (x, y)"``.
+
+    Args:
+        image: Source PIL image; it is copied, never modified in place.
+        x: Horizontal pixel coordinate of the action.
+        y: Vertical pixel coordinate of the action.
+        action_type: Action name shown (upper-cased) in the label.
+
+    Returns:
+        The annotated copy, or the original ``image`` unchanged if any
+        drawing step raises (the error is logged).
+    """
+    try:
+        # Work on a copy so the caller's image is left untouched
+        annotated_image = image.copy()
+        draw = ImageDraw.Draw(annotated_image)
+        # Bounding box parameters - deliberately generous so the target is easy to spot
+        box_size = 120
+        box_color = (255, 0, 0)  # Red
+        line_width = 4
+        half_box = box_size // 2
+        # Clip the box to the image bounds
+        left = max(0, x - half_box)
+        top = max(0, y - half_box)
+        right = min(image.width, x + half_box)
+        bottom = min(image.height, y + half_box)
+        draw.rectangle([left, top, right, bottom], outline=box_color, width=line_width)
+        # Corner ticks: from each corner, one horizontal and one vertical
+        # segment pointing towards the inside of the box
+        corner_size = 15
+        for corner_x, step_x in ((left, corner_size), (right, -corner_size)):
+            for corner_y, step_y in ((top, corner_size), (bottom, -corner_size)):
+                draw.line([corner_x, corner_y, corner_x + step_x, corner_y],
+                          fill=box_color, width=line_width)
+                draw.line([corner_x, corner_y, corner_x, corner_y + step_y],
+                          fill=box_color, width=line_width)
+        # Crosshair at the exact point, in yellow for contrast with the red box
+        crosshair_size = 15
+        crosshair_color = (255, 255, 0)
+        draw.line([x - crosshair_size, y, x + crosshair_size, y], fill=crosshair_color, width=3)
+        draw.line([x, y - crosshair_size, x, y + crosshair_size], fill=crosshair_color, width=3)
+        # Small filled circle at the centre of the crosshair
+        circle_radius = 4
+        draw.ellipse([x - circle_radius, y - circle_radius, x + circle_radius, y + circle_radius],
+                    fill=crosshair_color, outline=box_color, width=2)
+        # Text label; a failure here is handled by the outer except like any
+        # other drawing error, so no nested try/except is needed
+        font = ImageFont.load_default()
+        label_text = f"{action_type.upper()}: ({x}, {y})"
+        text_bbox = draw.textbbox((0, 0), label_text, font=font)
+        text_width = text_bbox[2] - text_bbox[0]
+        text_height = text_bbox[3] - text_bbox[1]
+        # Keep the label inside the left/right edges of the image
+        text_x = max(5, min(left, image.width - text_width - 5))
+        # Prefer the space above the box. The fallback test must run on the
+        # unclamped value, otherwise it can never trigger.
+        text_y = top - text_height - 10
+        if text_y < 5:
+            # Not enough room above: put the label below the box instead
+            text_y = max(5, min(image.height - text_height - 5, bottom + 10))
+        # Dark background behind the text for contrast
+        draw.rectangle([text_x - 4, text_y - 4, text_x + text_width + 4, text_y + text_height + 4],
+                      fill=(0, 0, 0, 180))
+        draw.text((text_x, text_y), label_text, fill=(255, 255, 255), font=font)
+        return annotated_image
+    except Exception as e:
+        logger.error(f"Error creating annotated image: {str(e)}")
+        return image  # Fall back to the original image if annotation fails
+def parse_action_response(response: str) -> tuple:
+    """Decode a model response and report whether it is a click with coordinates.
+
+    Returns a ``(data, has_coordinates)`` pair. ``data`` is the decoded JSON
+    when the stripped response starts with ``{`` and parses; otherwise it is
+    the response exactly as received. ``has_coordinates`` is True only for
+    decoded JSON whose ``action_type`` is ``"click"`` and that has both an
+    ``x`` and a ``y`` key.
+    """
+    try:
+        # Anything that does not look like a JSON object is passed through
+        if not response.strip().startswith('{'):
+            return response, False
+        payload = json.loads(response)
+        is_click_with_point = (
+            payload.get('action_type') == 'click'
+            and 'x' in payload
+            and 'y' in payload
+        )
+        return payload, is_click_with_point
+    except json.JSONDecodeError:
+        # Looked like JSON but was malformed: hand back the raw text
+        return response, False
+    except Exception as err:
+        logger.error(f"Error parsing action response: {str(err)}")
+        return response, False
 class LOperatorDemo:
     def __init__(self):
         self.model = None
 def process_input(image, goal, step_instructions):
     """Process the input and generate action"""
     if image is None:
+        return "❌ Please upload an Android screenshot image.", None
     if not goal.strip():
+        return "❌ Please provide a goal.", None
     if not step_instructions.strip():
+        return "❌ Please provide step instructions.", None
     if not demo_instance.is_loaded:
+        return "❌ Model not loaded. Please wait for it to load automatically.", None
     try:
         # Handle different image formats
             # Handle Gradio file object
             pil_image = Image.open(image.name)
         else:
+            return "❌ Invalid image format. Please upload a valid image.", None
         if pil_image is None:
+            return "❌ Failed to process image. Please try again.", None
         # Convert image to RGB if needed
         if pil_image.mode != "RGB":
         # Generate action using goal and step instructions
         response = demo_instance.generate_action(pil_image, goal, step_instructions)
+        # Parse the response to check for coordinates
+        action_data, has_coordinates = parse_action_response(response)
+        # If coordinates are found, create annotated image
+        annotated_image = None
+        if has_coordinates and isinstance(action_data, dict):
+            x = action_data.get('x')
+            y = action_data.get('y')
+            action_type = action_data.get('action_type', 'click')
+            if x is not None and y is not None:
+                annotated_image = create_annotated_image(pil_image, x, y, action_type)
+                logger.info(f"Created annotated image for coordinates ({x}, {y})")
+        return response, annotated_image
     except Exception as e:
         logger.error(f"Error processing input: {str(e)}")
+        return f"❌ Error: {str(e)}", None
+def update_annotated_image_visibility(response, annotated_image):
+    """Build the update that shows the annotated image only when one exists.
+
+    ``response`` is accepted but not used. When ``annotated_image`` is None
+    the returned update hides the component and clears its value; otherwise
+    it makes the component visible with that image as its value.
+    """
+    has_image = annotated_image is not None
+    # When there is no image, annotated_image is already None, so passing it
+    # through as the value clears the component
+    return gr.update(visible=has_image, value=annotated_image)
 def load_example_episodes():
         .output-container {
             min-height: 200px;
         }
+        .annotated-image-container {
+            border: 2px solid #e0e0e0;
+            border-radius: 8px;
+            padding: 10px;
+            margin-top: 10px;
+        }
         """
     ) as demo:
         The model generates JSON actions in the following format:
         ```json
         {
+          "action_type": "click",
           "x": 540,
           "y": 1200,
           "text": "Settings",
         }
         ```
+        **🎯 Visual Feedback**: When the model returns coordinates (x, y), an annotated screenshot will be displayed showing the exact click location with a red bounding box and crosshair.
         ---
         """)
                 process_btn = gr.Button("🚀 Generate Action", variant="primary", size="lg")
             with gr.Column(scale=1):
+                gr.Markdown("### 🎯 Annotated Screenshot")
+                annotated_image_output = gr.Image(
+                    label="Click Location Highlighted",
+                    height=400,
+                    visible=False,
+                    interactive=False,
+                    elem_classes=["annotated-image-container"]
+                )
                 gr.Markdown("### 📊 Generated Action")
                 output_text = gr.Textbox(
                     label="JSON Action Output",
                     interactive=False,
                     elem_classes=["output-container"]
                 )
         # Connect the process button
         process_btn.click(
             fn=process_input,
             inputs=[image_input, goal_input, step_instructions_input],
+            outputs=[output_text, annotated_image_output]
+        ).then(
+            fn=update_annotated_image_visibility,
+            inputs=[output_text, annotated_image_output],
+            outputs=annotated_image_output
         )
         # Load examples
                                     fn=lambda img, g, s: (img, g, s),
                                     inputs=[example_image, example_goal, example_step_instruction],
                                     outputs=[image_input, goal_input, step_instructions_input]
+                                ).then(
+                                    fn=lambda: (None, gr.update(visible=False)),
+                                    outputs=[output_text, annotated_image_output]
                                 )
         except Exception as e:
             logger.warning(f"Failed to load examples: {str(e)}")