Joseph Pollack committed · Commit ff310d7 · unverified · 1 Parent(s): 81e328a

adds simplified interface; loads example images using shutil

Files changed (1)
  1. app.py +35 -152
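
The main shift in this commit is away from base64 data URLs for the Gradio examples: load_example_episodes now copies each episode screenshot into a temporary directory with shutil.copy2 and hands gr.ChatInterface plain file paths. A minimal standalone sketch of that staging pattern, assuming a hypothetical helper name and illustrative episode paths that are not part of app.py:

import os
import shutil
import tempfile

def stage_example_images(source_paths):
    """Copy example screenshots into a temp dir and return the new paths.

    Gradio example rows can reference these paths directly, so no base64
    encoding/decoding is needed. The paths used here are illustrative only.
    """
    temp_dir = tempfile.mkdtemp()
    staged = []
    for src in source_paths:
        dst = os.path.join(temp_dir, os.path.basename(src))
        shutil.copy2(src, dst)  # copy2 also preserves file metadata
        staged.append(dst)
    return staged

# Hypothetical usage mirroring the episode layout assumed above:
# examples = [[path, "Open the Settings app"]
#             for path in stage_example_images(["episode_13/screenshot.png"])]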
app.py CHANGED
@@ -3,8 +3,8 @@ import torch
 from PIL import Image
 import json
 import os
-import base64
-import io
+import shutil
+import tempfile
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from typing import List, Dict, Any
 import logging
@@ -143,36 +143,20 @@ class LOperatorDemo:
         try:
             # Handle different image formats
             pil_image = None
-            if isinstance(image, str) and image.startswith('data:image/'):
-                # Handle base64 image
-                pil_image = base64_to_pil(image)
-            elif hasattr(image, 'mode'):  # PIL Image object
+            if hasattr(image, 'mode'):  # PIL Image object
                 pil_image = image
+            elif isinstance(image, str) and os.path.exists(image):
+                # Handle file path (from examples)
+                pil_image = Image.open(image)
             else:
                 return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Invalid image format. Please upload a valid image."}]

             if pil_image is None:
                 return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Failed to process image. Please try again."}]

-            # Extract goal and instruction from message
-            if "Goal:" in message and "Step:" in message:
-                # Parse structured input
-                lines = message.split('\n')
-                goal = ""
-                instruction = ""
-
-                for line in lines:
-                    if line.startswith("Goal:"):
-                        goal = line.replace("Goal:", "").strip()
-                    elif line.startswith("Step:"):
-                        instruction = line.replace("Step:", "").strip()
-
-                if not goal or not instruction:
-                    return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Please provide both Goal and Step in your message."}]
-            else:
-                # Treat as general instruction
-                goal = "Complete the requested action"
-                instruction = message
+            # Use the message as the goal/instruction
+            goal = "Complete the requested action"
+            instruction = message

             # Generate action
             response = self.generate_action(pil_image, goal, instruction)
@@ -196,48 +180,17 @@ def load_model():
         logger.error(f"Error loading model: {str(e)}")
         return f"❌ Error loading model: {str(e)}"

-def pil_to_base64(image):
-    """Convert PIL image to base64 string for Gradio examples"""
-    try:
-        # Convert to RGB if needed
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-
-        # Save to bytes buffer
-        buffer = io.BytesIO()
-        image.save(buffer, format="PNG")
-        buffer.seek(0)
-
-        # Convert to base64
-        img_str = base64.b64encode(buffer.getvalue()).decode()
-        return f"data:image/png;base64,{img_str}"
-    except Exception as e:
-        logger.error(f"Error converting image to base64: {str(e)}")
-        return None

-def base64_to_pil(base64_string):
-    """Convert base64 string to PIL image"""
-    try:
-        # Remove data URL prefix if present
-        if base64_string.startswith('data:image/'):
-            base64_string = base64_string.split(',')[1]
-
-        # Decode base64
-        image_data = base64.b64decode(base64_string)
-
-        # Create PIL image from bytes
-        image = Image.open(io.BytesIO(image_data))
-        return image
-    except Exception as e:
-        logger.error(f"Error converting base64 to PIL image: {str(e)}")
-        return None

 def load_example_episodes():
-    """Load example episodes from the extracted data - properly load images for Gradio"""
+    """Load example episodes using shutil to copy files to temp location"""
     examples = []

     try:
-        # Load episode metadata and images
+        # Create temporary directory for examples
+        temp_dir = tempfile.mkdtemp()
+        logger.info(f"Created temporary directory for examples: {temp_dir}")
+
         episode_dirs = ["episode_13", "episode_53", "episode_73"]

         for episode_dir in episode_dirs:
@@ -252,23 +205,18 @@ def load_example_episodes():
                 with open(metadata_path, "r") as f:
                     metadata = json.load(f)

-                # Load the image using PIL
-                image = Image.open(image_path)
+                # Copy image to temp directory
+                temp_image_path = os.path.join(temp_dir, f"{episode_dir}_screenshot.png")
+                shutil.copy2(image_path, temp_image_path)

-                # Convert to base64 for Gradio examples
-                base64_image = pil_to_base64(image)
+                episode_num = episode_dir.split('_')[1]
+                goal_text = metadata.get('goal', f'Episode {episode_num} example')

-                if base64_image:
-                    episode_num = episode_dir.split('_')[1]
-                    goal_text = metadata.get('goal', f'Episode {episode_num} example')
-
-                    examples.append([
-                        base64_image,  # Use base64 encoded image
-                        f"Episode {episode_num}: {goal_text[:50]}..."
-                    ])
-                    logger.info(f"Successfully loaded example for Episode {episode_num}")
-                else:
-                    logger.warning(f"Failed to convert image to base64 for {episode_dir}")
+                examples.append([
+                    temp_image_path,  # Use temp file path
+                    goal_text  # Just the goal text, no additional formatting
+                ])
+                logger.info(f"Successfully loaded example for Episode {episode_num}")

             except Exception as e:
                 logger.warning(f"Could not load example for {episode_dir}: {str(e)}")
@@ -278,7 +226,7 @@ def load_example_episodes():
         logger.error(f"Error loading examples: {str(e)}")
         examples = []

-    logger.info(f"Loaded {len(examples)} examples with proper image loading")
+    logger.info(f"Loaded {len(examples)} examples using shutil")
     return examples

 # Create Gradio interface
@@ -339,31 +287,8 @@ def create_demo():
                     interactive=False
                 )

-                gr.Markdown("### 📱 Input")
-                image_input = gr.Image(
-                    label="Android Screenshot",
-                    type="pil",
-                    height=400,
-                    sources=["upload"]
-                )
-
-                gr.Markdown("### 📝 Instructions")
-                goal_input = gr.Textbox(
-                    label="Goal",
-                    placeholder="e.g., Open the Settings app and navigate to Display settings",
-                    lines=2
-                )
-
-                step_input = gr.Textbox(
-                    label="Step Instruction",
-                    placeholder="e.g., Tap on the Settings app icon on the home screen",
-                    lines=2
-                )
-
-                generate_btn = gr.Button("🎯 Generate Action", variant="secondary")
-
-            with gr.Column(scale=2):
-                gr.Markdown("### 💬 Chat Interface")
+            with gr.Column(scale=3):
+                gr.Markdown("### 💬 L-Operator Chat Interface")
                 # Load examples with error handling
                 try:
                     examples = load_example_episodes()
@@ -373,50 +298,17 @@ def create_demo():

                 chat_interface = gr.ChatInterface(
                     fn=demo_instance.chat_with_model,
-                    additional_inputs=[image_input],
-                    title="L-Operator Chat",
-                    description="Chat with L-Operator using screenshots and text instructions",
+                    title="L-Operator: Android Device Control",
+                    description="Upload an Android screenshot and describe your goal. The model will generate JSON actions for device control.",
                     examples=examples,
                     type="messages",
-                    cache_examples=False
+                    cache_examples=False,
+                    textbox=gr.Textbox(
+                        label="Goal",
+                        placeholder="e.g., Open the Settings app and navigate to Display settings",
+                        lines=2
+                    )
                 )
-
-                gr.Markdown("### 🎯 Action Output")
-                action_output = gr.JSON(
-                    label="Generated Action",
-                    value={},
-                    height=200
-                )
-
-        # Event handlers
-        def on_generate_action(image, goal, step):
-            if not image:
-                return {"error": "Please upload an image"}
-
-            if not goal or not step:
-                return {"error": "Please provide both goal and step"}
-
-            # Handle different image formats
-            pil_image = None
-            if isinstance(image, str) and image.startswith('data:image/'):
-                # Handle base64 image
-                pil_image = base64_to_pil(image)
-            elif hasattr(image, 'mode'):  # PIL Image object
-                pil_image = image
-            else:
-                return {"error": "Invalid image format. Please upload a valid image."}
-
-            if pil_image is None:
-                return {"error": "Failed to process image. Please try again."}
-
-            response = demo_instance.generate_action(pil_image, goal, step)
-
-            try:
-                # Try to parse as JSON
-                parsed = json.loads(response)
-                return parsed
-            except:
-                return {"raw_response": response}

         # Update model status on page load
         def update_model_status():
@@ -431,21 +323,12 @@ def create_demo():
            else:
                return "❌ Model failed to load. Please check logs."

-        generate_btn.click(
-            fn=on_generate_action,
-            inputs=[image_input, goal_input, step_input],
-            outputs=action_output
-        )
-
        # Load model and update status on page load
        demo.load(
            fn=update_model_status,
            outputs=model_status
        )

-        # Note: The chat interface will automatically handle image updates
-        # No need for manual image change handling
-
        gr.Markdown("""
        ---