Sa2VA-simple-demo

Runtime error

App Files Community

aiqcamp committed on Jan 11

Commit

de04166

verified ·

1 Parent(s): 408e58c

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -58

app.py CHANGED Viewed

@@ -59,46 +59,73 @@ class WebcamProcessor:
         self.last_process_time = 0
     def start(self):
-        self.is_running = True
-        self.capture = cv2.VideoCapture(0)
-        self.capture_thread = threading.Thread(target=self._capture_loop)
-        self.process_thread = threading.Thread(target=self._process_loop)
-        self.capture_thread.start()
-        self.process_thread.start()
     def stop(self):
-        self.is_running = False
-        if hasattr(self, 'capture_thread'):
-            self.capture_thread.join()
-            self.process_thread.join()
-            self.capture.release()
     def _capture_loop(self):
         while self.is_running:
-            ret, frame = self.capture.read()
-            if ret:
-                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                frame = cv2.resize(frame, (640, 480))
-                current_time = time.time()
-                if current_time - self.last_process_time >= self.frame_interval:
-                    self.frame_buffer.append(frame)
-                    self.last_process_time = current_time
     def _process_loop(self):
         while self.is_running:
-            if len(self.frame_buffer) >= self.buffer_size:
-                frames = list(self.frame_buffer)
-                try:
                     result = self.model.predict_forward(
                         video=frames,
                         text="<image>Describe what you see",
                         tokenizer=self.tokenizer
                     )
                     self.result_queue.put(result)
-                except Exception as e:
-                    print(f"Processing error: {e}")
-                self.frame_buffer.clear()
-            time.sleep(0.1)
 from third_parts import VideoReader
 def read_video(video_path, video_interval):
@@ -229,28 +256,34 @@ def video_vision(video_input_path, prompt, video_interval):
     else:
         return prediction, None
-@spaces.GPU
 def webcam_vision(prompt):
-    is_korean = any(ord('가') <= ord(char) <= ord('힣') for char in prompt)
-    if not hasattr(webcam_vision, 'processor'):
-        webcam_vision.processor = WebcamProcessor(model, tokenizer)
-    if not webcam_vision.processor.is_running:
-        webcam_vision.processor.start()
     try:
-        result = webcam_vision.processor.result_queue.get(timeout=5)
-        prediction = result['prediction']
-        if is_korean:
-            prediction = translate_to_korean(prediction)
-        return prediction
-    except queue.Empty:
-        return "No results available yet"
     except Exception as e:
-        return f"Error: {str(e)}"
 # Gradio UI
 with gr.Blocks(analytics_enabled=False) as demo:
@@ -292,33 +325,33 @@ with gr.Blocks(analytics_enabled=False) as demo:
                 outputs = [vid_output_res, output_video]
             )
         with gr.Tab("Webcam"):
             with gr.Row():
                 with gr.Column():
-                    webcam_input = gr.Image(label="Webcam Input", sources=["webcam"], streaming=True)
                     with gr.Row():
-                        webcam_instruction = gr.Textbox(
-                            label="Instruction",
-                            placeholder="Enter instruction here...",
-                            scale=4
-                        )
-                        start_button = gr.Button("Start", scale=1)
-                        stop_button = gr.Button("Stop", scale=1)
                 with gr.Column():
                     webcam_output = gr.Textbox(label="Response")
-                    processed_view = gr.Image(label="Processed View")
-            status_text = gr.Textbox(label="Status", value="Ready")
             start_button.click(
-                fn=lambda x: webcam_vision(x),
                 inputs=[webcam_instruction],
                 outputs=[webcam_output]
             )
             stop_button.click(
-                fn=lambda: "Stopped" if hasattr(webcam_vision, 'processor') and webcam_vision.processor.stop() else "Not running",
                 outputs=[status_text]
             )
 demo.queue().launch(show_api=False, show_error=True)

         self.last_process_time = 0
     def start(self):
+        try:
+            self.is_running = True
+            self.capture = cv2.VideoCapture(0)
+            if not self.capture.isOpened():
+                raise Exception("Failed to open webcam")
+            # Set camera properties
+            self.capture.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
+            self.capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
+            self.capture_thread = threading.Thread(target=self._capture_loop)
+            self.process_thread = threading.Thread(target=self._process_loop)
+            self.capture_thread.daemon = True
+            self.process_thread.daemon = True
+            self.capture_thread.start()
+            self.process_thread.start()
+            return "Webcam started successfully"
+        except Exception as e:
+            self.is_running = False
+            return f"Failed to start webcam: {str(e)}"
     def stop(self):
+        try:
+            self.is_running = False
+            if hasattr(self, 'capture_thread'):
+                self.capture_thread.join(timeout=1.0)
+            if hasattr(self, 'process_thread'):
+                self.process_thread.join(timeout=1.0)
+            if hasattr(self, 'capture'):
+                self.capture.release()
+            return "Webcam stopped successfully"
+        except Exception as e:
+            return f"Error stopping webcam: {str(e)}"
     def _capture_loop(self):
         while self.is_running:
+            try:
+                ret, frame = self.capture.read()
+                if ret:
+                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    frame = cv2.resize(frame, (640, 480))
+                    current_time = time.time()
+                    if current_time - self.last_process_time >= self.frame_interval:
+                        self.frame_buffer.append(frame)
+                        self.last_process_time = current_time
+                time.sleep(0.01)  # Small delay to prevent CPU overuse
+            except Exception as e:
+                print(f"Capture error: {e}")
+                time.sleep(0.1)
     def _process_loop(self):
         while self.is_running:
+            try:
+                if len(self.frame_buffer) >= self.buffer_size:
+                    frames = list(self.frame_buffer)
                     result = self.model.predict_forward(
                         video=frames,
                         text="<image>Describe what you see",
                         tokenizer=self.tokenizer
                     )
                     self.result_queue.put(result)
+                    self.frame_buffer.clear()
+                time.sleep(0.1)
+            except Exception as e:
+                print(f"Processing error: {e}")
+                time.sleep(0.1)
 from third_parts import VideoReader
 def read_video(video_path, video_interval):
     else:
         return prediction, None
 def webcam_vision(prompt):
     try:
+        if not hasattr(webcam_vision, 'processor'):
+            webcam_vision.processor = WebcamProcessor(model, tokenizer)
+        if not webcam_vision.processor.is_running:
+            status = webcam_vision.processor.start()
+            if "Failed" in status:
+                return f"Error: {status}"
+        try:
+            result = webcam_vision.processor.result_queue.get(timeout=5)
+            prediction = result['prediction']
+            # Check if Korean translation is needed
+            is_korean = any(ord('가') <= ord(char) <= ord('힣') for char in prompt)
+            if is_korean:
+                prediction = translate_to_korean(prediction)
+            return prediction
+        except queue.Empty:
+            return "No results available yet. Please try again."
+        except Exception as e:
+            return f"Processing error: {str(e)}"
     except Exception as e:
+        return f"System error: {str(e)}"
 # Gradio UI
 with gr.Blocks(analytics_enabled=False) as demo:
                 outputs = [vid_output_res, output_video]
             )
         with gr.Tab("Webcam"):
             with gr.Row():
                 with gr.Column():
+                    webcam_instruction = gr.Textbox(
+                        label="Instruction",
+                        placeholder="Enter instruction here...",
+                        scale=4
+                    )
                     with gr.Row():
+                        start_button = gr.Button("Start Processing")
+                        stop_button = gr.Button("Stop Processing")
                 with gr.Column():
                     webcam_output = gr.Textbox(label="Response")
+                    status_text = gr.Textbox(label="Status", value="Ready")
             start_button.click(
+                fn=webcam_vision,
                 inputs=[webcam_instruction],
                 outputs=[webcam_output]
             )
             stop_button.click(
+                fn=lambda: webcam_vision.processor.stop() if hasattr(webcam_vision, 'processor') else "Not running",
                 outputs=[status_text]
             )
 demo.queue().launch(show_api=False, show_error=True)