fun-image-caption

Sleeping

App Files Files Community

Dylan committed on Mar 23

Commit

350f8a0

1 Parent(s): 0c35e90

calling gemma

Browse files

Files changed (2) hide show

agents.py +20 -9
app.py +1 -0

agents.py CHANGED Viewed

@@ -31,6 +31,9 @@ def build_graph():
     workflow = StateGraph(State)
     # Add nodes
     workflow.add_node("caption_image", caption_image)
     workflow.add_node("describe_with_voice", describe_with_voice)
@@ -55,16 +58,21 @@ model = Gemma3ForConditionalGeneration.from_pretrained(
 )
-def describe_with_voice(state: State) -> State:
-    state["description"] = "Dummy description"
     return state
-def caption_image(state: State) -> State:
-    state["caption"] = "Dummy caption"
-def describe_with_voice2(state: State) -> State:
     caption = state["caption"]
     voice = state["voice"]
@@ -75,7 +83,10 @@ def describe_with_voice2(state: State) -> State:
         "sarcastic teenager": "You are a sarcastic and disinterested teenager.",
     }
     messages = [
-        {"role": "system", "content": [voice_prompts.get(voice)]},
         {
             "role": "user",
             "content": [
@@ -93,7 +104,7 @@ def describe_with_voice2(state: State) -> State:
     input_len = inputs["input_ids"].shape[-1]
     with torch.inference_mode():
-        generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
         generation = generation[0][input_len:]
     description = processor.decode(generation, skip_special_tokens=True)
@@ -103,7 +114,7 @@ def describe_with_voice2(state: State) -> State:
     return state
-def caption_image2(state: State) -> State:
     # image is PIL
     image = state["image"]
@@ -136,7 +147,7 @@ def caption_image2(state: State) -> State:
     input_len = inputs["input_ids"].shape[-1]
     with torch.inference_mode():
-        generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
         generation = generation[0][input_len:]
     caption = processor.decode(generation, skip_special_tokens=True)

     workflow = StateGraph(State)
     # Add nodes
+    # workflow.add_node("caption_image", caption_image_dummy)
+    # workflow.add_node("describe_with_voice", describe_with_voice_dummy)
     workflow.add_node("caption_image", caption_image)
     workflow.add_node("describe_with_voice", describe_with_voice)
 )
+def describe_with_voice_dummy(state: State) -> State:
+    print("Describe")
+    voice = state["voice"]
+    state["description"] = f"Dummy description from {voice}"
     return state
+def caption_image_dummy(state: State) -> State:
+    print("Caption")
+    voice = state["voice"]
+    state["caption"] = f"Dummy caption from {voice}"
+    return state
+def describe_with_voice(state: State) -> State:
     caption = state["caption"]
     voice = state["voice"]
         "sarcastic teenager": "You are a sarcastic and disinterested teenager.",
     }
     messages = [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": voice_prompts.get(voice)}],
+        },
         {
             "role": "user",
             "content": [
     input_len = inputs["input_ids"].shape[-1]
     with torch.inference_mode():
+        generation = model.generate(**inputs, max_new_tokens=1000, do_sample=False)
         generation = generation[0][input_len:]
     description = processor.decode(generation, skip_special_tokens=True)
     return state
+def caption_image(state: State) -> State:
     # image is PIL
     image = state["image"]
     input_len = inputs["input_ids"].shape[-1]
     with torch.inference_mode():
+        generation = model.generate(**inputs, max_new_tokens=1000, do_sample=False)
         generation = generation[0][input_len:]
     caption = processor.decode(generation, skip_special_tokens=True)

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ def process_and_display(image, voice):
     # Run the graph
     result = graph.invoke(state)
     # Return the caption and description
     return result["caption"], result["description"]

     # Run the graph
     result = graph.invoke(state)
+    print(result)
     # Return the caption and description
     return result["caption"], result["description"]