update

- README.md +1 -1
- app.py +41 -16
- requirements.txt +1 -2
README.md CHANGED

```diff
@@ -4,7 +4,7 @@ emoji: 💯
 colorFrom: red
 colorTo: blue
 sdk: gradio
-sdk_version:
+sdk_version: 4.36.0
 app_file: app.py
 pinned: false
 license: mit
```
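On a Gradio-SDK Space, `sdk_version` pins the Gradio build the container installs, so the `gr.Blocks` API used in the updated app should be available after a rebuild. A quick runtime check (a minimal sketch; the printed value assumes the Space has rebuilt against the pinned version):

```python
# Confirm the Gradio version the Space is actually running; after a rebuild
# this should match the sdk_version pinned in README.md (assumed: 4.36.0).
import gradio

print(gradio.__version__)  # expected "4.36.0" once the pin is applied
```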
app.py CHANGED

```diff
@@ -1,10 +1,17 @@
+import torch
 import gradio as gr
 from transformers import CLIPProcessor, CLIPModel
+import spaces
 
-model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
+
+# Check if CUDA is available and set the device accordingly
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
 processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
 
 
+@spaces.GPU
 def calculate_score(image, text):
     labels = text.split(";")
     labels = [l.strip() for l in labels]
@@ -12,8 +19,13 @@ def calculate_score(image, text):
     if len(labels) == 0:
         return dict()
     inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
+    inputs = {
+        k: v.to(device) for k, v in inputs.items()
+    }  # Move tensors to the appropriate device
     outputs = model(**inputs)
-    logits_per_image = outputs.logits_per_image.detach().numpy()
+    logits_per_image = (
+        outputs.logits_per_image.detach().cpu().numpy()
+    )  # Move results back to CPU for further processing
 
     results_dict = {
         label: score / 100.0 for label, score in zip(labels, logits_per_image[0])
@@ -21,21 +33,34 @@ def calculate_score(image, text):
     return results_dict
 
 
-cat_example = [
-    "cat.jpg",
-    "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; "
-    "a cat is entering the matrix; a cat is entering the void",
-]
+with gr.Blocks() as demo:
+    gr.Markdown("# CLIP Score")
+    gr.Markdown(
+        "Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text"
+    )
+    with gr.Row():
+        image_input = gr.Image()
+        output_label = gr.Label()
+
+    text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
+
+    image_input.change(
+        fn=calculate_score, inputs=[image_input, text_input], outputs=output_label
+    )
+    text_input.submit(
+        fn=calculate_score, inputs=[image_input, text_input], outputs=output_label
+    )
 
-demo = gr.Interface(
+    gr.Examples(
+        examples=[
+            [
+                "cat.jpg",
+                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
+            ]
+        ],
         fn=calculate_score,
-    inputs=["image", "text"],
-    outputs="label",
-    examples=[cat_example],
-    allow_flagging="never",
-    description="# CLIP Score",
-    article="Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text",
-    cache_examples=True,
+        inputs=[image_input, text_input],
+        outputs=output_label,
     )
 
-demo.launch()
+demo.launch()
```
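For sanity-checking the scoring path outside the Space, here is a minimal sketch under a few assumptions: a local `cat.jpg`, hand-picked labels, and the `@spaces.GPU` decorator dropped, since it only matters on Spaces hardware. The division by 100 is worth noting: CLIP's learned logit scale is roughly 100 for the OpenAI checkpoints, so each reported score approximates the cosine similarity between the image and text embeddings.

```python
# Minimal local sketch of the updated scoring path (assumes a local cat.jpg;
# the spaces decorator is omitted since it only applies on Spaces hardware).
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

image = Image.open("cat.jpg")
labels = ["a cat sitting", "a cat standing"]

inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}  # same device move as app.py
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits_per_image.cpu().numpy()

# Dividing by 100 undoes CLIP's logit scale, so each score is roughly the
# cosine similarity between the image embedding and each text embedding.
print({label: float(score) / 100.0 for label, score in zip(labels, logits[0])})
```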
requirements.txt CHANGED

```diff
@@ -1,4 +1,3 @@
-
-transformers
+git+https://github.com/huggingface/transformers
 torch
 torchvision
```
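With the switch from the PyPI `transformers` release to a build from the main branch, one way to confirm that the source install is the one being imported (a minimal sketch; the exact version string depends on whichever commit pip resolved at install time):

```python
# Source installs of transformers carry a ".dev0" suffix in their version
# string (the exact number varies with the commit pip checked out).
import transformers

print(transformers.__version__)  # e.g. "4.42.0.dev0" for a main-branch build
print(transformers.__file__)     # filesystem location of the active package
```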