Update
app.py CHANGED
@@ -1,4 +1,5 @@
 import torch
+import torch.nn.functional as F
 import gradio as gr
 from transformers import CLIPProcessor, CLIPModel
 import spaces
@@ -34,18 +35,28 @@ def calculate_score(image, text, model_name):
     inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
     inputs = {k: v.to("cuda") for k, v in inputs.items()}

-    # Calculate
+    # Calculate embeddings
     with torch.no_grad():
         outputs = model(**inputs)
+        image_embeds = outputs.image_embeds
+        text_embeds = outputs.text_embeds

-
+    # Normalize embeddings
+    image_embeds = F.normalize(image_embeds, p=2, dim=1)
+    text_embeds = F.normalize(text_embeds, p=2, dim=1)

-
+    # Calculate cosine similarity
+    cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
+
+    # Convert to percentages
+    percentages = ((cosine_similarities + 1) / 2 * 100).cpu().numpy()
+
+    results_dict = {label: float(score) for label, score in zip(labels, percentages)}
     return results_dict

 with gr.Blocks() as demo:
     gr.Markdown("# Multi-Model CLIP Score")
-    gr.Markdown("Calculate the
+    gr.Markdown("Calculate the CLIP score (cosine similarity) between the given image and text descriptions using different CLIP model variants")

     with gr.Row():
         image_input = gr.Image(type="pil")
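Note on the new scoring path: the added lines L2-normalize the image and text embeddings, take their dot product (which for unit vectors is exactly the cosine similarity), and rescale it from [-1, 1] to a 0-100 percentage. Unlike a softmax over logits_per_image, these per-label scores are independent and need not sum to 100. A minimal sketch of just that math with dummy tensors; the 512-dim embedding size and the 1-image/3-label shapes are illustrative assumptions, not taken from the app:

import torch
import torch.nn.functional as F

# Dummy embeddings: 1 image, 3 text labels, 512 dims (illustrative only)
image_embeds = torch.randn(1, 512)
text_embeds = torch.randn(3, 512)

# L2-normalize so the dot product equals the cosine similarity
image_embeds = F.normalize(image_embeds, p=2, dim=1)
text_embeds = F.normalize(text_embeds, p=2, dim=1)

# (3, 512) @ (512, 1) -> (3, 1), squeezed to (3,) values in [-1, 1]
cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)

# Affine map [-1, 1] -> [0, 100], matching the diff
percentages = (cosine_similarities + 1) / 2 * 100
print(percentages)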
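Since the top of calculate_score (model/processor loading and label parsing) sits outside the hunk, here is a hedged, self-contained sketch of the full scoring path the new code implements. The checkpoint name, image path, and labels are assumptions for illustration, and the device fallback is a small generalization of the app's hard-coded "cuda":

import torch
import torch.nn.functional as F
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

checkpoint = "openai/clip-vit-base-patch32"  # assumed variant, not from the diff
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained(checkpoint).to(device).eval()
processor = CLIPProcessor.from_pretrained(checkpoint)

image = Image.open("photo.jpg")  # hypothetical input image
labels = ["a photo of a cat", "a photo of a dog"]

inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Forward pass without gradient tracking, as in the diff
with torch.no_grad():
    outputs = model(**inputs)

# Normalize, score, and rescale exactly as the added lines do
image_embeds = F.normalize(outputs.image_embeds, p=2, dim=1)
text_embeds = F.normalize(outputs.text_embeds, p=2, dim=1)
cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
percentages = ((cosine_similarities + 1) / 2 * 100).cpu().numpy()

print({label: float(score) for label, score in zip(labels, percentages)})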