Text-image similarity
利用 CLIP 计算 Text-to-Image 任务中生成的图像与对应 prompt 的相似程度,该指标越大越好。
from tqdm import tqdm
from PIL import Image
import torch
import os
import numpy as np
from transformers import CLIPProcessor, CLIPModel
# Load the CLIP weights from "clip-vit-base-patch16". Per the accompanying
# notes, this is expected to be a checkpoint directory that was downloaded in
# advance and placed in the project directory.
model = CLIPModel.from_pretrained("clip-vit-base-patch16")
# The processor from the same checkpoint prepares the text and image inputs
# that are passed to the model.
processor = CLIPProcessor.from_pretrained("clip-vit-base-patch16")
def get_clip_score(image_path, text):
    """Score how well an image matches each of the given text prompts with CLIP.

    Args:
        image_path: Path of the image file to score.
        text: Prompt(s) to compare against the image; pass a list of strings
            to score several prompts in one call.

    Returns:
        The model's ``logits_per_image`` tensor: one row for the image and one
        column per prompt. Larger values mean a closer match.
    """
    # Close the file handle deterministically; the processor reads the pixel
    # data while the file is still open.
    with Image.open(image_path) as image:
        inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
    # Scoring is inference only, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits_per_image
举例:
# Example inputs: one image file and the candidate prompts to score it against.
image_path = "test.jpg"
text = ["dog", "cat", "pig"]  # text must be a list
对应的输出分数是 [32.3232, 52.2312, 63.1298],分别表示 test.jpg 与 ['dog', 'cat', 'pig'] 中每个文本的相似程度。这些分数是模型输出的 logits_per_image(未经 softmax 归一化),数值越大表示图像与该文本越相似。
值得注意的是,预训练模型 clip-vit-base-patch16 需要提前下载好,并放在项目目录下。
这里提供一个可以下载的路径:https://huggingface.co/openai/clip-vit-base-patch16/tree/main (可能需要逐个文件下载)
Image-Image similarity
利用 CLIP 计算两个图像之间的相似性。与 SSIM、PSNR、MSE 不同的是,这里计算的是图像特征层面的相似性,而 SSIM、PSNR、MSE 主要比较的是成像后 pixel 级别的相似性。
import torch
from transformers import CLIPImageProcessor, CLIPModel, CLIPTokenizer
# from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os
import cv2
# Load the CLIP model
# model_ID is passed to from_pretrained for both the model and the image
# processor, so the two are loaded from the same checkpoint.
model_ID = "clip-vit-base-patch16"
model = CLIPModel.from_pretrained(model_ID)
# Image-only preprocessing; the image-to-image score below never feeds text
# to the model.
preprocess = CLIPImageProcessor.from_pretrained(model_ID)
# Define a function to load an image and preprocess it for CLIP
def load_and_preprocess_image(image_path):
    """Open an image file and run it through the CLIP image processor.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The output of ``preprocess`` for that image, requested as PyTorch
        tensors (``return_tensors="pt"``).
    """
    # Preprocess inside the ``with`` block: the pixel data is read while the
    # file is open, and the handle is closed as soon as the tensors exist.
    with Image.open(image_path) as image:
        return preprocess(image, return_tensors="pt")
def clip_img_score(img1_path, img2_path):
    """Return the cosine similarity between the CLIP embeddings of two images.

    Args:
        img1_path: Path of the first image file.
        img2_path: Path of the second image file.

    Returns:
        The similarity score as a Python float.
    """
    pixels_first = load_and_preprocess_image(img1_path)["pixel_values"]
    pixels_second = load_and_preprocess_image(img2_path)["pixel_values"]

    # The embeddings are only compared, so gradient tracking is disabled.
    with torch.no_grad():
        features_first = model.get_image_features(pixels_first)
        features_second = model.get_image_features(pixels_second)

    cosine = torch.nn.functional.cosine_similarity
    return cosine(features_first, features_second).item()
使用:
# Give the paths of the two images to compare. Passing the same path twice
# would only measure an image against itself.
score = clip_img_score(img1_path, img2_path)
值得注意的是 transformers 的版本:博主测试使用 pip install transformers==4.25.0 时,代码可以正常运行。