import argparse
import glob
import os
from collections import namedtuple

import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image
from torchvision import models
from tqdm import tqdm

from ldm.modules.evaluate.ssim import ssim

transform = transforms.Compose([transforms.ToTensor()])


def normalize_tensor(in_feat, eps=1e-10):
    # L2-normalize a feature map of shape (N, C, X, Y) along the channel axis.
    norm_factor = torch.sqrt(torch.sum(in_feat ** 2, dim=1)).view(
        in_feat.size()[0], 1, in_feat.size()[2], in_feat.size()[3]
    )
    return in_feat / (norm_factor.expand_as(in_feat) + eps)


def cos_sim(in0, in1):
    # Mean cosine similarity between two feature maps, reduced to one scalar
    # per batch element.
    in0_norm = normalize_tensor(in0)
    in1_norm = normalize_tensor(in1)
    N = in0.size()[0]
    X = in0.size()[2]
    Y = in0.size()[3]

    return torch.mean(
        torch.mean(
            torch.sum(in0_norm * in1_norm, dim=1).view(N, 1, X, Y), dim=2
        ).view(N, 1, 1, Y),
        dim=3,
    ).view(N)


class squeezenet(torch.nn.Module):
    def __init__(self, requires_grad=False, pretrained=True):
        super(squeezenet, self).__init__()
        pretrained_features = models.squeezenet1_1(
            pretrained=pretrained
        ).features
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()
        self.slice5 = torch.nn.Sequential()
        self.slice6 = torch.nn.Sequential()
        self.slice7 = torch.nn.Sequential()
        self.N_slices = 7
        for x in range(2):
            self.slice1.add_module(str(x), pretrained_features[x])
        for x in range(2, 5):
            self.slice2.add_module(str(x), pretrained_features[x])
        for x in range(5, 8):
            self.slice3.add_module(str(x), pretrained_features[x])
        for x in range(8, 10):
            self.slice4.add_module(str(x), pretrained_features[x])
        for x in range(10, 11):
            self.slice5.add_module(str(x), pretrained_features[x])
        for x in range(11, 12):
            self.slice6.add_module(str(x), pretrained_features[x])
        for x in range(12, 13):
            self.slice7.add_module(str(x), pretrained_features[x])
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h = self.slice1(X)
        h_relu1 = h
        h = self.slice2(h)
        h_relu2 = h
        h = self.slice3(h)
        h_relu3 = h
        h = self.slice4(h)
        h_relu4 = h
        h = self.slice5(h)
        h_relu5 = h
        h = self.slice6(h)
        h_relu6 = h
        h = self.slice7(h)
        h_relu7 = h
        squeeze_outputs = namedtuple(
            "SqueezeOutputs",
            ["relu1", "relu2", "relu3", "relu4", "relu5", "relu6", "relu7"],
        )
        out = squeeze_outputs(
            h_relu1, h_relu2, h_relu3, h_relu4, h_relu5, h_relu6, h_relu7
        )
        return out


class alexnet(torch.nn.Module):
    def __init__(self, requires_grad=False, pretrained=True):
        super(alexnet, self).__init__()
        alexnet_pretrained_features = models.alexnet(
            pretrained=pretrained
        ).features
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()
        self.slice5 = torch.nn.Sequential()
        self.N_slices = 5
        for x in range(2):
            self.slice1.add_module(str(x), alexnet_pretrained_features[x])
        for x in range(2, 5):
            self.slice2.add_module(str(x), alexnet_pretrained_features[x])
        for x in range(5, 8):
            self.slice3.add_module(str(x), alexnet_pretrained_features[x])
        for x in range(8, 10):
            self.slice4.add_module(str(x), alexnet_pretrained_features[x])
        for x in range(10, 12):
            self.slice5.add_module(str(x), alexnet_pretrained_features[x])
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h = self.slice1(X)
        h_relu1 = h
        h = self.slice2(h)
        h_relu2 = h
        h = self.slice3(h)
        h_relu3 = h
        h = self.slice4(h)
        h_relu4 = h
        h = self.slice5(h)
        h_relu5 = h
        alexnet_outputs = namedtuple(
            "AlexnetOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5"]
        )
        out = alexnet_outputs(h_relu1, h_relu2, h_relu3, h_relu4, h_relu5)
        return out
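
# Illustrative sanity check (an addition, not part of the original script):
# cos_sim reduces two feature maps of shape (N, C, X, Y) to one mean cosine
# similarity per batch element, so an input compared with itself scores ~1.
def _cos_sim_example():
    feats = torch.rand(2, 8, 4, 4)
    sim = cos_sim(feats, feats)  # shape (2,), values close to 1.0
    assert sim.shape == (2,)
    assert torch.allclose(sim, torch.ones_like(sim), atol=1e-4)
    return sim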

class vgg16(torch.nn.Module):
    def __init__(self, requires_grad=False, pretrained=True):
        super(vgg16, self).__init__()
        vgg_pretrained_features = models.vgg16(pretrained=pretrained).features
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()
        self.slice5 = torch.nn.Sequential()
        self.N_slices = 5
        for x in range(4):
            self.slice1.add_module(str(x), vgg_pretrained_features[x])
        for x in range(4, 9):
            self.slice2.add_module(str(x), vgg_pretrained_features[x])
        for x in range(9, 16):
            self.slice3.add_module(str(x), vgg_pretrained_features[x])
        for x in range(16, 23):
            self.slice4.add_module(str(x), vgg_pretrained_features[x])
        for x in range(23, 30):
            self.slice5.add_module(str(x), vgg_pretrained_features[x])
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h = self.slice1(X)
        h_relu1_2 = h
        h = self.slice2(h)
        h_relu2_2 = h
        h = self.slice3(h)
        h_relu3_3 = h
        h = self.slice4(h)
        h_relu4_3 = h
        h = self.slice5(h)
        h_relu5_3 = h
        vgg_outputs = namedtuple(
            "VggOutputs",
            ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"],
        )
        out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
        return out


class resnet(torch.nn.Module):
    def __init__(self, requires_grad=False, pretrained=True, num=18):
        super(resnet, self).__init__()
        if num == 18:
            self.net = models.resnet18(pretrained=pretrained)
        elif num == 34:
            self.net = models.resnet34(pretrained=pretrained)
        elif num == 50:
            self.net = models.resnet50(pretrained=pretrained)
        elif num == 101:
            self.net = models.resnet101(pretrained=pretrained)
        elif num == 152:
            self.net = models.resnet152(pretrained=pretrained)
        self.N_slices = 5

        self.conv1 = self.net.conv1
        self.bn1 = self.net.bn1
        self.relu = self.net.relu
        self.maxpool = self.net.maxpool
        self.layer1 = self.net.layer1
        self.layer2 = self.net.layer2
        self.layer3 = self.net.layer3
        self.layer4 = self.net.layer4
        # Freeze parameters when gradients are not required, mirroring the
        # other backbone wrappers above.
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h = self.conv1(X)
        h = self.bn1(h)
        h = self.relu(h)
        h_relu1 = h
        h = self.maxpool(h)
        h = self.layer1(h)
        h_conv2 = h
        h = self.layer2(h)
        h_conv3 = h
        h = self.layer3(h)
        h_conv4 = h
        h = self.layer4(h)
        h_conv5 = h

        outputs = namedtuple(
            "Outputs", ["relu1", "conv2", "conv3", "conv4", "conv5"]
        )
        out = outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5)
        return out


# Off-the-shelf deep network
class PNet(torch.nn.Module):
    """Pre-trained network with all channels equally weighted by default."""

    def __init__(self, pnet_type="vgg", pnet_rand=False, use_gpu=True):
        super(PNet, self).__init__()

        self.use_gpu = use_gpu
        self.pnet_type = pnet_type
        self.pnet_rand = pnet_rand

        # Channel-wise shift/scale that map inputs in [-1, 1] to the
        # statistics the pretrained backbones expect.
        self.shift = torch.Tensor([-0.030, -0.088, -0.188]).view(1, 3, 1, 1)
        self.scale = torch.Tensor([0.458, 0.448, 0.450]).view(1, 3, 1, 1)

        if self.pnet_type in ["vgg", "vgg16"]:
            self.net = vgg16(pretrained=not self.pnet_rand, requires_grad=False)
        elif self.pnet_type == "alex":
            self.net = alexnet(
                pretrained=not self.pnet_rand, requires_grad=False
            )
        elif self.pnet_type[:-2] == "resnet":
            self.net = resnet(
                pretrained=not self.pnet_rand,
                requires_grad=False,
                num=int(self.pnet_type[-2:]),
            )
        elif self.pnet_type == "squeeze":
            self.net = squeezenet(
                pretrained=not self.pnet_rand, requires_grad=False
            )

        self.L = self.net.N_slices

        if use_gpu:
            self.net.cuda()
            self.shift = self.shift.cuda()
            self.scale = self.scale.cuda()

    def forward(self, in0, in1, retPerLayer=False):
        in0_sc = (in0 - self.shift.expand_as(in0)) / self.scale.expand_as(in0)
        in1_sc = (in1 - self.shift.expand_as(in1)) / self.scale.expand_as(in1)

        outs0 = self.net.forward(in0_sc)
        outs1 = self.net.forward(in1_sc)

        if retPerLayer:
            all_scores = []
        for kk in range(len(outs0)):
            # Per-layer distance: 1 - mean cosine similarity of the features.
            cur_score = 1.0 - cos_sim(outs0[kk], outs1[kk])
            if kk == 0:
                val = 1.0 * cur_score
            else:
                val = val + cur_score
            if retPerLayer:
                all_scores += [cur_score]

        if retPerLayer:
            return (val, all_scores)
        else:
            return val
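
# Minimal sketch of what PNet computes (an illustration, not original code):
# the distance between two images is the sum over backbone layers of
# 1 - cos_sim(features), so comparing an image with itself should give ~0.
# Note that calling this downloads pretrained VGG16 weights via torchvision.
def _pnet_self_distance_example():
    use_gpu = torch.cuda.is_available()
    net = PNet(use_gpu=use_gpu)
    x = torch.rand(1, 3, 64, 64)
    if use_gpu:
        x = x.cuda()
    with torch.no_grad():
        dist = net(x, x)
    assert torch.allclose(dist, torch.zeros_like(dist), atol=1e-4)
    return dist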

# The SSIM metric
def ssim_metric(img1, img2, mask=None):
    return ssim(img1, img2, mask=mask, size_average=False)


# The PSNR metric
def psnr(img1, img2, mask=None, reshape=False):
    b = img1.size(0)
    if mask is not None:
        # Average the squared error over the masked pixels only.
        mse_err = (img1 - img2).pow(2) * mask
        if reshape:
            mse_err = mse_err.reshape(b, -1).sum(dim=1) / (
                3 * mask.reshape(b, -1).sum(dim=1).clamp(min=1)
            )
        else:
            mse_err = mse_err.view(b, -1).sum(dim=1) / (
                3 * mask.view(b, -1).sum(dim=1).clamp(min=1)
            )
    else:
        if reshape:
            mse_err = (img1 - img2).pow(2).reshape(b, -1).mean(dim=1)
        else:
            mse_err = (img1 - img2).pow(2).view(b, -1).mean(dim=1)

    psnr = 10 * (1 / mse_err).log10()
    return psnr


# The perceptual similarity metric
def perceptual_sim(img1, img2, vgg16):
    # Map images from [0, 1] to [-1, 1] before extracting features.
    dist = vgg16(img1 * 2 - 1, img2 * 2 - 1)
    return dist


def load_img(img_name, size=None):
    try:
        img = Image.open(img_name)

        if isinstance(size, int):
            img = img.resize((size, size))
        elif size is not None:
            # PIL expects (width, height); size comes in as (height, width).
            img = img.resize((size[1], size[0]))

        img = transform(img).cuda()
        img = img.unsqueeze(0)
    except Exception as e:
        print("Failed at loading %s " % img_name)
        print(e)
        raise

    return img


def compute_perceptual_similarity(folder, pred_img, tgt_img, take_every_other):
    # Load VGG16 for feature similarity
    vgg16 = PNet().to("cuda")
    vgg16.eval()
    vgg16.cuda()

    values_percsim = []
    values_ssim = []
    values_psnr = []
    folders = os.listdir(folder)
    for i, f in tqdm(enumerate(sorted(folders))):
        pred_imgs = glob.glob(os.path.join(folder, f, pred_img))
        tgt_imgs = glob.glob(os.path.join(folder, f, tgt_img))
        assert len(tgt_imgs) == 1

        # Keep the best score over all candidate predictions for this target.
        perc_sim = 10000
        ssim_sim = -10
        psnr_sim = -10
        for p_img in pred_imgs:
            t_img = load_img(tgt_imgs[0])
            p_img = load_img(p_img, size=t_img.shape[2:])
            t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
            perc_sim = min(perc_sim, t_perc_sim)

            ssim_sim = max(ssim_sim, ssim_metric(p_img, t_img).item())
            psnr_sim = max(psnr_sim, psnr(p_img, t_img).item())

        values_percsim += [perc_sim]
        values_ssim += [ssim_sim]
        values_psnr += [psnr_sim]

    if take_every_other:
        # Consecutive entries are treated as pairs; keep the better of each.
        n_valuespercsim = []
        n_valuesssim = []
        n_valuespsnr = []
        for i in range(0, len(values_percsim) // 2):
            n_valuespercsim += [
                min(values_percsim[2 * i], values_percsim[2 * i + 1])
            ]
            n_valuespsnr += [max(values_psnr[2 * i], values_psnr[2 * i + 1])]
            n_valuesssim += [max(values_ssim[2 * i], values_ssim[2 * i + 1])]

        values_percsim = n_valuespercsim
        values_ssim = n_valuesssim
        values_psnr = n_valuespsnr

    avg_percsim = np.mean(np.array(values_percsim))
    std_percsim = np.std(np.array(values_percsim))

    avg_psnr = np.mean(np.array(values_psnr))
    std_psnr = np.std(np.array(values_psnr))

    avg_ssim = np.mean(np.array(values_ssim))
    std_ssim = np.std(np.array(values_ssim))

    return {
        "Perceptual similarity": (avg_percsim, std_percsim),
        "PSNR": (avg_psnr, std_psnr),
        "SSIM": (avg_ssim, std_ssim),
    }
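
# Worked example for the psnr helper above (hypothetical tensors). Images are
# assumed to lie in [0, 1], so the peak value is 1 and PSNR = 10 * log10(1 / MSE).
def _psnr_example():
    a = torch.zeros(1, 3, 8, 8)
    b = torch.full((1, 3, 8, 8), 0.1)  # MSE = 0.01, so PSNR = 10 * log10(100) = 20 dB
    return psnr(a, b)  # -> tensor([20.]) up to float rounding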

def compute_perceptual_similarity_from_list(
    pred_imgs_list, tgt_imgs_list, take_every_other, simple_format=True
):
    # Load VGG16 for feature similarity
    vgg16 = PNet().to("cuda")
    vgg16.eval()
    vgg16.cuda()

    values_percsim = []
    values_ssim = []
    values_psnr = []
    equal_count = 0
    ambig_count = 0
    for i, tgt_img in enumerate(tqdm(tgt_imgs_list)):
        pred_imgs = pred_imgs_list[i]
        tgt_imgs = [tgt_img]
        assert len(tgt_imgs) == 1

        if not isinstance(pred_imgs, list):
            pred_imgs = [pred_imgs]

        # Keep the best score over all candidate predictions for this target.
        perc_sim = 10000
        ssim_sim = -10
        psnr_sim = -10
        assert len(pred_imgs) > 0
        for p_img in pred_imgs:
            t_img = load_img(tgt_imgs[0])
            p_img = load_img(p_img, size=t_img.shape[2:])
            t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
            perc_sim = min(perc_sim, t_perc_sim)

            ssim_sim = max(ssim_sim, ssim_metric(p_img, t_img).item())
            psnr_sim = max(psnr_sim, psnr(p_img, t_img).item())

        values_percsim += [perc_sim]
        values_ssim += [ssim_sim]
        if psnr_sim != float("inf"):
            values_psnr += [psnr_sim]
        else:
            # Infinite PSNR means zero MSE: either the images really are
            # identical, or the comparison is ambiguous.
            if torch.allclose(p_img, t_img):
                equal_count += 1
                print("{} equal src and wrp images.".format(equal_count))
            else:
                ambig_count += 1
                print("{} ambiguous src and wrp images.".format(ambig_count))

    if take_every_other:
        n_valuespercsim = []
        n_valuesssim = []
        n_valuespsnr = []
        for i in range(0, len(values_percsim) // 2):
            n_valuespercsim += [
                min(values_percsim[2 * i], values_percsim[2 * i + 1])
            ]
            n_valuespsnr += [max(values_psnr[2 * i], values_psnr[2 * i + 1])]
            n_valuesssim += [max(values_ssim[2 * i], values_ssim[2 * i + 1])]

        values_percsim = n_valuespercsim
        values_ssim = n_valuesssim
        values_psnr = n_valuespsnr

    avg_percsim = np.mean(np.array(values_percsim))
    std_percsim = np.std(np.array(values_percsim))

    avg_psnr = np.mean(np.array(values_psnr))
    std_psnr = np.std(np.array(values_psnr))

    avg_ssim = np.mean(np.array(values_ssim))
    std_ssim = np.std(np.array(values_ssim))

    if simple_format:
        # just to make yaml formatting readable
        return {
            "Perceptual similarity": [float(avg_percsim), float(std_percsim)],
            "PSNR": [float(avg_psnr), float(std_psnr)],
            "SSIM": [float(avg_ssim), float(std_ssim)],
        }
    else:
        return {
            "Perceptual similarity": (avg_percsim, std_percsim),
            "PSNR": (avg_psnr, std_psnr),
            "SSIM": (avg_ssim, std_ssim),
        }
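
# Minimal usage sketch for the list-based entry point above (hypothetical
# paths; requires a CUDA device, since the loaders move tensors to the GPU).
# Each target may be paired with a single prediction or a list of candidates,
# in which case the best score per metric is kept:
#
#   preds = [["out/s0_a.png", "out/s0_b.png"], "out/s1.png"]
#   tgts = ["gt/s0.png", "gt/s1.png"]
#   metrics = compute_perceptual_similarity_from_list(
#       preds, tgts, take_every_other=False
#   )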

def compute_perceptual_similarity_from_list_topk(
    pred_imgs_list, tgt_imgs_list, take_every_other, resize=False
):
    # Load VGG16 for feature similarity
    vgg16 = PNet().to("cuda")
    vgg16.eval()
    vgg16.cuda()

    values_percsim = []
    values_ssim = []
    values_psnr = []
    individual_percsim = []
    individual_ssim = []
    individual_psnr = []
    for i, tgt_img in enumerate(tqdm(tgt_imgs_list)):
        pred_imgs = pred_imgs_list[i]
        tgt_imgs = [tgt_img]
        assert len(tgt_imgs) == 1
        assert isinstance(
            pred_imgs, list
        ), "expected a list of candidate predictions per target"

        perc_sim = 10000
        ssim_sim = -10
        psnr_sim = -10
        sample_percsim = list()
        sample_ssim = list()
        sample_psnr = list()
        for p_img in pred_imgs:
            if resize:
                t_img = load_img(tgt_imgs[0], size=(256, 256))
            else:
                t_img = load_img(tgt_imgs[0])
            p_img = load_img(p_img, size=t_img.shape[2:])

            # Record both the per-sample scores and the running best.
            t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
            sample_percsim.append(t_perc_sim)
            perc_sim = min(perc_sim, t_perc_sim)

            t_ssim = ssim_metric(p_img, t_img).item()
            sample_ssim.append(t_ssim)
            ssim_sim = max(ssim_sim, t_ssim)

            t_psnr = psnr(p_img, t_img).item()
            sample_psnr.append(t_psnr)
            psnr_sim = max(psnr_sim, t_psnr)

        values_percsim += [perc_sim]
        values_ssim += [ssim_sim]
        values_psnr += [psnr_sim]
        individual_percsim.append(sample_percsim)
        individual_ssim.append(sample_ssim)
        individual_psnr.append(sample_psnr)

    if take_every_other:
        assert False, "Do this later, after specifying topk to get proper results"
        n_valuespercsim = []
        n_valuesssim = []
        n_valuespsnr = []
        for i in range(0, len(values_percsim) // 2):
            n_valuespercsim += [
                min(values_percsim[2 * i], values_percsim[2 * i + 1])
            ]
            n_valuespsnr += [max(values_psnr[2 * i], values_psnr[2 * i + 1])]
            n_valuesssim += [max(values_ssim[2 * i], values_ssim[2 * i + 1])]

        values_percsim = n_valuespercsim
        values_ssim = n_valuesssim
        values_psnr = n_valuespsnr

    avg_percsim = np.mean(np.array(values_percsim))
    std_percsim = np.std(np.array(values_percsim))

    avg_psnr = np.mean(np.array(values_psnr))
    std_psnr = np.std(np.array(values_psnr))

    avg_ssim = np.mean(np.array(values_ssim))
    std_ssim = np.std(np.array(values_ssim))

    individual_percsim = np.array(individual_percsim)
    individual_psnr = np.array(individual_psnr)
    individual_ssim = np.array(individual_ssim)

    return {
        "avg_of_best": {
            "Perceptual similarity": [float(avg_percsim), float(std_percsim)],
            "PSNR": [float(avg_psnr), float(std_psnr)],
            "SSIM": [float(avg_ssim), float(std_ssim)],
        },
        "individual": {
            "PSIM": individual_percsim,
            "PSNR": individual_psnr,
            "SSIM": individual_ssim,
        },
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", type=str, default="")
    parser.add_argument("--pred_image", type=str, default="")
    parser.add_argument("--target_image", type=str, default="")
    parser.add_argument("--take_every_other", action="store_true", default=False)
    parser.add_argument("--output_file", type=str, default="")

    opts = parser.parse_args()

    folder = opts.folder
    pred_img = opts.pred_image
    tgt_img = opts.target_image

    results = compute_perceptual_similarity(
        folder, pred_img, tgt_img, opts.take_every_other
    )

    with open(opts.output_file, "w") as f:
        for key in results:
            print("%s for %s: \n" % (key, opts.folder))
            print(
                "\t {:0.4f} | {:0.4f} \n".format(results[key][0], results[key][1])
            )

            f.write("%s for %s: \n" % (key, opts.folder))
            f.write(
                "\t {:0.4f} | {:0.4f} \n".format(results[key][0], results[key][1])
            )
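
# Example invocation (hypothetical script and file names; --pred_image may be
# a glob matched inside each subfolder of --folder, while --target_image must
# match exactly one file per subfolder):
#
#   python evaluate_perceptualsim.py --folder results/ \
#       --pred_image "sample_*.png" --target_image target.png \
#       --output_file metrics.txt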