eval stuff

rromb 2022-06-09 10:56:34 +02:00
parent 6dc939330d
commit 9cda7feafa
9 changed files with 2365 additions and 30 deletions

@@ -1280,39 +1280,39 @@ class LatentDiffusion(DDPM):
x_samples = self.decode_first_stage(samples.to(self.device))
log["samples_x0_quantized"] = x_samples
if unconditional_guidance_scale > 1.0:
    uc = self.get_unconditional_conditioning(N, unconditional_guidance_label)
    with ema_scope("Sampling with classifier-free guidance"):
        samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                         ddim_steps=ddim_steps, eta=ddim_eta,
                                         unconditional_guidance_scale=unconditional_guidance_scale,
                                         unconditional_conditioning=uc,
                                         )
        x_samples_cfg = self.decode_first_stage(samples_cfg)
        log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
if inpaint:
    # make a simple center square
    b, h, w = z.shape[0], z.shape[2], z.shape[3]
    mask = torch.ones(N, h, w).to(self.device)
    # zeros will be filled in
    mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
    mask = mask[:, None, ...]
    with ema_scope("Plotting Inpaint"):
        samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
                                     ddim_steps=ddim_steps, x0=z[:N], mask=mask)
    x_samples = self.decode_first_stage(samples.to(self.device))
    log["samples_inpainting"] = x_samples
    log["mask"] = mask

    # outpaint
    mask = 1. - mask
    with ema_scope("Plotting Outpaint"):
        samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
                                     ddim_steps=ddim_steps, x0=z[:N], mask=mask)
    x_samples = self.decode_first_stage(samples.to(self.device))
    log["samples_outpainting"] = x_samples
if plot_progressive_rows:
with ema_scope("Plotting Progressives"):

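For readers skimming the hunk above: the unconditional_guidance_scale branch implements classifier-free guidance, which blends the model's unconditional and conditional outputs before each denoising step. A minimal sketch of that combination (variable names here are illustrative, not the repo's API):

    # e_t_uncond / e_t_cond: model predictions with empty vs. actual conditioning
    # scale > 1.0 pushes the sample toward the conditioning signal
    def guided_eps(e_t_uncond, e_t_cond, scale):
        return e_t_uncond + scale * (e_t_cond - e_t_uncond)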
@@ -0,0 +1,676 @@
import argparse
import io
import os
import random
import warnings
import zipfile
from abc import ABC, abstractmethod
from contextlib import contextmanager
from functools import partial
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
from typing import Iterable, Optional, Tuple
import yaml
import numpy as np
import requests
import tensorflow.compat.v1 as tf
from scipy import linalg
from tqdm.auto import tqdm
INCEPTION_V3_URL = "https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/classify_image_graph_def.pb"
INCEPTION_V3_PATH = "classify_image_graph_def.pb"
FID_POOL_NAME = "pool_3:0"
FID_SPATIAL_NAME = "mixed_6/conv:0"
REQUIREMENTS = f"This script has the following requirements: \n" \
'tensorflow-gpu>=2.0' + "\n" + 'scipy' + "\n" + "requests" + "\n" + "tqdm"
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--ref_batch", help="path to reference batch npz file")
parser.add_argument("--sample_batch", help="path to sample batch npz file")
args = parser.parse_args()
config = tf.ConfigProto(
allow_soft_placement=True # allows DecodeJpeg to run on CPU in Inception graph
)
config.gpu_options.allow_growth = True
evaluator = Evaluator(tf.Session(config=config))
print("warming up TensorFlow...")
# This will cause TF to print a bunch of verbose stuff now rather
# than after the next print(), to help prevent confusion.
evaluator.warmup()
print("computing reference batch activations...")
ref_acts = evaluator.read_activations(args.ref_batch)
print("computing/reading reference batch statistics...")
ref_stats, ref_stats_spatial = evaluator.read_statistics(args.ref_batch, ref_acts)
print("computing sample batch activations...")
sample_acts = evaluator.read_activations(args.sample_batch)
print("computing/reading sample batch statistics...")
sample_stats, sample_stats_spatial = evaluator.read_statistics(args.sample_batch, sample_acts)
print("Computing evaluations...")
is_ = evaluator.compute_inception_score(sample_acts[0])
print("Inception Score:", is_)
fid = sample_stats.frechet_distance(ref_stats)
print("FID:", fid)
sfid = sample_stats_spatial.frechet_distance(ref_stats_spatial)
print("sFID:", sfid)
prec, recall = evaluator.compute_prec_recall(ref_acts[0], sample_acts[0])
print("Precision:", prec)
print("Recall:", recall)
savepath = '/'.join(args.sample_batch.split('/')[:-1])
results_file = os.path.join(savepath,'evaluation_metrics.yaml')
print(f'Saving evaluation results to "{results_file}"')
results = {
'IS': is_,
'FID': fid,
'sFID': sfid,
'Precision': prec,
'Recall': recall
}
with open(results_file, 'w') as f:
yaml.dump(results, f, default_flow_style=False)
class InvalidFIDException(Exception):
pass
class FIDStatistics:
def __init__(self, mu: np.ndarray, sigma: np.ndarray):
self.mu = mu
self.sigma = sigma
def frechet_distance(self, other, eps=1e-6):
"""
Compute the Frechet distance between two sets of statistics.
"""
# https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L132
mu1, sigma1 = self.mu, self.sigma
mu2, sigma2 = other.mu, other.sigma
mu1 = np.atleast_1d(mu1)
mu2 = np.atleast_1d(mu2)
sigma1 = np.atleast_2d(sigma1)
sigma2 = np.atleast_2d(sigma2)
assert (
mu1.shape == mu2.shape
), f"Training and test mean vectors have different lengths: {mu1.shape}, {mu2.shape}"
assert (
sigma1.shape == sigma2.shape
), f"Training and test covariances have different dimensions: {sigma1.shape}, {sigma2.shape}"
diff = mu1 - mu2
# product might be almost singular
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
if not np.isfinite(covmean).all():
msg = (
"fid calculation produces singular product; adding %s to diagonal of cov estimates"
% eps
)
warnings.warn(msg)
offset = np.eye(sigma1.shape[0]) * eps
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
# numerical error might give slight imaginary component
if np.iscomplexobj(covmean):
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
m = np.max(np.abs(covmean.imag))
raise ValueError("Imaginary component {}".format(m))
covmean = covmean.real
tr_covmean = np.trace(covmean)
return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
class Evaluator:
def __init__(
self,
session,
batch_size=64,
softmax_batch_size=512,
):
self.sess = session
self.batch_size = batch_size
self.softmax_batch_size = softmax_batch_size
self.manifold_estimator = ManifoldEstimator(session)
with self.sess.graph.as_default():
self.image_input = tf.placeholder(tf.float32, shape=[None, None, None, 3])
self.softmax_input = tf.placeholder(tf.float32, shape=[None, 2048])
self.pool_features, self.spatial_features = _create_feature_graph(self.image_input)
self.softmax = _create_softmax_graph(self.softmax_input)
def warmup(self):
self.compute_activations(np.zeros([1, 8, 64, 64, 3]))
def read_activations(self, npz_path: str) -> Tuple[np.ndarray, np.ndarray]:
with open_npz_array(npz_path, "arr_0") as reader:
return self.compute_activations(reader.read_batches(self.batch_size))
def compute_activations(self, batches: Iterable[np.ndarray],silent=False) -> Tuple[np.ndarray, np.ndarray]:
"""
Compute image features for downstream evals.
:param batches: an iterator over NHWC numpy arrays in [0, 255].
:return: a tuple of numpy arrays of shape [N x X], where X is a feature
dimension. The tuple is (pool_3, spatial).
"""
preds = []
spatial_preds = []
it = batches if silent else tqdm(batches)
for batch in it:
batch = batch.astype(np.float32)
pred, spatial_pred = self.sess.run(
[self.pool_features, self.spatial_features], {self.image_input: batch}
)
preds.append(pred.reshape([pred.shape[0], -1]))
spatial_preds.append(spatial_pred.reshape([spatial_pred.shape[0], -1]))
return (
np.concatenate(preds, axis=0),
np.concatenate(spatial_preds, axis=0),
)
def read_statistics(
self, npz_path: str, activations: Tuple[np.ndarray, np.ndarray]
) -> Tuple[FIDStatistics, FIDStatistics]:
obj = np.load(npz_path)
if "mu" in list(obj.keys()):
return FIDStatistics(obj["mu"], obj["sigma"]), FIDStatistics(
obj["mu_s"], obj["sigma_s"]
)
return tuple(self.compute_statistics(x) for x in activations)
def compute_statistics(self, activations: np.ndarray) -> FIDStatistics:
mu = np.mean(activations, axis=0)
sigma = np.cov(activations, rowvar=False)
return FIDStatistics(mu, sigma)
def compute_inception_score(self, activations: np.ndarray, split_size: int = 5000) -> float:
softmax_out = []
for i in range(0, len(activations), self.softmax_batch_size):
acts = activations[i : i + self.softmax_batch_size]
softmax_out.append(self.sess.run(self.softmax, feed_dict={self.softmax_input: acts}))
preds = np.concatenate(softmax_out, axis=0)
# https://github.com/openai/improved-gan/blob/4f5d1ec5c16a7eceb206f42bfc652693601e1d5c/inception_score/model.py#L46
scores = []
for i in range(0, len(preds), split_size):
part = preds[i : i + split_size]
kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
kl = np.mean(np.sum(kl, 1))
scores.append(np.exp(kl))
return float(np.mean(scores))
def compute_prec_recall(
self, activations_ref: np.ndarray, activations_sample: np.ndarray
) -> Tuple[float, float]:
radii_1 = self.manifold_estimator.manifold_radii(activations_ref)
radii_2 = self.manifold_estimator.manifold_radii(activations_sample)
pr = self.manifold_estimator.evaluate_pr(
activations_ref, radii_1, activations_sample, radii_2
)
return (float(pr[0][0]), float(pr[1][0]))
class ManifoldEstimator:
"""
A helper for comparing manifolds of feature vectors.
Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L57
"""
def __init__(
self,
session,
row_batch_size=10000,
col_batch_size=10000,
nhood_sizes=(3,),
clamp_to_percentile=None,
eps=1e-5,
):
"""
Estimate the manifold of given feature vectors.
:param session: the TensorFlow session.
:param row_batch_size: row batch size to compute pairwise distances
(parameter to trade-off between memory usage and performance).
:param col_batch_size: column batch size to compute pairwise distances.
:param nhood_sizes: number of neighbors used to estimate the manifold.
:param clamp_to_percentile: prune hyperspheres that have radius larger than
the given percentile.
:param eps: small number for numerical stability.
"""
self.distance_block = DistanceBlock(session)
self.row_batch_size = row_batch_size
self.col_batch_size = col_batch_size
self.nhood_sizes = nhood_sizes
self.num_nhoods = len(nhood_sizes)
self.clamp_to_percentile = clamp_to_percentile
self.eps = eps
def warmup(self):
feats, radii = (
np.zeros([1, 2048], dtype=np.float32),
np.zeros([1, 1], dtype=np.float32),
)
self.evaluate_pr(feats, radii, feats, radii)
def manifold_radii(self, features: np.ndarray) -> np.ndarray:
num_images = len(features)
# Estimate manifold of features by calculating distances to k-NN of each sample.
radii = np.zeros([num_images, self.num_nhoods], dtype=np.float32)
distance_batch = np.zeros([self.row_batch_size, num_images], dtype=np.float32)
seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32)
for begin1 in range(0, num_images, self.row_batch_size):
end1 = min(begin1 + self.row_batch_size, num_images)
row_batch = features[begin1:end1]
for begin2 in range(0, num_images, self.col_batch_size):
end2 = min(begin2 + self.col_batch_size, num_images)
col_batch = features[begin2:end2]
# Compute distances between batches.
distance_batch[
0 : end1 - begin1, begin2:end2
] = self.distance_block.pairwise_distances(row_batch, col_batch)
# Find the k-nearest neighbor from the current batch.
radii[begin1:end1, :] = np.concatenate(
[
x[:, self.nhood_sizes]
for x in _numpy_partition(distance_batch[0 : end1 - begin1, :], seq, axis=1)
],
axis=0,
)
if self.clamp_to_percentile is not None:
max_distances = np.percentile(radii, self.clamp_to_percentile, axis=0)
radii[radii > max_distances] = 0
return radii
def evaluate(self, features: np.ndarray, radii: np.ndarray, eval_features: np.ndarray):
"""
Evaluate if new feature vectors are at the manifold.
"""
num_eval_images = eval_features.shape[0]
num_ref_images = radii.shape[0]
distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32)
batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32)
max_realism_score = np.zeros([num_eval_images], dtype=np.float32)
nearest_indices = np.zeros([num_eval_images], dtype=np.int32)
for begin1 in range(0, num_eval_images, self.row_batch_size):
end1 = min(begin1 + self.row_batch_size, num_eval_images)
feature_batch = eval_features[begin1:end1]
for begin2 in range(0, num_ref_images, self.col_batch_size):
end2 = min(begin2 + self.col_batch_size, num_ref_images)
ref_batch = features[begin2:end2]
distance_batch[
0 : end1 - begin1, begin2:end2
] = self.distance_block.pairwise_distances(feature_batch, ref_batch)
# From the minibatch of new feature vectors, determine if they are in the estimated manifold.
# If a feature vector is inside a hypersphere of some reference sample, then
# the new sample lies at the estimated manifold.
# The radii of the hyperspheres are determined from distances of neighborhood size k.
samples_in_manifold = distance_batch[0 : end1 - begin1, :, None] <= radii
batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32)
max_realism_score[begin1:end1] = np.max(
radii[:, 0] / (distance_batch[0 : end1 - begin1, :] + self.eps), axis=1
)
nearest_indices[begin1:end1] = np.argmin(distance_batch[0 : end1 - begin1, :], axis=1)
return {
"fraction": float(np.mean(batch_predictions)),
"batch_predictions": batch_predictions,
"max_realisim_score": max_realism_score,
"nearest_indices": nearest_indices,
}
def evaluate_pr(
self,
features_1: np.ndarray,
radii_1: np.ndarray,
features_2: np.ndarray,
radii_2: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
"""
Evaluate precision and recall efficiently.
:param features_1: [N1 x D] feature vectors for reference batch.
:param radii_1: [N1 x K1] radii for reference vectors.
:param features_2: [N2 x D] feature vectors for the other batch.
:param radii_2: [N2 x K2] radii for other vectors.
:return: a tuple of arrays for (precision, recall):
- precision: an np.ndarray of length K1
- recall: an np.ndarray of length K2
"""
features_1_status = np.zeros([len(features_1), radii_2.shape[1]], dtype=bool)
features_2_status = np.zeros([len(features_2), radii_1.shape[1]], dtype=bool)
for begin_1 in range(0, len(features_1), self.row_batch_size):
end_1 = begin_1 + self.row_batch_size
batch_1 = features_1[begin_1:end_1]
for begin_2 in range(0, len(features_2), self.col_batch_size):
end_2 = begin_2 + self.col_batch_size
batch_2 = features_2[begin_2:end_2]
batch_1_in, batch_2_in = self.distance_block.less_thans(
batch_1, radii_1[begin_1:end_1], batch_2, radii_2[begin_2:end_2]
)
features_1_status[begin_1:end_1] |= batch_1_in
features_2_status[begin_2:end_2] |= batch_2_in
return (
np.mean(features_2_status.astype(np.float64), axis=0),
np.mean(features_1_status.astype(np.float64), axis=0),
)
class DistanceBlock:
"""
Calculate pairwise distances between vectors.
Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L34
"""
def __init__(self, session):
self.session = session
# Initialize TF graph to calculate pairwise distances.
with session.graph.as_default():
self._features_batch1 = tf.placeholder(tf.float32, shape=[None, None])
self._features_batch2 = tf.placeholder(tf.float32, shape=[None, None])
distance_block_16 = _batch_pairwise_distances(
tf.cast(self._features_batch1, tf.float16),
tf.cast(self._features_batch2, tf.float16),
)
self.distance_block = tf.cond(
tf.reduce_all(tf.math.is_finite(distance_block_16)),
lambda: tf.cast(distance_block_16, tf.float32),
lambda: _batch_pairwise_distances(self._features_batch1, self._features_batch2),
)
# Extra logic for less thans.
self._radii1 = tf.placeholder(tf.float32, shape=[None, None])
self._radii2 = tf.placeholder(tf.float32, shape=[None, None])
dist32 = tf.cast(self.distance_block, tf.float32)[..., None]
self._batch_1_in = tf.math.reduce_any(dist32 <= self._radii2, axis=1)
self._batch_2_in = tf.math.reduce_any(dist32 <= self._radii1[:, None], axis=0)
def pairwise_distances(self, U, V):
"""
Evaluate pairwise distances between two batches of feature vectors.
"""
return self.session.run(
self.distance_block,
feed_dict={self._features_batch1: U, self._features_batch2: V},
)
def less_thans(self, batch_1, radii_1, batch_2, radii_2):
return self.session.run(
[self._batch_1_in, self._batch_2_in],
feed_dict={
self._features_batch1: batch_1,
self._features_batch2: batch_2,
self._radii1: radii_1,
self._radii2: radii_2,
},
)
def _batch_pairwise_distances(U, V):
"""
Compute pairwise distances between two batches of feature vectors.
"""
with tf.variable_scope("pairwise_dist_block"):
# Squared norms of each row in U and V.
norm_u = tf.reduce_sum(tf.square(U), 1)
norm_v = tf.reduce_sum(tf.square(V), 1)
# norm_u as a column and norm_v as a row vectors.
norm_u = tf.reshape(norm_u, [-1, 1])
norm_v = tf.reshape(norm_v, [1, -1])
# Pairwise squared Euclidean distances.
D = tf.maximum(norm_u - 2 * tf.matmul(U, V, False, True) + norm_v, 0.0)
return D
class NpzArrayReader(ABC):
@abstractmethod
def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
pass
@abstractmethod
def remaining(self) -> int:
pass
def read_batches(self, batch_size: int) -> Iterable[np.ndarray]:
def gen_fn():
while True:
batch = self.read_batch(batch_size)
if batch is None:
break
yield batch
rem = self.remaining()
num_batches = rem // batch_size + int(rem % batch_size != 0)
return BatchIterator(gen_fn, num_batches)
class BatchIterator:
def __init__(self, gen_fn, length):
self.gen_fn = gen_fn
self.length = length
def __len__(self):
return self.length
def __iter__(self):
return self.gen_fn()
class StreamingNpzArrayReader(NpzArrayReader):
def __init__(self, arr_f, shape, dtype):
self.arr_f = arr_f
self.shape = shape
self.dtype = dtype
self.idx = 0
def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
if self.idx >= self.shape[0]:
return None
bs = min(batch_size, self.shape[0] - self.idx)
self.idx += bs
if self.dtype.itemsize == 0:
return np.ndarray([bs, *self.shape[1:]], dtype=self.dtype)
read_count = bs * np.prod(self.shape[1:])
read_size = int(read_count * self.dtype.itemsize)
data = _read_bytes(self.arr_f, read_size, "array data")
return np.frombuffer(data, dtype=self.dtype).reshape([bs, *self.shape[1:]])
def remaining(self) -> int:
return max(0, self.shape[0] - self.idx)
class MemoryNpzArrayReader(NpzArrayReader):
def __init__(self, arr):
self.arr = arr
self.idx = 0
@classmethod
def load(cls, path: str, arr_name: str):
with open(path, "rb") as f:
arr = np.load(f)[arr_name]
return cls(arr)
def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
if self.idx >= self.arr.shape[0]:
return None
res = self.arr[self.idx : self.idx + batch_size]
self.idx += batch_size
return res
def remaining(self) -> int:
return max(0, self.arr.shape[0] - self.idx)
@contextmanager
def open_npz_array(path: str, arr_name: str) -> NpzArrayReader:
with _open_npy_file(path, arr_name) as arr_f:
version = np.lib.format.read_magic(arr_f)
if version == (1, 0):
header = np.lib.format.read_array_header_1_0(arr_f)
elif version == (2, 0):
header = np.lib.format.read_array_header_2_0(arr_f)
else:
yield MemoryNpzArrayReader.load(path, arr_name)
return
shape, fortran, dtype = header
if fortran or dtype.hasobject:
yield MemoryNpzArrayReader.load(path, arr_name)
else:
yield StreamingNpzArrayReader(arr_f, shape, dtype)
def _read_bytes(fp, size, error_template="ran out of data"):
"""
Copied from: https://github.com/numpy/numpy/blob/fb215c76967739268de71aa4bda55dd1b062bc2e/numpy/lib/format.py#L788-L886
Read from file-like object until size bytes are read.
Raises ValueError if EOF is encountered before size bytes are read.
Non-blocking objects only supported if they derive from io objects.
Required as e.g. ZipExtFile in python 2.6 can return less data than
requested.
"""
data = bytes()
while True:
# io files (default in python3) return None or raise on
# would-block, python2 file will truncate, probably nothing can be
# done about that. note that regular files can't be non-blocking
try:
r = fp.read(size - len(data))
data += r
if len(r) == 0 or len(data) == size:
break
except io.BlockingIOError:
pass
if len(data) != size:
msg = "EOF: reading %s, expected %d bytes got %d"
raise ValueError(msg % (error_template, size, len(data)))
else:
return data
@contextmanager
def _open_npy_file(path: str, arr_name: str):
with open(path, "rb") as f:
with zipfile.ZipFile(f, "r") as zip_f:
if f"{arr_name}.npy" not in zip_f.namelist():
raise ValueError(f"missing {arr_name} in npz file")
with zip_f.open(f"{arr_name}.npy", "r") as arr_f:
yield arr_f
def _download_inception_model():
if os.path.exists(INCEPTION_V3_PATH):
return
print("downloading InceptionV3 model...")
with requests.get(INCEPTION_V3_URL, stream=True) as r:
r.raise_for_status()
tmp_path = INCEPTION_V3_PATH + ".tmp"
with open(tmp_path, "wb") as f:
for chunk in tqdm(r.iter_content(chunk_size=8192)):
f.write(chunk)
os.rename(tmp_path, INCEPTION_V3_PATH)
def _create_feature_graph(input_batch):
_download_inception_model()
prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
with open(INCEPTION_V3_PATH, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
pool3, spatial = tf.import_graph_def(
graph_def,
input_map={f"ExpandDims:0": input_batch},
return_elements=[FID_POOL_NAME, FID_SPATIAL_NAME],
name=prefix,
)
_update_shapes(pool3)
spatial = spatial[..., :7]
return pool3, spatial
def _create_softmax_graph(input_batch):
_download_inception_model()
prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
with open(INCEPTION_V3_PATH, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
(matmul,) = tf.import_graph_def(
graph_def, return_elements=[f"softmax/logits/MatMul"], name=prefix
)
w = matmul.inputs[1]
logits = tf.matmul(input_batch, w)
return tf.nn.softmax(logits)
def _update_shapes(pool3):
# https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L50-L63
ops = pool3.graph.get_operations()
for op in ops:
for o in op.outputs:
shape = o.get_shape()
if shape._dims is not None: # pylint: disable=protected-access
# shape = [s.value for s in shape] TF 1.x
shape = [s for s in shape] # TF 2.x
new_shape = []
for j, s in enumerate(shape):
if s == 1 and j == 0:
new_shape.append(None)
else:
new_shape.append(s)
o.__dict__["_shape_val"] = tf.TensorShape(new_shape)
return pool3
def _numpy_partition(arr, kth, **kwargs):
num_workers = min(cpu_count(), len(arr))
chunk_size = len(arr) // num_workers
extra = len(arr) % num_workers
start_idx = 0
batches = []
for i in range(num_workers):
size = chunk_size + (1 if i < extra else 0)
batches.append(arr[start_idx : start_idx + size])
start_idx += size
with ThreadPool(num_workers) as pool:
return list(pool.map(partial(np.partition, kth=kth, **kwargs), batches))
if __name__ == "__main__":
print(REQUIREMENTS)
main()

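The script above expects two .npz batches passed via --ref_batch and --sample_batch, each containing either an "arr_0" array of NHWC uint8 images in [0, 255] or precomputed "mu"/"sigma" statistics. The quantity returned by FIDStatistics.frechet_distance is the standard squared Fréchet distance between Gaussians fitted to Inception features; a condensed numpy restatement (omitting the numerical-stability handling in the full method):

    import numpy as np
    from scipy import linalg

    def fid(mu1, sigma1, mu2, sigma2):
        # ||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * (sigma1 sigma2)^(1/2))
        diff = mu1 - mu2
        covmean = np.real(linalg.sqrtm(sigma1.dot(sigma2), disp=False)[0])
        return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * np.trace(covmean)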
@@ -0,0 +1,630 @@
import argparse
import glob
import os
from tqdm import tqdm
from collections import namedtuple
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
from ldm.modules.evaluate.ssim import ssim
transform = transforms.Compose([transforms.ToTensor()])
def normalize_tensor(in_feat, eps=1e-10):
norm_factor = torch.sqrt(torch.sum(in_feat ** 2, dim=1)).view(
in_feat.size()[0], 1, in_feat.size()[2], in_feat.size()[3]
)
return in_feat / (norm_factor.expand_as(in_feat) + eps)
def cos_sim(in0, in1):
in0_norm = normalize_tensor(in0)
in1_norm = normalize_tensor(in1)
N = in0.size()[0]
X = in0.size()[2]
Y = in0.size()[3]
return torch.mean(
torch.mean(
torch.sum(in0_norm * in1_norm, dim=1).view(N, 1, X, Y), dim=2
).view(N, 1, 1, Y),
dim=3,
).view(N)
class squeezenet(torch.nn.Module):
def __init__(self, requires_grad=False, pretrained=True):
super(squeezenet, self).__init__()
pretrained_features = models.squeezenet1_1(
pretrained=pretrained
).features
self.slice1 = torch.nn.Sequential()
self.slice2 = torch.nn.Sequential()
self.slice3 = torch.nn.Sequential()
self.slice4 = torch.nn.Sequential()
self.slice5 = torch.nn.Sequential()
self.slice6 = torch.nn.Sequential()
self.slice7 = torch.nn.Sequential()
self.N_slices = 7
for x in range(2):
self.slice1.add_module(str(x), pretrained_features[x])
for x in range(2, 5):
self.slice2.add_module(str(x), pretrained_features[x])
for x in range(5, 8):
self.slice3.add_module(str(x), pretrained_features[x])
for x in range(8, 10):
self.slice4.add_module(str(x), pretrained_features[x])
for x in range(10, 11):
self.slice5.add_module(str(x), pretrained_features[x])
for x in range(11, 12):
self.slice6.add_module(str(x), pretrained_features[x])
for x in range(12, 13):
self.slice7.add_module(str(x), pretrained_features[x])
if not requires_grad:
for param in self.parameters():
param.requires_grad = False
def forward(self, X):
h = self.slice1(X)
h_relu1 = h
h = self.slice2(h)
h_relu2 = h
h = self.slice3(h)
h_relu3 = h
h = self.slice4(h)
h_relu4 = h
h = self.slice5(h)
h_relu5 = h
h = self.slice6(h)
h_relu6 = h
h = self.slice7(h)
h_relu7 = h
vgg_outputs = namedtuple(
"SqueezeOutputs",
["relu1", "relu2", "relu3", "relu4", "relu5", "relu6", "relu7"],
)
out = vgg_outputs(
h_relu1, h_relu2, h_relu3, h_relu4, h_relu5, h_relu6, h_relu7
)
return out
class alexnet(torch.nn.Module):
def __init__(self, requires_grad=False, pretrained=True):
super(alexnet, self).__init__()
alexnet_pretrained_features = models.alexnet(
pretrained=pretrained
).features
self.slice1 = torch.nn.Sequential()
self.slice2 = torch.nn.Sequential()
self.slice3 = torch.nn.Sequential()
self.slice4 = torch.nn.Sequential()
self.slice5 = torch.nn.Sequential()
self.N_slices = 5
for x in range(2):
self.slice1.add_module(str(x), alexnet_pretrained_features[x])
for x in range(2, 5):
self.slice2.add_module(str(x), alexnet_pretrained_features[x])
for x in range(5, 8):
self.slice3.add_module(str(x), alexnet_pretrained_features[x])
for x in range(8, 10):
self.slice4.add_module(str(x), alexnet_pretrained_features[x])
for x in range(10, 12):
self.slice5.add_module(str(x), alexnet_pretrained_features[x])
if not requires_grad:
for param in self.parameters():
param.requires_grad = False
def forward(self, X):
h = self.slice1(X)
h_relu1 = h
h = self.slice2(h)
h_relu2 = h
h = self.slice3(h)
h_relu3 = h
h = self.slice4(h)
h_relu4 = h
h = self.slice5(h)
h_relu5 = h
alexnet_outputs = namedtuple(
"AlexnetOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5"]
)
out = alexnet_outputs(h_relu1, h_relu2, h_relu3, h_relu4, h_relu5)
return out
class vgg16(torch.nn.Module):
def __init__(self, requires_grad=False, pretrained=True):
super(vgg16, self).__init__()
vgg_pretrained_features = models.vgg16(pretrained=pretrained).features
self.slice1 = torch.nn.Sequential()
self.slice2 = torch.nn.Sequential()
self.slice3 = torch.nn.Sequential()
self.slice4 = torch.nn.Sequential()
self.slice5 = torch.nn.Sequential()
self.N_slices = 5
for x in range(4):
self.slice1.add_module(str(x), vgg_pretrained_features[x])
for x in range(4, 9):
self.slice2.add_module(str(x), vgg_pretrained_features[x])
for x in range(9, 16):
self.slice3.add_module(str(x), vgg_pretrained_features[x])
for x in range(16, 23):
self.slice4.add_module(str(x), vgg_pretrained_features[x])
for x in range(23, 30):
self.slice5.add_module(str(x), vgg_pretrained_features[x])
if not requires_grad:
for param in self.parameters():
param.requires_grad = False
def forward(self, X):
h = self.slice1(X)
h_relu1_2 = h
h = self.slice2(h)
h_relu2_2 = h
h = self.slice3(h)
h_relu3_3 = h
h = self.slice4(h)
h_relu4_3 = h
h = self.slice5(h)
h_relu5_3 = h
vgg_outputs = namedtuple(
"VggOutputs",
["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"],
)
out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
return out
class resnet(torch.nn.Module):
def __init__(self, requires_grad=False, pretrained=True, num=18):
super(resnet, self).__init__()
if num == 18:
self.net = models.resnet18(pretrained=pretrained)
elif num == 34:
self.net = models.resnet34(pretrained=pretrained)
elif num == 50:
self.net = models.resnet50(pretrained=pretrained)
elif num == 101:
self.net = models.resnet101(pretrained=pretrained)
elif num == 152:
self.net = models.resnet152(pretrained=pretrained)
self.N_slices = 5
self.conv1 = self.net.conv1
self.bn1 = self.net.bn1
self.relu = self.net.relu
self.maxpool = self.net.maxpool
self.layer1 = self.net.layer1
self.layer2 = self.net.layer2
self.layer3 = self.net.layer3
self.layer4 = self.net.layer4
def forward(self, X):
h = self.conv1(X)
h = self.bn1(h)
h = self.relu(h)
h_relu1 = h
h = self.maxpool(h)
h = self.layer1(h)
h_conv2 = h
h = self.layer2(h)
h_conv3 = h
h = self.layer3(h)
h_conv4 = h
h = self.layer4(h)
h_conv5 = h
outputs = namedtuple(
"Outputs", ["relu1", "conv2", "conv3", "conv4", "conv5"]
)
out = outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5)
return out
# Off-the-shelf deep network
class PNet(torch.nn.Module):
"""Pre-trained network with all channels equally weighted by default"""
def __init__(self, pnet_type="vgg", pnet_rand=False, use_gpu=True):
super(PNet, self).__init__()
self.use_gpu = use_gpu
self.pnet_type = pnet_type
self.pnet_rand = pnet_rand
self.shift = torch.Tensor([-0.030, -0.088, -0.188]).view(1, 3, 1, 1)
self.scale = torch.Tensor([0.458, 0.448, 0.450]).view(1, 3, 1, 1)
if self.pnet_type in ["vgg", "vgg16"]:
self.net = vgg16(pretrained=not self.pnet_rand, requires_grad=False)
elif self.pnet_type == "alex":
self.net = alexnet(
pretrained=not self.pnet_rand, requires_grad=False
)
elif self.pnet_type[:-2] == "resnet":
self.net = resnet(
pretrained=not self.pnet_rand,
requires_grad=False,
num=int(self.pnet_type[-2:]),
)
elif self.pnet_type == "squeeze":
self.net = squeezenet(
pretrained=not self.pnet_rand, requires_grad=False
)
self.L = self.net.N_slices
if use_gpu:
self.net.cuda()
self.shift = self.shift.cuda()
self.scale = self.scale.cuda()
def forward(self, in0, in1, retPerLayer=False):
in0_sc = (in0 - self.shift.expand_as(in0)) / self.scale.expand_as(in0)
in1_sc = (in1 - self.shift.expand_as(in0)) / self.scale.expand_as(in0)
outs0 = self.net.forward(in0_sc)
outs1 = self.net.forward(in1_sc)
if retPerLayer:
all_scores = []
for (kk, out0) in enumerate(outs0):
cur_score = 1.0 - cos_sim(outs0[kk], outs1[kk])
if kk == 0:
val = 1.0 * cur_score
else:
val = val + cur_score
if retPerLayer:
all_scores += [cur_score]
if retPerLayer:
return (val, all_scores)
else:
return val
# The SSIM metric
def ssim_metric(img1, img2, mask=None):
return ssim(img1, img2, mask=mask, size_average=False)
# The PSNR metric
def psnr(img1, img2, mask=None,reshape=False):
b = img1.size(0)
if not (mask is None):
b = img1.size(0)
mse_err = (img1 - img2).pow(2) * mask
if reshape:
mse_err = mse_err.reshape(b, -1).sum(dim=1) / (
3 * mask.reshape(b, -1).sum(dim=1).clamp(min=1)
)
else:
mse_err = mse_err.view(b, -1).sum(dim=1) / (
3 * mask.view(b, -1).sum(dim=1).clamp(min=1)
)
else:
if reshape:
mse_err = (img1 - img2).pow(2).reshape(b, -1).mean(dim=1)
else:
mse_err = (img1 - img2).pow(2).view(b, -1).mean(dim=1)
psnr = 10 * (1 / mse_err).log10()
return psnr
# The perceptual similarity metric
def perceptual_sim(img1, img2, vgg16):
# First extract features
dist = vgg16(img1 * 2 - 1, img2 * 2 - 1)
return dist
def load_img(img_name, size=None):
try:
img = Image.open(img_name)
if type(size) == int:
img = img.resize((size, size))
elif size is not None:
img = img.resize((size[1], size[0]))
img = transform(img).cuda()
img = img.unsqueeze(0)
except Exception as e:
print("Failed at loading %s " % img_name)
print(e)
img = torch.zeros(1, 3, 256, 256).cuda()
raise
return img
def compute_perceptual_similarity(folder, pred_img, tgt_img, take_every_other):
# Load VGG16 for feature similarity
vgg16 = PNet().to("cuda")
vgg16.eval()
vgg16.cuda()
values_percsim = []
values_ssim = []
values_psnr = []
folders = os.listdir(folder)
for i, f in tqdm(enumerate(sorted(folders))):
pred_imgs = glob.glob(folder + f + "/" + pred_img)
tgt_imgs = glob.glob(folder + f + "/" + tgt_img)
assert len(tgt_imgs) == 1
perc_sim = 10000
ssim_sim = -10
psnr_sim = -10
for p_img in pred_imgs:
t_img = load_img(tgt_imgs[0])
p_img = load_img(p_img, size=t_img.shape[2:])
t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
perc_sim = min(perc_sim, t_perc_sim)
ssim_sim = max(ssim_sim, ssim_metric(p_img, t_img).item())
psnr_sim = max(psnr_sim, psnr(p_img, t_img).item())
values_percsim += [perc_sim]
values_ssim += [ssim_sim]
values_psnr += [psnr_sim]
if take_every_other:
n_valuespercsim = []
n_valuesssim = []
n_valuespsnr = []
for i in range(0, len(values_percsim) // 2):
n_valuespercsim += [
min(values_percsim[2 * i], values_percsim[2 * i + 1])
]
n_valuespsnr += [max(values_psnr[2 * i], values_psnr[2 * i + 1])]
n_valuesssim += [max(values_ssim[2 * i], values_ssim[2 * i + 1])]
values_percsim = n_valuespercsim
values_ssim = n_valuesssim
values_psnr = n_valuespsnr
avg_percsim = np.mean(np.array(values_percsim))
std_percsim = np.std(np.array(values_percsim))
avg_psnr = np.mean(np.array(values_psnr))
std_psnr = np.std(np.array(values_psnr))
avg_ssim = np.mean(np.array(values_ssim))
std_ssim = np.std(np.array(values_ssim))
return {
"Perceptual similarity": (avg_percsim, std_percsim),
"PSNR": (avg_psnr, std_psnr),
"SSIM": (avg_ssim, std_ssim),
}
def compute_perceptual_similarity_from_list(pred_imgs_list, tgt_imgs_list,
take_every_other,
simple_format=True):
# Load VGG16 for feature similarity
vgg16 = PNet().to("cuda")
vgg16.eval()
vgg16.cuda()
values_percsim = []
values_ssim = []
values_psnr = []
equal_count = 0
ambig_count = 0
for i, tgt_img in enumerate(tqdm(tgt_imgs_list)):
pred_imgs = pred_imgs_list[i]
tgt_imgs = [tgt_img]
assert len(tgt_imgs) == 1
if type(pred_imgs) != list:
pred_imgs = [pred_imgs]
perc_sim = 10000
ssim_sim = -10
psnr_sim = -10
assert len(pred_imgs)>0
for p_img in pred_imgs:
t_img = load_img(tgt_imgs[0])
p_img = load_img(p_img, size=t_img.shape[2:])
t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
perc_sim = min(perc_sim, t_perc_sim)
ssim_sim = max(ssim_sim, ssim_metric(p_img, t_img).item())
psnr_sim = max(psnr_sim, psnr(p_img, t_img).item())
values_percsim += [perc_sim]
values_ssim += [ssim_sim]
if psnr_sim != float("inf"):
values_psnr += [psnr_sim]
else:
if torch.allclose(p_img, t_img):
equal_count += 1
print("{} equal src and wrp images.".format(equal_count))
else:
ambig_count += 1
print("{} ambiguous src and wrp images.".format(ambig_count))
if take_every_other:
n_valuespercsim = []
n_valuesssim = []
n_valuespsnr = []
for i in range(0, len(values_percsim) // 2):
n_valuespercsim += [
min(values_percsim[2 * i], values_percsim[2 * i + 1])
]
n_valuespsnr += [max(values_psnr[2 * i], values_psnr[2 * i + 1])]
n_valuesssim += [max(values_ssim[2 * i], values_ssim[2 * i + 1])]
values_percsim = n_valuespercsim
values_ssim = n_valuesssim
values_psnr = n_valuespsnr
avg_percsim = np.mean(np.array(values_percsim))
std_percsim = np.std(np.array(values_percsim))
avg_psnr = np.mean(np.array(values_psnr))
std_psnr = np.std(np.array(values_psnr))
avg_ssim = np.mean(np.array(values_ssim))
std_ssim = np.std(np.array(values_ssim))
if simple_format:
# just to make yaml formatting readable
return {
"Perceptual similarity": [float(avg_percsim), float(std_percsim)],
"PSNR": [float(avg_psnr), float(std_psnr)],
"SSIM": [float(avg_ssim), float(std_ssim)],
}
else:
return {
"Perceptual similarity": (avg_percsim, std_percsim),
"PSNR": (avg_psnr, std_psnr),
"SSIM": (avg_ssim, std_ssim),
}
def compute_perceptual_similarity_from_list_topk(pred_imgs_list, tgt_imgs_list,
take_every_other, resize=False):
# Load VGG16 for feature similarity
vgg16 = PNet().to("cuda")
vgg16.eval()
vgg16.cuda()
values_percsim = []
values_ssim = []
values_psnr = []
individual_percsim = []
individual_ssim = []
individual_psnr = []
for i, tgt_img in enumerate(tqdm(tgt_imgs_list)):
pred_imgs = pred_imgs_list[i]
tgt_imgs = [tgt_img]
assert len(tgt_imgs) == 1
if type(pred_imgs) != list:
assert False
pred_imgs = [pred_imgs]
perc_sim = 10000
ssim_sim = -10
psnr_sim = -10
sample_percsim = list()
sample_ssim = list()
sample_psnr = list()
for p_img in pred_imgs:
if resize:
t_img = load_img(tgt_imgs[0], size=(256,256))
else:
t_img = load_img(tgt_imgs[0])
p_img = load_img(p_img, size=t_img.shape[2:])
t_perc_sim = perceptual_sim(p_img, t_img, vgg16).item()
sample_percsim.append(t_perc_sim)
perc_sim = min(perc_sim, t_perc_sim)
t_ssim = ssim_metric(p_img, t_img).item()
sample_ssim.append(t_ssim)
ssim_sim = max(ssim_sim, t_ssim)
t_psnr = psnr(p_img, t_img).item()
sample_psnr.append(t_psnr)
psnr_sim = max(psnr_sim, t_psnr)
values_percsim += [perc_sim]
values_ssim += [ssim_sim]
values_psnr += [psnr_sim]
individual_percsim.append(sample_percsim)
individual_ssim.append(sample_ssim)
individual_psnr.append(sample_psnr)
if take_every_other:
assert False, "Do this later, after specifying topk to get proper results"
n_valuespercsim = []
n_valuesssim = []
n_valuespsnr = []
for i in range(0, len(values_percsim) // 2):
n_valuespercsim += [
min(values_percsim[2 * i], values_percsim[2 * i + 1])
]
n_valuespsnr += [max(values_psnr[2 * i], values_psnr[2 * i + 1])]
n_valuesssim += [max(values_ssim[2 * i], values_ssim[2 * i + 1])]
values_percsim = n_valuespercsim
values_ssim = n_valuesssim
values_psnr = n_valuespsnr
avg_percsim = np.mean(np.array(values_percsim))
std_percsim = np.std(np.array(values_percsim))
avg_psnr = np.mean(np.array(values_psnr))
std_psnr = np.std(np.array(values_psnr))
avg_ssim = np.mean(np.array(values_ssim))
std_ssim = np.std(np.array(values_ssim))
individual_percsim = np.array(individual_percsim)
individual_psnr = np.array(individual_psnr)
individual_ssim = np.array(individual_ssim)
return {
"avg_of_best": {
"Perceptual similarity": [float(avg_percsim), float(std_percsim)],
"PSNR": [float(avg_psnr), float(std_psnr)],
"SSIM": [float(avg_ssim), float(std_ssim)],
},
"individual": {
"PSIM": individual_percsim,
"PSNR": individual_psnr,
"SSIM": individual_ssim,
}
}
if __name__ == "__main__":
args = argparse.ArgumentParser()
args.add_argument("--folder", type=str, default="")
args.add_argument("--pred_image", type=str, default="")
args.add_argument("--target_image", type=str, default="")
args.add_argument("--take_every_other", action="store_true", default=False)
args.add_argument("--output_file", type=str, default="")
opts = args.parse_args()
folder = opts.folder
pred_img = opts.pred_image
tgt_img = opts.target_image
results = compute_perceptual_similarity(
folder, pred_img, tgt_img, opts.take_every_other
)
f = open(opts.output_file, 'w')
for key in results:
print("%s for %s: \n" % (key, opts.folder))
print(
"\t {:0.4f} | {:0.4f} \n".format(results[key][0], results[key][1])
)
f.write("%s for %s: \n" % (key, opts.folder))
f.write(
"\t {:0.4f} | {:0.4f} \n".format(results[key][0], results[key][1])
)
f.close()

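Note that the psnr helper above takes the peak signal value to be 1, i.e. it assumes images already scaled to [0, 1]. A standalone equivalent for the unmasked, whole-batch case:

    import torch

    def psnr_01(img1, img2):
        # peak value assumed to be 1.0, so PSNR = 10 * log10(1 / MSE)
        mse = (img1 - img2).pow(2).mean()
        return 10 * torch.log10(1.0 / mse)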
@@ -0,0 +1,147 @@
# coding=utf-8
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python2, python3
"""Minimal Reference implementation for the Frechet Video Distance (FVD).
FVD is a metric for the quality of video generation models. It is inspired by
the FID (Frechet Inception Distance) used for images, but uses a different
embedding that is better suited to videos.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import tensorflow.compat.v1 as tf
import tensorflow_gan as tfgan
import tensorflow_hub as hub
def preprocess(videos, target_resolution):
"""Runs some preprocessing on the videos for I3D model.
Args:
videos: <T>[batch_size, num_frames, height, width, depth] The videos to be
preprocessed. We don't care about the specific dtype of the videos, it can
be anything that tf.image.resize_bilinear accepts. Values are expected to
be in the range 0-255.
target_resolution: (width, height): target video resolution
Returns:
videos: <float32>[batch_size, num_frames, height, width, depth]
"""
videos_shape = list(videos.shape)
all_frames = tf.reshape(videos, [-1] + videos_shape[-3:])
resized_videos = tf.image.resize_bilinear(all_frames, size=target_resolution)
target_shape = [videos_shape[0], -1] + list(target_resolution) + [3]
output_videos = tf.reshape(resized_videos, target_shape)
scaled_videos = 2. * tf.cast(output_videos, tf.float32) / 255. - 1
return scaled_videos
def _is_in_graph(tensor_name):
"""Checks whether a given tensor does exists in the graph."""
try:
tf.get_default_graph().get_tensor_by_name(tensor_name)
except KeyError:
return False
return True
def create_id3_embedding(videos,warmup=False,batch_size=16):
"""Embeds the given videos using the Inflated 3D Convolution ne twork.
Downloads the graph of the I3D from tf.hub and adds it to the graph on the
first call.
Args:
videos: <float32>[batch_size, num_frames, height=224, width=224, depth=3].
Expected range is [-1, 1].
Returns:
embedding: <float32>[batch_size, embedding_size]. embedding_size depends
on the model used.
Raises:
ValueError: when a provided embedding_layer is not supported.
"""
# batch_size = 16
module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"
# Making sure that we import the graph separately for
# each different input video tensor.
module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str(
videos.name).replace(":", "_")
assert_ops = [
tf.Assert(
tf.reduce_max(videos) <= 1.001,
["max value in frame is > 1", videos]),
tf.Assert(
tf.reduce_min(videos) >= -1.001,
["min value in frame is < -1", videos]),
tf.assert_equal(
tf.shape(videos)[0],
batch_size, ["invalid frame batch size: ",
tf.shape(videos)],
summarize=6),
]
with tf.control_dependencies(assert_ops):
videos = tf.identity(videos)
module_scope = "%s_apply_default/" % module_name
# To check whether the module has already been loaded into the graph, we look
# for a given tensor name. If this tensor name exists, we assume the function
# has been called before and the graph was imported. Otherwise we import it.
# Note: in theory, the tensor could exist, but have wrong shapes.
# This will happen if create_id3_embedding is called with a frames_placeholder
# of wrong size/batch size, because even though that will throw a tf.Assert
# on graph-execution time, it will insert the tensor (with wrong shape) into
# the graph. This is why we need the following assert.
if warmup:
video_batch_size = int(videos.shape[0])
assert video_batch_size in [batch_size, -1, None], f"Invalid batch size {video_batch_size}"
tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
if not _is_in_graph(tensor_name):
i3d_model = hub.Module(module_spec, name=module_name)
i3d_model(videos)
# gets the kinetics-i3d-400-logits layer
tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)
return tensor
def calculate_fvd(real_activations,
generated_activations):
"""Returns a list of ops that compute metrics as funcs of activations.
Args:
real_activations: <float32>[num_samples, embedding_size]
generated_activations: <float32>[num_samples, embedding_size]
Returns:
A scalar that contains the requested FVD.
"""
return tfgan.eval.frechet_classifier_distance_from_activations(
real_activations, generated_activations)

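These helpers follow the usual FVD recipe: resize frames to 224x224 and rescale to [-1, 1], embed each clip with the I3D network, then compare embedding statistics. A sketch along the lines of the upstream reference example, using dummy video tensors (shapes are illustrative; the I3D assert expects the default batch size of 16):

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()  # required when running under TF2

    with tf.Graph().as_default():
        # 16 dummy clips of 15 frames, 64x64 RGB, values in [0, 255]
        real = tf.zeros([16, 15, 64, 64, 3])
        fake = tf.ones([16, 15, 64, 64, 3]) * 255
        fvd = calculate_fvd(
            create_id3_embedding(preprocess(real, (224, 224))),
            create_id3_embedding(preprocess(fake, (224, 224))))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            print("FVD:", sess.run(fvd))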
@@ -0,0 +1,124 @@
# MIT Licence
# Methods to predict the SSIM, taken from
# https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py
from math import exp
import torch
import torch.nn.functional as F
from torch.autograd import Variable
def gaussian(window_size, sigma):
gauss = torch.Tensor(
[
exp(-((x - window_size // 2) ** 2) / float(2 * sigma ** 2))
for x in range(window_size)
]
)
return gauss / gauss.sum()
def create_window(window_size, channel):
_1D_window = gaussian(window_size, 1.5).unsqueeze(1)
_2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
window = Variable(
_2D_window.expand(channel, 1, window_size, window_size).contiguous()
)
return window
def _ssim(
img1, img2, window, window_size, channel, mask=None, size_average=True
):
mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
mu1_sq = mu1.pow(2)
mu2_sq = mu2.pow(2)
mu1_mu2 = mu1 * mu2
sigma1_sq = (
F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel)
- mu1_sq
)
sigma2_sq = (
F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel)
- mu2_sq
)
sigma12 = (
F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel)
- mu1_mu2
)
C1 = (0.01) ** 2
C2 = (0.03) ** 2
ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
(mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
)
if not (mask is None):
b = mask.size(0)
ssim_map = ssim_map.mean(dim=1, keepdim=True) * mask
ssim_map = ssim_map.view(b, -1).sum(dim=1) / mask.view(b, -1).sum(
dim=1
).clamp(min=1)
return ssim_map
if size_average:
return ssim_map.mean()
else:
return ssim_map.mean(1).mean(1).mean(1)
class SSIM(torch.nn.Module):
def __init__(self, window_size=11, size_average=True):
super(SSIM, self).__init__()
self.window_size = window_size
self.size_average = size_average
self.channel = 1
self.window = create_window(window_size, self.channel)
def forward(self, img1, img2, mask=None):
(_, channel, _, _) = img1.size()
if (
channel == self.channel
and self.window.data.type() == img1.data.type()
):
window = self.window
else:
window = create_window(self.window_size, channel)
if img1.is_cuda:
window = window.cuda(img1.get_device())
window = window.type_as(img1)
self.window = window
self.channel = channel
return _ssim(
img1,
img2,
window,
self.window_size,
channel,
mask,
self.size_average,
)
def ssim(img1, img2, window_size=11, mask=None, size_average=True):
(_, channel, _, _) = img1.size()
window = create_window(window_size, channel)
if img1.is_cuda:
window = window.cuda(img1.get_device())
window = window.type_as(img1)
return _ssim(img1, img2, window, window_size, channel, mask, size_average)

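A quick sanity check of the ssim entry point above, on batches of [N, C, H, W] tensors in [0, 1]:

    import torch

    a = torch.rand(4, 3, 64, 64)
    b = torch.rand(4, 3, 64, 64)
    print(ssim(a, a))  # identical inputs score (numerically) 1.0
    print(ssim(a, b))  # unrelated noise scores far lower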
@@ -0,0 +1,294 @@
# based on https://github.com/universome/fvd-comparison/blob/master/compare_models.py; huge thanks!
import os
import numpy as np
import io
import re
import requests
import html
import hashlib
import urllib
import urllib.request
import scipy.linalg
import multiprocessing as mp
import glob
from tqdm import tqdm
from typing import Any, List, Tuple, Union, Dict, Callable
from torchvision.io import read_video
import torch; torch.set_grad_enabled(False)
from einops import rearrange
from nitro.util import isvideo
def compute_frechet_distance(mu_sample,sigma_sample,mu_ref,sigma_ref) -> float:
print('Calculate frechet distance...')
m = np.square(mu_sample - mu_ref).sum()
s, _ = scipy.linalg.sqrtm(np.dot(sigma_sample, sigma_ref), disp=False) # pylint: disable=no-member
fid = np.real(m + np.trace(sigma_sample + sigma_ref - s * 2))
return float(fid)
def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
mu = feats.mean(axis=0) # [d]
sigma = np.cov(feats, rowvar=False) # [d, d]
return mu, sigma
def open_url(url: str, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False) -> Any:
"""Download the given URL and return a binary-mode file object to access the data."""
assert num_attempts >= 1
# Doesn't look like a URL scheme so interpret it as a local filename.
if not re.match('^[a-z]+://', url):
return url if return_filename else open(url, "rb")
# Handle file URLs. This code handles unusual file:// patterns that
# arise on Windows:
#
# file:///c:/foo.txt
#
# which would translate to a local '/c:/foo.txt' filename that's
# invalid. Drop the forward slash for such pathnames.
#
# If you touch this code path, you should test it on both Linux and
# Windows.
#
# Some internet resources suggest using urllib.request.url2pathname(),
# but that converts forward slashes to backslashes and this causes
# its own set of problems.
if url.startswith('file://'):
filename = urllib.parse.urlparse(url).path
if re.match(r'^/[a-zA-Z]:', filename):
filename = filename[1:]
return filename if return_filename else open(filename, "rb")
url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
# Download.
url_name = None
url_data = None
with requests.Session() as session:
if verbose:
print("Downloading %s ..." % url, end="", flush=True)
for attempts_left in reversed(range(num_attempts)):
try:
with session.get(url) as res:
res.raise_for_status()
if len(res.content) == 0:
raise IOError("No data received")
if len(res.content) < 8192:
content_str = res.content.decode("utf-8")
if "download_warning" in res.headers.get("Set-Cookie", ""):
links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
if len(links) == 1:
url = requests.compat.urljoin(url, links[0])
raise IOError("Google Drive virus checker nag")
if "Google Drive - Quota exceeded" in content_str:
raise IOError("Google Drive download quota exceeded -- please try again later")
match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", ""))
url_name = match[1] if match else url
url_data = res.content
if verbose:
print(" done")
break
except KeyboardInterrupt:
raise
except:
if not attempts_left:
if verbose:
print(" failed")
raise
if verbose:
print(".", end="", flush=True)
# Return data as file object.
assert not return_filename
return io.BytesIO(url_data)
def load_video(ip):
vid, *_ = read_video(ip)
vid = rearrange(vid, 't h w c -> t c h w').to(torch.uint8)
return vid
def get_data_from_str(input_str,nprc = None):
assert os.path.isdir(input_str), f'Specified input folder "{input_str}" is not a directory'
vid_filelist = glob.glob(os.path.join(input_str,'*.mp4'))
print(f'Found {len(vid_filelist)} videos in dir {input_str}')
if nprc is None:
try:
nprc = mp.cpu_count()
except NotImplementedError:
print('WARNING: cpu_count() not available, using only 1 cpu for video loading')
nprc = 1
pool = mp.Pool(processes=nprc)
vids = []
for v in tqdm(pool.imap_unordered(load_video,vid_filelist),total=len(vid_filelist),desc='Loading videos...'):
vids.append(v)
vids = torch.stack(vids,dim=0).float()
return vids
def get_stats(stats):
assert os.path.isfile(stats) and stats.endswith('.npz'), f'no stats found under {stats}'
print(f'Using precomputed statistics under {stats}')
stats = np.load(stats)
stats = {key: stats[key] for key in stats.files}
return stats
@torch.no_grad()
def compute_fvd(ref_input, sample_input, bs=32,
ref_stats=None,
sample_stats=None,
nprc_load=None):
calc_stats = ref_stats is None or sample_stats is None
if calc_stats:
only_ref = sample_stats is not None
only_sample = ref_stats is not None
if isinstance(ref_input,str) and not only_sample:
ref_input = get_data_from_str(ref_input,nprc_load)
if isinstance(sample_input, str) and not only_ref:
sample_input = get_data_from_str(sample_input, nprc_load)
stats = compute_statistics(sample_input,ref_input,
device='cuda' if torch.cuda.is_available() else 'cpu',
bs=bs,
only_ref=only_ref,
only_sample=only_sample)
if only_ref:
stats.update(get_stats(sample_stats))
elif only_sample:
stats.update(get_stats(ref_stats))
else:
stats = get_stats(sample_stats)
stats.update(get_stats(ref_stats))
fvd = compute_frechet_distance(**stats)
return {'FVD' : fvd,}
@torch.no_grad()
def compute_statistics(videos_fake, videos_real, device: str='cuda', bs=32, only_ref=False,only_sample=False) -> Dict:
detector_url = 'https://www.dropbox.com/s/ge9e5ujwgetktms/i3d_torchscript.pt?dl=1'
detector_kwargs = dict(rescale=True, resize=True, return_features=True) # Return raw features before the softmax layer.
with open_url(detector_url, verbose=False) as f:
detector = torch.jit.load(f).eval().to(device)
assert not (only_sample and only_ref), 'only_ref and only_sample arguments are mutually exclusive'
ref_embed, sample_embed = [], []
info = f'Computing I3D activations for FVD score with batch size {bs}'
if only_ref:
if not isvideo(videos_real):
# if not a video tensor, we assume numpy arrays of shape (n_vids, t, h, w, c) in range [0, 255]
videos_real = torch.from_numpy(videos_real).permute(0, 4, 1, 2, 3).float()
print(videos_real.shape)
if videos_real.shape[0] % bs == 0:
n_secs = videos_real.shape[0] // bs
else:
n_secs = videos_real.shape[0] // bs + 1
videos_real = torch.tensor_split(videos_real, n_secs, dim=0)
for ref_v in tqdm(videos_real, total=len(videos_real),desc=info):
feats_ref = detector(ref_v.to(device).contiguous(), **detector_kwargs).cpu().numpy()
ref_embed.append(feats_ref)
elif only_sample:
if not isvideo(videos_fake):
# if not a video tensor, we assume numpy arrays of shape (n_vids, t, h, w, c) in range [0, 255]
videos_fake = torch.from_numpy(videos_fake).permute(0, 4, 1, 2, 3).float()
print(videos_fake.shape)
if videos_fake.shape[0] % bs == 0:
n_secs = videos_fake.shape[0] // bs
else:
n_secs = videos_fake.shape[0] // bs + 1
videos_fake = torch.tensor_split(videos_fake, n_secs, dim=0)
for sample_v in tqdm(videos_fake, total=len(videos_fake), desc=info):
feats_sample = detector(sample_v.to(device).contiguous(), **detector_kwargs).cpu().numpy()
sample_embed.append(feats_sample)
else:
if not isvideo(videos_real):
# if not a video tensor, we assume numpy arrays of shape (n_vids, t, h, w, c) in range [0, 255]
videos_real = torch.from_numpy(videos_real).permute(0, 4, 1, 2, 3).float()
if not isvideo(videos_fake):
videos_fake = torch.from_numpy(videos_fake).permute(0, 4, 1, 2, 3).float()
if videos_fake.shape[0] % bs == 0:
n_secs = videos_fake.shape[0] // bs
else:
n_secs = videos_fake.shape[0] // bs + 1
videos_real = torch.tensor_split(videos_real, n_secs, dim=0)
videos_fake = torch.tensor_split(videos_fake, n_secs, dim=0)
for ref_v, sample_v in tqdm(zip(videos_real,videos_fake),total=len(videos_fake),desc=info):
# print(ref_v.shape)
# ref_v = torch.nn.functional.interpolate(ref_v, size=(sample_v.shape[2], 256, 256), mode='trilinear', align_corners=False)
# sample_v = torch.nn.functional.interpolate(sample_v, size=(sample_v.shape[2], 256, 256), mode='trilinear', align_corners=False)
feats_sample = detector(sample_v.to(device).contiguous(), **detector_kwargs).cpu().numpy()
feats_ref = detector(ref_v.to(device).contiguous(), **detector_kwargs).cpu().numpy()
sample_embed.append(feats_sample)
ref_embed.append(feats_ref)
out = dict()
if len(sample_embed) > 0:
sample_embed = np.concatenate(sample_embed,axis=0)
mu_sample, sigma_sample = compute_stats(sample_embed)
out.update({'mu_sample': mu_sample,
'sigma_sample': sigma_sample})
if len(ref_embed) > 0:
ref_embed = np.concatenate(ref_embed,axis=0)
mu_ref, sigma_ref = compute_stats(ref_embed)
out.update({'mu_ref': mu_ref,
'sigma_ref': sigma_ref})
return out

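compute_fvd above accepts either directories of .mp4 files or precomputed .npz statistics for each side. A minimal call with placeholder paths:

    # paths are placeholders; each directory is expected to contain the clips as .mp4 files
    scores = compute_fvd("data/real_videos", "logs/generated_videos", bs=16)
    print(scores["FVD"])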
main.py
@@ -415,6 +415,107 @@ class CUDACallback(Callback):
pass
class SingleImageLogger(Callback):
"""does not save as grid but as single images"""
def __init__(self, batch_frequency, max_images, clamp=True, increase_log_steps=True,
rescale=True, disabled=False, log_on_batch_idx=False, log_first_step=False,
log_images_kwargs=None, log_always=False):
super().__init__()
self.rescale = rescale
self.batch_freq = batch_frequency
self.max_images = max_images
self.logger_log_images = {
pl.loggers.TestTubeLogger: self._testtube,
}
self.log_steps = [2 ** n for n in range(int(np.log2(self.batch_freq)) + 1)]
if not increase_log_steps:
self.log_steps = [self.batch_freq]
self.clamp = clamp
self.disabled = disabled
self.log_on_batch_idx = log_on_batch_idx
self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {}
self.log_first_step = log_first_step
self.log_always = log_always
@rank_zero_only
def _testtube(self, pl_module, images, batch_idx, split):
for k in images:
grid = torchvision.utils.make_grid(images[k])
grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w
tag = f"{split}/{k}"
pl_module.logger.experiment.add_image(
tag, grid,
global_step=pl_module.global_step)
@rank_zero_only
def log_local(self, save_dir, split, images,
global_step, current_epoch, batch_idx):
root = os.path.join(save_dir, "images", split)
os.makedirs(root, exist_ok=True)
for k in images:
subroot = os.path.join(root, k)
os.makedirs(subroot, exist_ok=True)
base_count = len(glob.glob(os.path.join(subroot, "*.png")))
for img in images[k]:
if self.rescale:
img = (img + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w
img = img.transpose(0, 1).transpose(1, 2).squeeze(-1)
img = img.numpy()
img = (img * 255).astype(np.uint8)
filename = "{}_gs-{:06}_e-{:06}_b-{:06}_{:08}.png".format(
k,
global_step,
current_epoch,
batch_idx,
base_count)
path = os.path.join(subroot, filename)
Image.fromarray(img).save(path)
base_count += 1
def log_img(self, pl_module, batch, batch_idx, split="train", save_dir=None):
check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step
if (self.check_frequency(check_idx) and # batch_idx % self.batch_freq == 0
hasattr(pl_module, "log_images") and
callable(pl_module.log_images) and
self.max_images > 0) or self.log_always:
logger = type(pl_module.logger)
is_train = pl_module.training
if is_train:
pl_module.eval()
with torch.no_grad():
images = pl_module.log_images(batch, split=split, **self.log_images_kwargs)
for k in images:
N = min(images[k].shape[0], self.max_images)
images[k] = images[k][:N]
if isinstance(images[k], torch.Tensor):
images[k] = images[k].detach().cpu()
if self.clamp:
images[k] = torch.clamp(images[k], -1., 1.)
self.log_local(pl_module.logger.save_dir if save_dir is None else save_dir, split, images,
pl_module.global_step, pl_module.current_epoch, batch_idx)
logger_log_images = self.logger_log_images.get(logger, lambda *args, **kwargs: None)
logger_log_images(pl_module, images, pl_module.global_step, split)
if is_train:
pl_module.train()
def check_frequency(self, check_idx):
if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and (
check_idx > 0 or self.log_first_step):
try:
self.log_steps.pop(0)
except IndexError as e:
print(e)
return True
return False
if __name__ == "__main__":
# custom parser to specify config files, train, test and debug mode,
# postfix, resume.
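A hedged usage sketch for the SingleImageLogger callback added above (not part of the diff): `model` is assumed to be a LightningModule exposing a log_images() method, and `batch` a collated batch from its dataloader.

from main import SingleImageLogger  # main.py from this commit

logger_cb = SingleImageLogger(batch_frequency=1, max_images=8, log_always=True)
# writes one PNG per image under <save_dir>/images/validation/<key>/
logger_cb.log_img(model, batch, batch_idx=0, split="validation", save_dir="eval_logs")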

View File

@ -0,0 +1,96 @@
import argparse, os, sys, glob
import numpy as np
from torch_fidelity import calculate_metrics
import yaml
from ldm.modules.evaluate.evaluate_perceptualsim import compute_perceptual_similarity_from_list
def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
"--logdir",
type=str,
nargs="?",
default="fidelity-evaluation",
)
parser.add_argument(
"--reconstructions",
type=str,
help="path to reconstructed images"
)
parser.add_argument(
"--inputs",
type=str,
help="path to input images"
)
parser.add_argument(
"--cache_root",
type=str,
help="optional, for pre-computed fidelity statistics",
nargs="?",
)
return parser
if __name__ == "__main__":
command = " ".join(sys.argv)
    np.random.seed(42)  # seed the global RNG; a bare RandomState(42) would be created and discarded
sys.path.append(os.getcwd())
parser = get_parser()
opt, unknown = parser.parse_known_args()
    outdir = os.path.join(opt.logdir, "metrics")
    os.makedirs(outdir, exist_ok=True)  # the results file below is written into this directory
    print(outdir)
inppath = opt.inputs
recpath = opt.reconstructions
results = dict()
##### fid
fid_kwargs = {}
    cache_root = opt.cache_root
    if cache_root and os.path.isdir(cache_root):
        print(f'Using cached Inception Features saved under "{cache_root}"')
        fid_kwargs.update({
            'cache_root': cache_root,
            'input2_cache_name': 'input_data',
            'cache': True
        })
metrics_dict = calculate_metrics(input1=recpath, input2=inppath,
cuda=True, isc=True, fid=True, kid=True,
verbose=True, **fid_kwargs)
results["fidelity"] = metrics_dict
print(f'Metrics from fidelity: \n {results["fidelity"]}')
##### sim
print("Evaluating reconstruction similarity")
reconstructions = sorted(glob.glob(os.path.join(recpath, "*.png")))
print(f"num reconstructions found: {len(reconstructions)}")
inputs = sorted(glob.glob(os.path.join(inppath, "*.png")))
print(f"num inputs found: {len(inputs)}")
results["image-sim"] = compute_perceptual_similarity_from_list(
reconstructions, inputs, take_every_other=False)
print(f'Results sim: {results["image-sim"]}')
# info
results["info"] = {
"n_examples": len(reconstructions),
"command": command,
}
# write out
ipath, rpath = map(lambda x: os.path.splitext(x)[0].split(os.sep)[-1], (inppath, recpath))
resultsfn = f"results_{ipath}-{rpath}.yaml"
results_file = os.path.join(outdir, resultsfn)
with open(results_file, 'w') as f:
yaml.dump(results, f, default_flow_style=False)
print(results_file)
print("\ndone.")
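A possible invocation of this script (this hunk does not show its filename, so the path below is only illustrative; the flags match the parser above):

python scripts/evaluate_reconstructions.py \
    --inputs logs/my_run/images/inputs \
    --reconstructions logs/my_run/images/reconstructions \
    --logdir fidelity-evaluation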

267 scripts/logging_template.py Normal file View File

@ -0,0 +1,267 @@
import argparse, os, sys, glob
import torch
import numpy as np
from omegaconf import OmegaConf
import streamlit as st
from streamlit import caching
from PIL import Image
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities.distributed import rank_zero_only
from tqdm import tqdm
import datetime
from ldm.util import instantiate_from_config
from main import DataModuleFromConfig, ImageLogger, SingleImageLogger
rescale = lambda x: (x + 1.) / 2.
class DummyLogger:
pass
def bchw_to_st(x):
return rescale(x.detach().cpu().numpy().transpose(0,2,3,1))
def run(model, dsets, callbacks, logdir, split="train",
batch_size=8, start_index=0, sample_batch=False, nowname="", use_full_data=False):
logdir = os.path.join(logdir, nowname)
os.makedirs(logdir, exist_ok=True)
dset = dsets.datasets[split]
print(f"Dataset size: {len(dset)}")
    dloader = torch.utils.data.DataLoader(dset, batch_size=batch_size, drop_last=False, shuffle=False)
if not use_full_data:
if sample_batch:
indices = np.random.choice(len(dset), batch_size)
else:
indices = list(range(start_index, start_index+batch_size))
print(f"Data indices: {list(indices)}")
example = default_collate([dset[i] for i in indices])
for cb in callbacks:
if isinstance(cb, ImageLogger):
print(f"logging with {cb.__class__.__name__}")
cb.log_img(model, example, 0, split=split, save_dir=logdir)
else:
for batch in tqdm(dloader, desc="Data"):
for cb in callbacks:
if isinstance(cb, SingleImageLogger):
cb.log_img(model, batch, 0, split=split, save_dir=logdir)
def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
"-r",
"--resume",
type=str,
nargs="?",
help="load from logdir or checkpoint in logdir",
)
parser.add_argument(
"-b",
"--base",
nargs="*",
metavar="base_config.yaml",
help="paths to base configs. Loaded from left-to-right. "
"Parameters can be overwritten or added with command-line options of the form `--key value`.",
default=list(),
)
parser.add_argument(
"-c",
"--config",
nargs="?",
metavar="single_config.yaml",
help="path to single config. If specified, base configs will be ignored "
"(except for the last one if left unspecified).",
const=True,
default="",
)
parser.add_argument(
"-n",
"--n_iter",
type=int,
default=1,
help="how many times to run",
)
parser.add_argument(
"--batch_size",
type=int,
default=4,
help="how many examples in the batch",
)
parser.add_argument(
"--split",
type=str,
default="validation",
help="evaluate on this split",
)
parser.add_argument(
"--logdir",
type=str,
default="eval_logs",
help="where to save the logs",
)
parser.add_argument(
"--state_key",
type=str,
default="state_dict",
choices=["state_dict", "model_ema", "model"],
help="where to access the model weights",
)
parser.add_argument(
"--full_data",
action='store_true',
help="evaluate on full dataset",
)
parser.add_argument(
"--ignore_callbacks",
action='store_true',
help="ignores all callbacks in the config and only uses main.SingleImageLogger",
)
return parser
def load_model_from_config(config, sd, gpu=True, eval_mode=True):
model = instantiate_from_config(config)
print("loading model from state-dict...")
if sd is not None:
        m, u = model.load_state_dict(sd, strict=False)  # strict=False so missing/unexpected keys are reported instead of raising
if len(m) > 0: print(f"missing keys: \n {m}")
if len(u) > 0: print(f"unexpected keys: \n {u}")
print("loaded model.")
if gpu:
model.cuda()
if eval_mode:
model.eval()
return {"model": model}
def get_data(config):
# get data
data = instantiate_from_config(config.data)
data.prepare_data()
data.setup()
return data
def get_callbacks(lightning_config, ignore_callbacks=False):
callbacks_cfg = lightning_config.callbacks
callbacks = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg]
print(f"found and instantiated the following callback(s):")
for cb in callbacks:
print(f" > {cb.__class__.__name__}")
print()
if len(callbacks) == 0 or ignore_callbacks:
del callbacks
callbacks = list()
print("No callbacks found. Falling back to SingleImageLogger as a default")
try:
callbacks.append(SingleImageLogger(1, max_images=opt.batch_size, log_always=True,
log_images_kwargs=lightning_config.callbacks.image_logger.params.log_images_kwargs))
        except Exception:
print("No log_images_kwargs specified. Using SingleImageLogger with default values in log_images().")
callbacks.append(SingleImageLogger(1, max_images=opt.batch_size, log_always=True))
return callbacks
@st.cache(allow_output_mutation=True)
def load_model_and_dset(config, ckpt, gpu, eval_mode):
# get data
dsets = get_data(config) # calls data.config ...
# now load the specified checkpoint
if ckpt:
pl_sd = torch.load(ckpt, map_location="cpu")
try:
global_step = pl_sd["global_step"]
        except KeyError:
global_step = 0
else:
pl_sd = {"state_dict": None}
global_step = None
model = load_model_from_config(config.model,
#pl_sd["state_dict"],
pl_sd[opt.state_key],
gpu=gpu,
eval_mode=eval_mode)["model"]
return dsets, model, global_step
def exists(x):
return x is not None
if __name__ == "__main__":
sys.path.append(os.getcwd())
if not st._is_running_with_streamlit:
print("Not running with streamlit. Redefining st functions...")
st.info = print
st.write = print
seed_everything(42)
parser = get_parser()
opt, unknown = parser.parse_known_args()
ckpt = None
assert opt.resume
if not os.path.exists(opt.resume):
raise ValueError("Cannot find {}".format(opt.resume))
if os.path.isfile(opt.resume):
paths = opt.resume.split("/")
try:
idx = len(paths)-paths[::-1].index("logs")+1
except ValueError:
idx = -2 # take a guess: path/to/logdir/checkpoints/model.ckpt
logdir = "/".join(paths[:idx])
ckpt = opt.resume
else:
assert os.path.isdir(opt.resume), opt.resume
logdir = opt.resume.rstrip("/")
ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*-project.yaml")))
opt.base = base_configs+opt.base
if opt.config:
if type(opt.config) == str:
opt.base = [opt.config]
else:
opt.base = [opt.base[-1]]
configs = [OmegaConf.load(cfg) for cfg in opt.base]
cli = OmegaConf.from_dotlist(unknown)
config = OmegaConf.merge(*configs, cli)
lightning_configs = sorted(glob.glob(os.path.join(logdir, "configs/*-lightning.yaml")))
lightning_configs = [OmegaConf.load(lcfg) for lcfg in lightning_configs]
lightning_config = OmegaConf.merge(*lightning_configs, cli)
print(f"ckpt-path: {ckpt}")
print(config)
print(lightning_config)
gpu = True
eval_mode = True
callbacks = get_callbacks(lightning_config.lightning, ignore_callbacks=opt.ignore_callbacks)
dsets, model, global_step = load_model_and_dset(config, ckpt, gpu, eval_mode)
print(f"global step: {global_step}")
logdir = os.path.join(logdir, opt.logdir, f"{global_step:09}")
print(f"logging to {logdir}")
os.makedirs(logdir, exist_ok=True)
# go
now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
for n in range(opt.n_iter):
nowname = now + "_iteration-" + f"{n:03}"
run(model, dsets, callbacks, logdir=logdir, batch_size=opt.batch_size, nowname=nowname,
split=opt.split, use_full_data=opt.full_data)
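A hedged example of running the template (paths are placeholders; the script also runs without the streamlit front-end, since it falls back to print-based st functions):

streamlit run scripts/logging_template.py -- \
    -r logs/<run_name>/checkpoints/last.ckpt \
    --split validation --batch_size 4 --n_iter 1

# or, without streamlit, over the full validation set:
python scripts/logging_template.py -r logs/<run_name> --split validation --full_data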