stable-diffusion-finetune/ldm/modules/evaluate/frechet_video_distance.py

# coding=utf-8
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python2, python3
"""Minimal Reference implementation for the Frechet Video Distance (FVD).

FVD is a metric for the quality of video generation models. It is inspired by
the FID (Frechet Inception Distance) used for images, but uses a different
embedding to be better suitable for videos.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import six
import tensorflow.compat.v1 as tf
import tensorflow_gan as tfgan
import tensorflow_hub as hub


def preprocess(videos, target_resolution):
  """Resizes the frames of the videos and rescales pixel values for I3D.

  Every frame is resized bilinearly to `target_resolution` and the pixel values
  are then mapped linearly from the range 0-255 to the range [-1, 1].

  Args:
    videos: <T>[batch_size, num_frames, height, width, depth] The videos to be
      preprocessed. We don't care about the specific dtype of the videos, it can
      be anything that tf.image.resize_bilinear accepts. Values are expected to
      be in the range 0-255. The result is reshaped with a fixed depth of 3.
    target_resolution: (height, width): target video resolution. It is passed
      unchanged as the `size` argument of tf.image.resize_bilinear, which
      expects (new_height, new_width).

  Returns:
    videos: <float32>[batch_size, num_frames, target_height, target_width, 3]
  """
  input_shape = list(videos.shape)

  # Fold batch and time into one leading axis so that the image resize op sees
  # a plain batch of frames.
  frames = tf.reshape(videos, [-1] + input_shape[-3:])
  resized_frames = tf.image.resize_bilinear(frames, size=target_resolution)

  # Restore the batch axis; the number of frames is inferred via -1.
  output_shape = [input_shape[0], -1] + list(target_resolution) + [3]
  resized_videos = tf.reshape(resized_frames, output_shape)

  # Map 0-255 onto [-1, 1].
  return 2. * tf.cast(resized_videos, tf.float32) / 255. - 1


def _is_in_graph(tensor_name):
  """Returns True if a tensor with the given name exists in the default graph.

  Only the KeyError raised by the lookup is treated as "not present"; any other
  exception propagates to the caller.
  """
  try:
    tf.get_default_graph().get_tensor_by_name(tensor_name)
  except KeyError:
    found = False
  else:
    found = True
  return found


def create_id3_embedding(videos, warmup=False, batch_size=16):
  """Embeds the given videos using the Inflated 3D Convolution network.

  Downloads the graph of the I3D from tf.hub and adds it to the graph on the
  first call. A separate copy of the module is imported for each distinct input
  tensor, because the module name is derived from `videos.name`.

  Args:
    videos: <float32>[batch_size, num_frames, height=224, width=224, depth=3].
      Expected range is [-1, 1]. The value range and the batch size are checked
      by assert ops when the graph is executed.
    warmup: If True, additionally check at graph-construction time that the
      static batch dimension of `videos` is either unknown or equal to
      `batch_size`.
    batch_size: The expected size of the leading dimension of `videos`.

  Returns:
    embedding: <float32>[batch_size, embedding_size]. embedding_size depends
               on the model used.

  Raises:
    AssertionError: when `warmup` is True and the static batch dimension of
      `videos` is known but differs from `batch_size`.
  """
  module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"

  # Making sure that we import the graph separately for
  # each different input video tensor.
  module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str(
      videos.name).replace(":", "_")

  # Checks that fire when the graph is executed: values must lie in [-1, 1]
  # (with a small tolerance) and the batch must have the expected size.
  assert_ops = [
      tf.Assert(
          tf.reduce_max(videos) <= 1.001,
          ["max value in frame is > 1", videos]),
      tf.Assert(
          tf.reduce_min(videos) >= -1.001,
          ["min value in frame is < -1", videos]),
      tf.assert_equal(
          tf.shape(videos)[0],
          batch_size, ["invalid frame batch size: ",
                       tf.shape(videos)],
          summarize=6),
  ]
  with tf.control_dependencies(assert_ops):
    videos = tf.identity(videos)

  module_scope = "%s_apply_default/" % module_name

  # To check whether the module has already been loaded into the graph, we look
  # for a given tensor name. If this tensor name exists, we assume the function
  # has been called before and the graph was imported. Otherwise we import it.
  # Note: in theory, the tensor could exist, but have wrong shapes.
  # This will happen if create_id3_embedding is called with a frames_placeholder
  # of wrong size/batch size, because even though that will throw a tf.Assert
  # on graph-execution time, it will insert the tensor (with wrong shape) into
  # the graph. This is why we need the following assert.
  if warmup:
    # The leading dimension is a Dimension object (with a `.value` attribute)
    # or a plain int/None depending on the TensorFlow behaviour in use. Unwrap
    # it so that an unknown batch size stays None instead of failing in int().
    video_batch_size = videos.shape[0]
    video_batch_size = getattr(video_batch_size, "value", video_batch_size)
    if video_batch_size is not None:
      video_batch_size = int(video_batch_size)
    assert video_batch_size in [batch_size, -1, None], f"Invalid batch size {video_batch_size}"

  # Name of the kinetics-i3d-400-logits layer inside the imported module.
  tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
  if not _is_in_graph(tensor_name):
    i3d_model = hub.Module(module_spec, name=module_name)
    i3d_model(videos)

  return tf.get_default_graph().get_tensor_by_name(tensor_name)


def calculate_fvd(real_activations,
                  generated_activations):
  """Computes the FVD from activations of real and generated videos.

  This is a thin wrapper that forwards both arguments to
  tfgan.eval.frechet_classifier_distance_from_activations.

  Args:
    real_activations: <float32>[num_samples, embedding_size]
    generated_activations: <float32>[num_samples, embedding_size]

  Returns:
    A scalar that contains the requested FVD.
  """
  fvd = tfgan.eval.frechet_classifier_distance_from_activations(
      real_activations, generated_activations)
  return fvd
eval stuff 2022-06-09 08:56:34 +00:00			`# coding=utf-8`
			`# Copyright 2022 The Google Research Authors.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`# Lint as: python2, python3`
			`"""Minimal Reference implementation for the Frechet Video Distance (FVD).`

			`FVD is a metric for the quality of video generation models. It is inspired by`
			`the FID (Frechet Inception Distance) used for images, but uses a different`
			`embedding to be better suitable for videos.`
			`"""`

			`from __future__ import absolute_import`
			`from __future__ import division`
			`from __future__ import print_function`


			`import six`
			`import tensorflow.compat.v1 as tf`
			`import tensorflow_gan as tfgan`
			`import tensorflow_hub as hub`


			`def preprocess(videos, target_resolution):`
			`"""Runs some preprocessing on the videos for I3D model.`

			`Args:`
			`videos: <T>[batch_size, num_frames, height, width, depth] The videos to be`
			`preprocessed. We don't care about the specific dtype of the videos, it can`
			`be anything that tf.image.resize_bilinear accepts. Values are expected to`
			`be in the range 0-255.`
			`target_resolution: (width, height): target video resolution`

			`Returns:`
			`videos: <float32>[batch_size, num_frames, height, width, depth]`
			`"""`
			`videos_shape = list(videos.shape)`
			`all_frames = tf.reshape(videos, [-1] + videos_shape[-3:])`
			`resized_videos = tf.image.resize_bilinear(all_frames, size=target_resolution)`
			`target_shape = [videos_shape[0], -1] + list(target_resolution) + [3]`
			`output_videos = tf.reshape(resized_videos, target_shape)`
			`scaled_videos = 2. * tf.cast(output_videos, tf.float32) / 255. - 1`
			`return scaled_videos`


			`def _is_in_graph(tensor_name):`
			`"""Checks whether a given tensor does exists in the graph."""`
			`try:`
			`tf.get_default_graph().get_tensor_by_name(tensor_name)`
			`except KeyError:`
			`return False`
			`return True`


			`def create_id3_embedding(videos,warmup=False,batch_size=16):`
			`"""Embeds the given videos using the Inflated 3D Convolution ne twork.`

			`Downloads the graph of the I3D from tf.hub and adds it to the graph on the`
			`first call.`

			`Args:`
			`videos: <float32>[batch_size, num_frames, height=224, width=224, depth=3].`
			`Expected range is [-1, 1].`

			`Returns:`
			`embedding: <float32>[batch_size, embedding_size]. embedding_size depends`
			`on the model used.`

			`Raises:`
			`ValueError: when a provided embedding_layer is not supported.`
			`"""`

			`# batch_size = 16`
			`module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"`


			`# Making sure that we import the graph separately for`
			`# each different input video tensor.`
			`module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str(`
			`videos.name).replace(":", "_")`



			`assert_ops = [`
			`tf.Assert(`
			`tf.reduce_max(videos) <= 1.001,`
			`["max value in frame is > 1", videos]),`
			`tf.Assert(`
			`tf.reduce_min(videos) >= -1.001,`
			`["min value in frame is < -1", videos]),`
			`tf.assert_equal(`
			`tf.shape(videos)[0],`
			`batch_size, ["invalid frame batch size: ",`
			`tf.shape(videos)],`
			`summarize=6),`
			`]`
			`with tf.control_dependencies(assert_ops):`
			`videos = tf.identity(videos)`

			`module_scope = "%s_apply_default/" % module_name`

			`# To check whether the module has already been loaded into the graph, we look`
			`# for a given tensor name. If this tensor name exists, we assume the function`
			`# has been called before and the graph was imported. Otherwise we import it.`
			`# Note: in theory, the tensor could exist, but have wrong shapes.`
			`# This will happen if create_id3_embedding is called with a frames_placehoder`
			`# of wrong size/batch size, because even though that will throw a tf.Assert`
			`# on graph-execution time, it will insert the tensor (with wrong shape) into`
			`# the graph. This is why we need the following assert.`
			`if warmup:`
			`video_batch_size = int(videos.shape[0])`
			`assert video_batch_size in [batch_size, -1, None], f"Invalid batch size {video_batch_size}"`
			`tensor_name = module_scope + "RGB/inception_i3d/Mean:0"`
			`if not _is_in_graph(tensor_name):`
			`i3d_model = hub.Module(module_spec, name=module_name)`
			`i3d_model(videos)`

			`# gets the kinetics-i3d-400-logits layer`
			`tensor_name = module_scope + "RGB/inception_i3d/Mean:0"`
			`tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)`
			`return tensor`


			`def calculate_fvd(real_activations,`
			`generated_activations):`
			`"""Returns a list of ops that compute metrics as funcs of activations.`

			`Args:`
			`real_activations: <float32>[num_samples, embedding_size]`
			`generated_activations: <float32>[num_samples, embedding_size]`

			`Returns:`
			`A scalar that contains the requested FVD.`
			`"""`
			`return tfgan.eval.frechet_classifier_distance_from_activations(`
			`real_activations, generated_activations)`