# coding=utf-8
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python2, python3
"""Minimal Reference implementation for the Frechet Video Distance (FVD).

FVD is a metric for the quality of video generation models. It is inspired by
the FID (Frechet Inception Distance) used for images, but uses a different
embedding to be better suitable for videos.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import tensorflow.compat.v1 as tf
import tensorflow_gan as tfgan
import tensorflow_hub as hub


def preprocess(videos, target_resolution):
  """Runs some preprocessing on the videos for I3D model.

  All frames are resized bilinearly to `target_resolution` and the pixel values
  are rescaled from [0, 255] to [-1, 1].

  Args:
    videos: [batch_size, num_frames, height, width, depth] The videos to be
      preprocessed. We don't care about the specific dtype of the videos, it
      can be anything that tf.image.resize_bilinear accepts. Values are
      expected to be in the range 0-255.
    target_resolution: (height, width): target video resolution. It is passed
      unchanged as `size` to tf.image.resize_bilinear and occupies the height
      and width positions of the returned shape.

  Returns:
    videos: [batch_size, num_frames, height, width, depth], as tf.float32.
  """
  videos_shape = list(videos.shape)
  # Fold batch and time together so every frame is resized as a single image.
  all_frames = tf.reshape(videos, [-1] + videos_shape[-3:])
  resized_videos = tf.image.resize_bilinear(all_frames, size=target_resolution)
  # Restore the leading batch dimension; the frame count is inferred (-1).
  target_shape = [videos_shape[0], -1] + list(target_resolution) + [3]
  output_videos = tf.reshape(resized_videos, target_shape)
  scaled_videos = 2. * tf.cast(output_videos, tf.float32) / 255. - 1
  return scaled_videos


def _is_in_graph(tensor_name):
  """Checks whether a given tensor exists in the default graph."""
  try:
    tf.get_default_graph().get_tensor_by_name(tensor_name)
  except KeyError:
    return False
  return True


def create_id3_embedding(videos, warmup=False, batch_size=16):
  """Embeds the given videos using the Inflated 3D Convolution network.

  Downloads the graph of the I3D from tf.hub and adds it to the graph on the
  first call.

  Args:
    videos: [batch_size, num_frames, height=224, width=224, depth=3].
      Expected range is [-1, 1].
    warmup: if True, additionally check at graph-construction time that the
      static batch dimension of `videos` is either unknown or equal to
      `batch_size`.
    batch_size: the batch size that `videos` must have; it is also enforced at
      graph-execution time through an assert op.

  Returns:
    embedding: [batch_size, embedding_size]. embedding_size depends
               on the model used.

  Raises:
    AssertionError: if `warmup` is True and the static batch dimension of
      `videos` is known and differs from `batch_size`.
  """
  module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"

  # Making sure that we import the graph separately for
  # each different input video tensor.
  module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str(
      videos.name).replace(":", "_")

  assert_ops = [
      tf.Assert(
          tf.reduce_max(videos) <= 1.001,
          ["max value in frame is > 1", videos]),
      tf.Assert(
          tf.reduce_min(videos) >= -1.001,
          ["min value in frame is < -1", videos]),
      tf.assert_equal(
          tf.shape(videos)[0],
          batch_size, ["invalid frame batch size: ",
                       tf.shape(videos)],
          summarize=6),
  ]
  with tf.control_dependencies(assert_ops):
    videos = tf.identity(videos)

  module_scope = "%s_apply_default/" % module_name

  # To check whether the module has already been loaded into the graph, we look
  # for a given tensor name. If this tensor name exists, we assume the function
  # has been called before and the graph was imported. Otherwise we import it.
  # Note: in theory, the tensor could exist, but have wrong shapes.
  # This will happen if create_id3_embedding is called with a frames_placeholder
  # of wrong size/batch size, because even though that will throw a tf.Assert
  # on graph-execution time, it will insert the tensor (with wrong shape) into
  # the graph. This is why we need the following assert.
  # NOTE(review): the original indentation of this file was lost; it is assumed
  # that `warmup` gates only this static check and not the module import below
  # -- confirm against the callers.
  if warmup:
    # dimension_value yields an int, or None for an unknown batch dimension;
    # calling int() on an unknown dimension would raise a TypeError even though
    # the assertion below is meant to accept that case.
    video_batch_size = tf.compat.dimension_value(videos.shape[0])
    assert video_batch_size in [batch_size, -1, None], (
        f"Invalid batch size {video_batch_size}")

  # gets the kinetics-i3d-400-logits layer
  tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
  if not _is_in_graph(tensor_name):
    i3d_model = hub.Module(module_spec, name=module_name)
    i3d_model(videos)

  tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)
  return tensor


def calculate_fvd(real_activations, generated_activations):
  """Computes the FVD from the activations of real and generated videos.

  Args:
    real_activations: [num_samples, embedding_size]
    generated_activations: [num_samples, embedding_size]

  Returns:
    A scalar that contains the requested FVD.
  """
  return tfgan.eval.frechet_classifier_distance_from_activations(
      real_activations, generated_activations)