Source code for imaginaire.evaluation.common

# Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, check out LICENSE.md
import math
import os
from functools import partial
import torch
import torch.distributed as dist
from torch import nn
from torch.nn import functional as F
from torchvision.models import inception_v3
from cleanfid.features import feature_extractor
from cleanfid.resize import build_resizer
from imaginaire.evaluation.lpips import get_lpips_model
from imaginaire.evaluation.segmentation import get_segmentation_hist_model, get_miou
from imaginaire.evaluation.caption import get_image_encoder, get_r_precision
from imaginaire.evaluation.pretrained import TFInceptionV3, InceptionV3, Vgg16, SwAV
from imaginaire.utils.distributed import (dist_all_gather_tensor, get_rank,
                                          get_world_size, is_master,
                                          is_local_master)
from imaginaire.utils.distributed import master_only_print
from imaginaire.utils.misc import apply_imagenet_normalization, to_cuda


@torch.no_grad()
def compute_all_metrics(act_dir,
                        data_loader,
                        net_G,
                        key_real='images',
                        key_fake='fake_images',
                        sample_size=None,
                        preprocess=None,
                        is_video=False,
                        few_shot_video=False,
                        kid_num_subsets=1,
                        kid_subset_size=None,
                        key_prefix='',
                        prdc_k=5,
                        metrics=None,
                        dataset_name='',
                        aws_credentials=None,
                        **kwargs):
    r"""Compute all the requested evaluation metrics for a generator.

    Args:
        act_dir (string): Path to a directory to temporarily save feature
            activations.
        data_loader (obj): PyTorch dataloader object.
        net_G (obj): The generator module.
        key_real (str): Dictionary key value for the real data.
        key_fake (str): Dictionary key value for the fake data.
        sample_size (int or None): How many samples to use for FID.
        preprocess (func or None): Pre-processing function to use.
        is_video (bool): Whether we are handling video sequences.
        few_shot_video (bool): If ``True``, uses few-shot video synthesis.
        kid_num_subsets (int): Number of subsets for KID evaluation.
        kid_subset_size (int or None): The number of samples in each subset
            for KID evaluation.
        key_prefix (string): Add this string before all keys of the output
            dictionary.
        prdc_k (int): The K used for computing K-NN when evaluating
            precision/recall/density/coverage.
        metrics (list of strings): Which metrics we want to evaluate.
        dataset_name (string): The name of the dataset, currently only used
            to determine which segmentation network to use for segmentation
            evaluation.

    Returns:
        (dict): All requested metric values, keyed by ``key_prefix`` plus the
            metric name. Note that only the master GPU computes them.
    """
    from imaginaire.evaluation.fid import _calculate_frechet_distance
    from imaginaire.evaluation.kid import _polynomial_mmd_averages
    from imaginaire.evaluation.prdc import _get_prdc
    from imaginaire.evaluation.msid import _get_msid
    from imaginaire.evaluation.knn import _get_1nn_acc

    if metrics is None:
        metrics = []
    act_path = os.path.join(act_dir, 'activations_real.pt')

    # Get feature activations and other outputs computed from fake images.
    output_module_dict = nn.ModuleDict()
    if "seg_mIOU" in metrics:
        output_module_dict["seg_mIOU"] = get_segmentation_hist_model(
            dataset_name, aws_credentials)
    if "caption_rprec" in metrics:
        output_module_dict["caption_rprec"] = get_image_encoder(
            aws_credentials)
    if "LPIPS" in metrics:
        output_module_dict["LPIPS"] = get_lpips_model()

    fake_outputs = get_outputs(
        data_loader, key_real, key_fake, net_G, sample_size, preprocess,
        output_module_dict=output_module_dict, **kwargs
    )
    fake_act = fake_outputs["activations"]

    # Get feature activations computed from real images.
    real_act = load_or_compute_activations(
        act_path, data_loader, key_real, key_fake, None, sample_size,
        preprocess, is_video=is_video, few_shot_video=few_shot_video, **kwargs
    )

    metrics_from_activations = {
        "1NN": _get_1nn_acc,
        "MSID": _get_msid,
        "FID": _calculate_frechet_distance,
        "KID": partial(_polynomial_mmd_averages,
                       n_subsets=kid_num_subsets,
                       subset_size=kid_subset_size,
                       ret_var=True),
        "PRDC": partial(_get_prdc, nearest_k=prdc_k)
    }

    other_metrics = {
        "seg_mIOU": get_miou,
        "caption_rprec": get_r_precision,
        "LPIPS": lambda x: {"LPIPS": torch.mean(x).item()}
    }

    all_metrics = {}
    if is_master():
        for metric in metrics:
            if metric in metrics_from_activations:
                metric_function = metrics_from_activations[metric]
                metric_dict = metric_function(real_act, fake_act)
            elif metric in other_metrics:
                metric_function = other_metrics[metric]
                if fake_outputs[metric] is not None:
                    metric_dict = metric_function(fake_outputs[metric])
            else:
                print(f"{metric} is not implemented!")
                raise NotImplementedError
            for k, v in metric_dict.items():
                all_metrics.update({key_prefix + k: v})
    if dist.is_initialized():
        dist.barrier()
    return all_metrics
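

# Illustrative sketch (not part of the original module): compute_all_metrics
# dispatches the activation-based metrics ("FID", "KID", "PRDC", "MSID",
# "1NN") through the metrics_from_activations table above, calling each on a
# pair of activation tensors. The same functions can be called directly; the
# random CUDA features below are placeholders for real Inception activations.
def _example_fid_from_activations():
    from imaginaire.evaluation.fid import _calculate_frechet_distance
    real_act = torch.randn(1000, 2048, device='cuda')
    fake_act = torch.randn(1000, 2048, device='cuda')
    # Returns a dict of metric name -> value, the form compute_all_metrics
    # merges into its output dictionary.
    return _calculate_frechet_distance(real_act, fake_act)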


@torch.no_grad()
def compute_all_metrics_data(data_loader_a,
                             data_loader_b,
                             key_a='images',
                             key_b='images',
                             sample_size=None,
                             preprocess=None,
                             kid_num_subsets=1,
                             kid_subset_size=None,
                             key_prefix='',
                             prdc_k=5,
                             metrics=None,
                             dataset_name='',
                             aws_credentials=None,
                             **kwargs):
    r"""Compute all the requested metrics between two image datasets.

    Args:
        data_loader_a (obj): PyTorch dataloader object for the first dataset.
        data_loader_b (obj): PyTorch dataloader object for the second dataset.
        key_a (str): Dictionary key value for images in the first dataset.
        key_b (str): Dictionary key value for images in the second dataset.
        sample_size (int or None): How many samples to use for FID.
        preprocess (func or None): Pre-processing function to use.
        kid_num_subsets (int): Number of subsets for KID evaluation.
        kid_subset_size (int or None): The number of samples in each subset
            for KID evaluation.
        key_prefix (string): Add this string before all keys of the output
            dictionary.
        prdc_k (int): The K used for computing K-NN when evaluating
            precision/recall/density/coverage.
        metrics (list of strings): Which metrics we want to evaluate.
        dataset_name (string): The name of the dataset, currently only used
            to determine which segmentation network to use for segmentation
            evaluation.

    Returns:
        (dict): All requested metric values, keyed by ``key_prefix`` plus the
            metric name. Note that only the master GPU computes them.
    """
    from imaginaire.evaluation.fid import _calculate_frechet_distance
    from imaginaire.evaluation.kid import _polynomial_mmd_averages
    from imaginaire.evaluation.prdc import _get_prdc
    from imaginaire.evaluation.msid import _get_msid
    from imaginaire.evaluation.knn import _get_1nn_acc

    if metrics is None:
        metrics = []

    min_data_size = min(len(data_loader_a.dataset),
                        len(data_loader_b.dataset))
    if sample_size is None:
        sample_size = min_data_size
    else:
        sample_size = min(sample_size, min_data_size)

    # Get feature activations and other outputs computed from fake images.
    output_module_dict = nn.ModuleDict()
    if "seg_mIOU" in metrics:
        output_module_dict["seg_mIOU"] = get_segmentation_hist_model(
            dataset_name, aws_credentials)
    if "caption_rprec" in metrics:
        output_module_dict["caption_rprec"] = get_image_encoder(
            aws_credentials)
    if "LPIPS" in metrics:
        output_module_dict["LPIPS"] = get_lpips_model()

    fake_outputs = get_outputs(
        data_loader_b, key_a, key_b, None, sample_size, preprocess,
        output_module_dict=output_module_dict, **kwargs
    )
    act_b = fake_outputs["activations"]

    act_a = load_or_compute_activations(
        None, data_loader_a, key_a, key_b, None, sample_size, preprocess,
        output_module_dict=output_module_dict, **kwargs
    )
    # act_b = load_or_compute_activations(
    #     None, data_loader_b, key_a, key_b, None, sample_size, preprocess,
    #     output_module_dict=output_module_dict, generate_twice=generate_twice,
    #     **kwargs
    # )

    metrics_from_activations = {
        "1NN": _get_1nn_acc,
        "MSID": _get_msid,
        "FID": _calculate_frechet_distance,
        "KID": partial(_polynomial_mmd_averages,
                       n_subsets=kid_num_subsets,
                       subset_size=kid_subset_size,
                       ret_var=True),
        "PRDC": partial(_get_prdc, nearest_k=prdc_k)
    }

    other_metrics = {
        "seg_mIOU": get_miou,
        "caption_rprec": get_r_precision,
        "LPIPS": lambda x: {"LPIPS": torch.mean(x).item()}
    }

    all_metrics = {}
    if is_master():
        for metric in metrics:
            if metric in metrics_from_activations:
                metric_function = metrics_from_activations[metric]
                metric_dict = metric_function(act_a, act_b)
            elif metric in other_metrics:
                metric_function = other_metrics[metric]
                if fake_outputs[metric] is not None:
                    metric_dict = metric_function(fake_outputs[metric])
            else:
                print(f"{metric} is not implemented!")
                raise NotImplementedError
            for k, v in metric_dict.items():
                all_metrics.update({key_prefix + k: v})
    if dist.is_initialized():
        dist.barrier()
    return all_metrics
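

# Illustrative sketch (assumption, not part of the original module): a minimal
# way to compare two image sets with compute_all_metrics_data. The _DictDataset
# wrapper is hypothetical; the real imaginaire dataloaders yield dictionaries
# whose 'images' entry is in [-1, 1]. Running this requires a CUDA device and
# downloads the Inception weights on first use.
def _example_compare_datasets():
    from torch.utils.data import DataLoader, Dataset

    class _DictDataset(Dataset):
        def __init__(self, num_samples=64):
            # Random placeholder images in [-1, 1].
            self.images = torch.rand(num_samples, 3, 128, 128) * 2 - 1

        def __len__(self):
            return self.images.size(0)

        def __getitem__(self, idx):
            return {'images': self.images[idx]}

    loader_a = DataLoader(_DictDataset(), batch_size=8)
    loader_b = DataLoader(_DictDataset(), batch_size=8)
    return compute_all_metrics_data(loader_a, loader_b,
                                    metrics=['FID', 'KID'],
                                    kid_subset_size=32)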


@torch.no_grad()
def get_activations(data_loader, key_real, key_fake,
                    generator=None, sample_size=None, preprocess=None,
                    align_corners=True, network='inception', **kwargs):
    r"""Compute activation values and pack them in a list.

    Args:
        data_loader (obj): PyTorch dataloader object.
        key_real (str): Dictionary key value for the real data.
        key_fake (str): Dictionary key value for the fake data.
        generator (obj): PyTorch trainer network.
        sample_size (int): How many samples to use for FID.
        preprocess (func): Pre-processing function to use.
        align_corners (bool): The ``'align_corners'`` parameter to be used for
            ``torch.nn.functional.interpolate``.

    Returns:
        batch_y (tensor): Inception features of the current batch. Note that
            only the master gpu will get it.
    """
    if dist.is_initialized() and not is_local_master():
        # Make sure only the first process in distributed training downloads
        # the model, and the others will use the cache.
        # noinspection PyUnresolvedReferences
        torch.distributed.barrier()

    if network == 'tf_inception':
        model = TFInceptionV3()
    elif network == 'inception':
        model = InceptionV3()
    elif network == 'vgg16':
        model = Vgg16()
    elif network == 'swav':
        model = SwAV()
    elif network == 'clean_inception':
        model = CleanInceptionV3()
    else:
        raise NotImplementedError(f'Network "{network}" is not supported!')

    if dist.is_initialized() and is_local_master():
        # Make sure only the first process in distributed training downloads
        # the model, and the others will use the cache.
        # noinspection PyUnresolvedReferences
        dist.barrier()

    model = model.to('cuda').eval()
    world_size = get_world_size()
    batch_y = []

    # Iterate through the dataset to compute the activation.
    for it, data in enumerate(data_loader):
        data = to_cuda(data)
        # Preprocess the data.
        if preprocess is not None:
            data = preprocess(data)
        # Load real data if the generator is not specified.
        if generator is None:
            images = data[key_real]
        else:
            # Compute the generated image.
            net_G_output = generator(data, **kwargs)
            images = net_G_output[key_fake]
        # Clamp the image for models that do not set the output to between
        # -1, 1. For models that employ tanh, this has no effect.
        images.clamp_(-1, 1)
        y = model(images, align_corners=align_corners)
        batch_y.append(y)
        if sample_size is not None and \
                data_loader.batch_size * world_size * (it + 1) >= sample_size:
            # Reach the number of samples we need.
            break

    batch_y = torch.cat(dist_all_gather_tensor(torch.cat(batch_y)))
    if sample_size is not None:
        batch_y = batch_y[:sample_size]
    print(f"Computed feature activations of size {batch_y.shape}")
    return batch_y
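

# Illustrative sketch (assumption, not part of the original module):
# get_activations supports several feature extractors via the `network`
# argument ('tf_inception', 'inception', 'vgg16', 'swav', 'clean_inception').
# A hypothetical helper that extracts SwAV features for real images only:
def _example_real_activations(data_loader):
    # generator=None, so images are read from the 'images' key of each batch.
    # Requires a CUDA device; the backbone is downloaded on first use.
    return get_activations(data_loader, 'images', 'fake_images',
                           generator=None, sample_size=100, network='swav')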


class CleanInceptionV3(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = feature_extractor(name="torchscript_inception",
                                       resize_inside=False)

    def forward(self, img_batch, transform=True, **_kwargs):
        if transform:
            # Assume the input is (-1, 1). We transform it to (0, 255) and
            # round it to the closest integer.
            img_batch = torch.round(255 * (0.5 * img_batch + 0.5))
        resized_batch = clean_resize(img_batch)
        return self.model(resized_batch)


def clean_resize(img_batch):
    # Resize images from arbitrary resolutions to 299x299.
    batch_size = img_batch.size(0)
    img_batch = img_batch.cpu().numpy()
    fn_resize = build_resizer('clean')
    resized_batch = torch.zeros(batch_size, 3, 299, 299, device='cuda')
    for idx in range(batch_size):
        curr_img = img_batch[idx]
        img_np = curr_img.transpose((1, 2, 0))
        img_resize = fn_resize(img_np)
        resized_batch[idx] = torch.tensor(img_resize.transpose((2, 0, 1)),
                                          device='cuda')
    resized_batch = resized_batch.cuda()
    return resized_batch
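

# Illustrative sketch (not part of the original module): CleanInceptionV3
# expects inputs in [-1, 1]; its forward() maps them to integer values in
# [0, 255] before the cleanfid 'clean' resize to 299x299. The random batch
# below is a placeholder for generated images; a CUDA device is required.
def _example_clean_preprocess():
    img_batch = torch.rand(4, 3, 64, 64, device='cuda') * 2 - 1  # in [-1, 1]
    img_batch = torch.round(255 * (0.5 * img_batch + 0.5))       # integer [0, 255]
    return clean_resize(img_batch)  # shape (4, 3, 299, 299), on CUDA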


@torch.no_grad()
def get_outputs(data_loader, key_real, key_fake,
                generator=None, sample_size=None, preprocess=None,
                align_corners=True, network='inception',
                output_module_dict=None, **kwargs):
    r"""Compute activation values and other metric-specific outputs.

    Args:
        data_loader (obj): PyTorch dataloader object.
        key_real (str): Dictionary key value for the real data.
        key_fake (str): Dictionary key value for the fake data.
        generator (obj): PyTorch trainer network.
        sample_size (int): How many samples to use for FID.
        preprocess (func): Pre-processing function to use.
        align_corners (bool): The ``'align_corners'`` parameter to be used for
            ``torch.nn.functional.interpolate``.
        output_module_dict (nn.ModuleDict or None): Modules that compute
            additional per-metric outputs (e.g. segmentation histograms,
            caption features, LPIPS).

    Returns:
        (dict): Feature activations (under the ``'activations'`` key) and the
            outputs of each module in ``output_module_dict``. Note that only
            the master gpu will get them.
    """
    if output_module_dict is None:
        output_module_dict = nn.ModuleDict()
    if dist.is_initialized() and not is_local_master():
        # Make sure only the first process in distributed training downloads
        # the model, and the others will use the cache.
        # noinspection PyUnresolvedReferences
        torch.distributed.barrier()

    if network == 'tf_inception':
        model = TFInceptionV3()
    elif network == 'inception':
        model = InceptionV3()
    elif network == 'vgg16':
        model = Vgg16()
    elif network == 'swav':
        model = SwAV()
    elif network == 'clean_inception':
        model = CleanInceptionV3()
    else:
        raise NotImplementedError(f'Network "{network}" is not supported!')

    if dist.is_initialized() and is_local_master():
        # Make sure only the first process in distributed training downloads
        # the model, and the others will use the cache.
        # noinspection PyUnresolvedReferences
        dist.barrier()

    model = model.to('cuda').eval()
    world_size = get_world_size()

    output = {}
    for k in output_module_dict.keys():
        output[k] = []
    output["activations"] = []

    # Iterate through the dataset to compute the activation.
    for it, data in enumerate(data_loader):
        data = to_cuda(data)
        # Preprocess the data.
        if preprocess is not None:
            data = preprocess(data)
        # Load real data if the generator is not specified.
        if generator is None:
            images = data[key_real]
        else:
            # Compute the generated image.
            net_G_output = generator(data, **kwargs)
            images = net_G_output[key_fake]

        for metric_name, metric_module in output_module_dict.items():
            if metric_module is not None:
                if metric_name == 'LPIPS':
                    assert generator is not None
                    net_G_output_another = generator(data, **kwargs)
                    images_another = net_G_output_another[key_fake]
                    output[metric_name].append(
                        metric_module(images, images_another))
                else:
                    output[metric_name].append(
                        metric_module(data, images,
                                      align_corners=align_corners))

        # Clamp the image for models that do not set the output to between
        # -1, 1. For models that employ tanh, this has no effect.
        images.clamp_(-1, 1)
        y = model(images, align_corners=align_corners)
        output["activations"].append(y)
        if sample_size is not None and \
                data_loader.batch_size * world_size * (it + 1) >= sample_size:
            # Reach the number of samples we need.
            break

    for k, v in output.items():
        if len(v) > 0:
            output[k] = torch.cat(
                dist_all_gather_tensor(torch.cat(v)))[:sample_size]
        else:
            output[k] = None
    return output


@torch.no_grad()
def get_video_activations(data_loader, key_real, key_fake, trainer=None,
                          sample_size=None, preprocess=None, few_shot=False):
    r"""Compute activation values and pack them in a list. We do not do all
    reduce here.

    Args:
        data_loader (obj): PyTorch dataloader object.
        key_real (str): Dictionary key value for the real data.
        key_fake (str): Dictionary key value for the fake data.
        trainer (obj): Trainer. Video generation is more involved, we rely on
            the "reset" and "test" function to conduct the evaluation.
        sample_size (tuple or None): ``(num_videos, num_frames_per_video)``
            to use for computing the video activations. If ``None``, defaults
            to 10 videos and 5 frames per video.
        preprocess (func): The preprocess function to be applied to the data.
        few_shot (bool): If ``True``, uses the few-shot setting.

    Returns:
        batch_y (tensor): Inception features of the current batch. Note that
            only the master gpu will get it.
    """
    inception = inception_init()
    batch_y = []

    # We divide video sequences to different GPUs for testing.
    num_sequences = data_loader.dataset.num_inference_sequences()
    if sample_size is None:
        num_videos_to_test = 10
        num_frames_per_video = 5
    else:
        num_videos_to_test, num_frames_per_video = sample_size
    if num_videos_to_test == -1:
        num_videos_to_test = num_sequences
    else:
        num_videos_to_test = min(num_videos_to_test, num_sequences)
    master_only_print('Number of videos used for evaluation: {}'.format(
        num_videos_to_test))
    master_only_print('Number of frames per video used for evaluation: '
                      '{}'.format(num_frames_per_video))

    world_size = get_world_size()
    if num_videos_to_test < world_size:
        seq_to_run = [get_rank() % num_videos_to_test]
    else:
        num_videos_to_test = num_videos_to_test // world_size * world_size
        seq_to_run = range(get_rank(), num_videos_to_test, world_size)

    for sequence_idx in seq_to_run:
        data_loader = set_sequence_idx(few_shot, data_loader, sequence_idx)
        if trainer is not None:
            trainer.reset()
        for it, data in enumerate(data_loader):
            if few_shot and it == 0:
                continue
            if it >= num_frames_per_video:
                break

            # Preprocess the data if a preprocess function is given.
            if trainer is not None:
                data = trainer.pre_process(data)
            elif preprocess is not None:
                data = preprocess(data)
            data = to_cuda(data)

            if trainer is None:
                images = data[key_real][:, -1]
            else:
                net_G_output = trainer.test_single(data)
                images = net_G_output[key_fake]
            y = inception_forward(inception, images)
            batch_y += [y]

    batch_y = torch.cat(batch_y)
    batch_y = dist_all_gather_tensor(batch_y)
    if is_local_master():
        batch_y = torch.cat(batch_y)
    return batch_y


def inception_init():
    inception = inception_v3(pretrained=True, transform_input=False)
    inception = inception.to('cuda')
    inception.eval()
    inception.fc = torch.nn.Sequential()
    return inception


def inception_forward(inception, images):
    images.clamp_(-1, 1)
    images = apply_imagenet_normalization(images)
    images = F.interpolate(images, size=(299, 299),
                           mode='bicubic', align_corners=True)
    return inception(images)
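

# Illustrative sketch (not part of the original module): inception_forward
# clamps the images to [-1, 1], applies ImageNet normalization, and resizes
# with bicubic interpolation to 299x299 before the Inception forward pass.
# The snippet below isolates that preprocessing on a placeholder batch
# without loading the network.
def _example_inception_preprocess():
    images = torch.rand(2, 3, 128, 128) * 2 - 1
    images = images.clamp(-1, 1)
    images = apply_imagenet_normalization(images)
    return F.interpolate(images, size=(299, 299),
                         mode='bicubic', align_corners=True)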


def gather_tensors(batch_y):
    batch_y = torch.cat(batch_y)
    batch_y = dist_all_gather_tensor(batch_y)
    if is_local_master():
        batch_y = torch.cat(batch_y)
    return batch_y


def set_sequence_idx(few_shot, data_loader, sequence_idx):
    r"""Get sequence index.

    Args:
        few_shot (bool): If ``True``, uses the few-shot setting.
        data_loader: Dataloader object.
        sequence_idx (int): Which sequence to use.
    """
    if few_shot:
        data_loader.dataset.set_inference_sequence_idx(sequence_idx,
                                                       sequence_idx,
                                                       0)
    else:
        data_loader.dataset.set_inference_sequence_idx(sequence_idx)
    return data_loader


def load_or_compute_activations(act_path, data_loader, key_real, key_fake,
                                generator=None, sample_size=None,
                                preprocess=None,
                                is_video=False, few_shot_video=False,
                                **kwargs):
    r"""Load feature activations from the saved file if it exists. Otherwise,
    compute the activations and, if a path is given, save them.

    Args:
        act_path (str or None): Location of the file to store or to load the
            activations.
        data_loader (obj): PyTorch dataloader object.
        key_real (str): Dictionary key value for the real data.
        key_fake (str): Dictionary key value for the fake data.
        generator (obj): PyTorch trainer network.
        sample_size (int): How many samples to be used for computing the KID.
        preprocess (func): The preprocess function to be applied to the data.
        is_video (bool): Whether we are handling video sequences.
        few_shot_video (bool): If ``True``, uses few-shot video synthesis.

    Returns:
        (torch.Tensor): Feature activations.
    """
    if act_path is not None and os.path.exists(act_path):
        # Loading precomputed activations.
        print('Load activations from {}'.format(act_path))
        act = torch.load(act_path, map_location='cpu').cuda()
    else:
        # Compute activations.
        if is_video:
            act = get_video_activations(
                data_loader, key_real, key_fake, generator,
                sample_size, preprocess, few_shot_video, **kwargs
            )
        else:
            act = get_activations(
                data_loader, key_real, key_fake, generator,
                sample_size, preprocess, **kwargs
            )
        if act_path is not None and is_local_master():
            print('Save activations to {}'.format(act_path))
            if not os.path.exists(os.path.dirname(act_path)):
                os.makedirs(os.path.dirname(act_path), exist_ok=True)
            torch.save(act, act_path)
    return act


def compute_pairwise_distance(data_x, data_y=None, num_splits=10):
    r"""Compute pairwise Euclidean distances between two sets of features.

    Args:
        data_x (torch.Tensor): Features of shape [N, feature_dim].
        data_y (torch.Tensor or None): Features of shape [N, feature_dim].
            If ``None``, ``data_x`` is used.
        num_splits (int): Number of chunks to split the rows of ``data_x``
            into, to limit peak memory usage.

    Returns:
        (torch.Tensor): Pairwise distances of shape [N, N] on the CPU.
    """
    if data_y is None:
        data_y = data_x
    num_samples = data_x.shape[0]
    assert data_x.shape[0] == data_y.shape[0]
    dists = []
    for i in range(num_splits):
        batch_size = math.ceil(num_samples / num_splits)
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_samples)
        dists.append(torch.cdist(data_x[start_idx:end_idx], data_y).cpu())
    dists = torch.cat(dists, dim=0)
    return dists
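

# Illustrative sketch (not part of the original module): compute_pairwise_distance
# chunks the rows of `data_x` into `num_splits` pieces so the full distance
# matrix never has to be materialized at once. The random features below are
# placeholders for real activations.
def _example_pairwise_distance():
    feats_a = torch.randn(256, 2048)
    feats_b = torch.randn(256, 2048)
    # Returns a [256, 256] tensor of Euclidean distances on the CPU.
    return compute_pairwise_distance(feats_a, feats_b, num_splits=4)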


def compute_nn(input_features, k, num_splits=50):
    num_samples = input_features.shape[0]
    all_indices = []
    all_values = []
    for i in range(num_splits):
        batch_size = math.ceil(num_samples / num_splits)
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_samples)
        dist = torch.cdist(input_features[start_idx:end_idx], input_features)
        dist[:, start_idx:end_idx] += torch.diag(
            float('inf') * torch.ones(dist.size(0), device=dist.device)
        )
        k_smallests, indices = torch.topk(dist, k, dim=-1, largest=False)
        all_indices.append(indices)
        all_values.append(k_smallests)
    return torch.cat(all_values, dim=0), torch.cat(all_indices, dim=0)
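

# Illustrative sketch (not part of the original module): compute_nn adds +inf
# to each sample's distance to itself so that the k nearest neighbours exclude
# the query point. The random features below are placeholders.
def _example_compute_nn():
    feats = torch.randn(500, 2048)
    values, indices = compute_nn(feats, k=5, num_splits=10)
    return values, indices  # both of shape [500, 5]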