# model.py

from scipy.stats import wasserstein_distance
from utils import SMPLXWrapper, get_upper_body_joint_names_and_idxs
from os.path import join
from glow.learning_rate_schedule import noam_learning_rate_decay
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import glow
from glow.models import FlowNet
from pytorch_lightning import LightningModule
from dataset import inv_standardize


class StyleGestures(LightningModule):
    def __init__(
        self,
        # ----------------------
        # Dataset args, set by the datamodule
        # ----------------------
        x_channels: int,
        cond_channels: int,
        n_lookahead: int,
        seqlen: int,
        # ----------------------
        # Model params
        # ----------------------
        hidden_channels: int,
        K: int,
        actnorm_scale: float,
        flow_permutation: str,
        flow_coupling: str,
        network_model: str,
        num_layers: int,
        LU_decomposed: bool,
        distribution: str,
        # ----------------------
        # Training params
        # ----------------------
        batch_size: int,
        max_grad_clip: float,
        max_grad_norm: float,
        beta_1: float,
        beta_2: float,
        eps: float,
        learning_rate: float,
        lr_scheduler: str = "none",
        # Only for Noam LR scheduler:
        noam_warmup_steps: int = None,
        noam_min_lr: float = None,
        # ----------------------
        # Logging
        # ----------------------
        save_videos_every_n_epochs: int = 30,
        track_velocities_every_n_epochs: int = 30,
        sampling_temp: float = 1.0,
        # ----------------------
        # Prior params
        # ----------------------
        # Only for the "studentT" prior:
        distribution_param: float = None,
    ):
        """
        Build the flow network and the latent prior.

        All constructor arguments are stored through `save_hyperparameters()`
        and are later read back via `self.hparams`.

        Args:
            distribution:        name of the latent prior, "normal" or "studentT".
            distribution_param:  required for the "studentT" prior; passed as the
                                 first argument of `glow.modules.StudentT`
                                 (presumably the degrees of freedom -- confirm
                                 against the glow package).

        Raises:
            ValueError: if `distribution` is not a known prior, or if the
                        "studentT" prior is requested without `distribution_param`.
        """
        super().__init__()
        self.flow = FlowNet(
            x_channels=x_channels,
            hidden_channels=hidden_channels,
            cond_channels=cond_channels,
            K=K,
            actnorm_scale=actnorm_scale,
            flow_permutation=flow_permutation,
            flow_coupling=flow_coupling,
            network_model=network_model,
            num_layers=num_layers,
            LU_decomposed=LU_decomposed,
        )
        self.save_hyperparameters()

        # Shape of the latent drawn for one sampling step in reverse_flow()
        self.z_shape = [batch_size, x_channels, 1]

        # register prior hidden
        if distribution == "normal":
            self.distribution = glow.modules.GaussianDiag()
        elif distribution == "studentT":
            if distribution_param is None:
                raise ValueError(
                    "distribution_param must be set when distribution == 'studentT'"
                )
            self.distribution = glow.modules.StudentT(distribution_param, x_channels)
        else:
            # Fail early instead of leaving self.distribution undefined
            raise ValueError(f"Unknown distribution: {distribution!r}")

        # Current learning rate; optimizer_step() updates it when a scheduler is used
        self.learning_rate = learning_rate

    def setup(self, stage):
        """
        Called at the beginning of trainer.fit() and trainer.test().

        Caches the evaluation inputs and the motion scaler from the datamodule
        and creates the two SMPL-X wrappers used during validation.

        Args:
            stage:  stage identifier supplied by the trainer (unused).
        """
        datamodule = self.trainer.datamodule

        self.validation_sequence_input = datamodule.eval_batch["control"]
        self.velocity_tracking_input = datamodule.velocity_histogram_input
        self.motion_scaler = datamodule.motion_scaler

        # Sampled sequences are `n_lookahead` frames shorter than their control input
        n_lookahead = self.hparams.n_lookahead

        # We need to instantiate two SMPL-X models:
        # One for animating the selected segments for subjective evaluation
        anim_seq_len = self.validation_sequence_input.shape[1] - n_lookahead
        self.animator = SMPLXWrapper(sequence_length=anim_seq_len)

        # And another for converting very long samples to joint positions
        # for joint velocity tracking
        vel_seq_len = self.velocity_tracking_input.shape[1] - n_lookahead
        self.motion_converter = SMPLXWrapper(sequence_length=vel_seq_len)

    def init_lstm_hidden(self):
        self.flow.init_lstm_hidden()

    def training_step(self, batch, batch_idx):
        """
        Args:
            batch:  a dict containing the data:
            batch["x"]:  motion data of shape (batch_size, n_joints * 3, 125)
        """
        x = batch["x"]
        cond = batch["cond"]
        # init LSTM hidden
        self.init_lstm_hidden()

        # at first time, initialize ActNorm
        if self.trainer.global_step == 0:
            self.init_actnorm(batch["x"], batch["cond"])

        # forward phase
        z, nll = self(x=x, cond=cond)

        # loss
        loss = StyleGestures.loss_generative(nll)

        self.log("loss/loss_generative", loss)
        self.log("lr/lr", self.learning_rate, on_step=True)

        # velocity histogram

        return loss

    def validation_step(self, batch, batch_idx):
        # Validation forward phase
        self.init_lstm_hidden()

        z_val, nll_val = self(x=batch["x"], cond=batch["cond"])
        loss = StyleGestures.loss_generative(nll_val)

        return loss

    def log_vel_histograms(self, batch=None, n_steps=120):
        """
        Autoregressively sample `n_steps` poses conditioned on a batch.

        NOTE(review): despite its name this method does not log anything yet;
        it only generates the samples and returns them.

        Args:
            batch:    dict with the motion under "x" and the conditioning under
                      "cond". Axis 1 of "x" is used as the number of motion
                      features; "cond" is indexed as (batch, feature, frame) and
                      its first `seqlen * n_motion_features` features are
                      replaced by the previously generated poses.
            n_steps:  number of frames to sample.

        Returns:
            Tensor of shape (n_steps, batch_size, n_motion_features, 1).

        Raises:
            ValueError: if `batch` is missing or "cond" holds fewer than
                        `n_steps` frames.
        """
        if batch is None:
            raise ValueError(
                "log_vel_histograms() needs a batch with 'x' and 'cond' entries"
            )

        motion = batch["x"]
        cond = batch["cond"]
        if cond.shape[-1] < n_steps:
            raise ValueError(
                f"cond holds {cond.shape[-1]} frames, cannot sample {n_steps} steps"
            )

        batch_size = self.hparams.batch_size
        n_motion_features = motion.shape[1]
        n_autoregr_features = self.hparams.seqlen * n_motion_features

        # Start the autoregressive context from all-zero poses
        autoreg = torch.zeros(
            (batch_size, n_autoregr_features, 1), device=self.device
        )

        sampled_all = torch.empty((n_steps, batch_size, n_motion_features, 1))
        for i in tqdm(range(n_steps)):
            # Generated context + the control part of the conditioning at frame i
            curr_cond = torch.cat(
                (autoreg, cond[:, n_autoregr_features:, i : i + 1]), dim=1
            )
            sampled = self(z=None, cond=curr_cond, reverse=True)
            sampled_all[i] = sampled

            # Update the autoregressive poses with the generated pose
            autoreg = torch.cat((autoreg[:, n_motion_features:], sampled), dim=1)

        return sampled_all

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(),
            lr=self.hparams.learning_rate,
            betas=(self.hparams.beta_1, self.hparams.beta_2),
            eps=self.hparams.eps,
        )

        return optimizer

    def on_after_backward(self):
        """
        Apply gradient clipping after the backward pass.
        """
        if self.hparams.max_grad_clip > 0:
            torch.nn.utils.clip_grad_value_(
                self.parameters(), self.hparams.max_grad_clip
            )
        if self.hparams.max_grad_norm > 0:
            grad_norm = torch.nn.utils.clip_grad_norm_(
                self.parameters(), self.hparams.max_grad_norm
            )

        self.log("grad_norm/grad_norm", grad_norm)

    def optimizer_step(
        self,
        epoch,
        batch_idx,
        optimizer,
        optimizer_idx,
        optimizer_closure,
        on_tpu,
        using_native_amp,
        using_lbfgs,
    ):
        """
        Update the learning rate using the configured LR scheduler, then call optimizer.step().
        """
        if self.hparams.lr_scheduler == "noam":
            self.learning_rate = noam_learning_rate_decay(
                init_lr=self.hparams.learning_rate,
                global_step=self.trainer.global_step,
                warmup_steps=self.hparams.noam_warmup_steps,
                minimum=self.hparams.noam_min_lr,
            )

            for pg in optimizer.param_groups:
                pg["lr"] = self.learning_rate
        elif self.hparams.lr_scheduler == "none":
            pass
        else:
            raise ValueError("Unknown scheduler:", self.hparams.lr_scheduler)

        # TODO(RN) we cannot log from this function, we do it in training step as a workaround
        # in the long term we should use LRMonitor
        # self.log("lr/lr", learning_rate, on_step=True)

        # update params
        optimizer.step(closure=optimizer_closure)

    def prepare_cond(self, jt_data, ctrl_data):
        nn, seqlen, n_feats = jt_data.shape

        jt_data = jt_data.reshape((nn, seqlen * n_feats))

        nn, seqlen, n_feats = ctrl_data.shape
        ctrl_data = ctrl_data.reshape((nn, seqlen * n_feats))

        cond = torch.cat((jt_data, ctrl_data), axis=1).unsqueeze(-1)

        return cond.cuda()

    def save_videos(self, motion_data, output_file_path):
        """
        Render one video per motion segment with the animator.

        Args:
            motion_data:       iterable of motion segments; the position of a
                               segment selects its audio file and transcript
                               from the datamodule
            output_file_path:  path prefix; "_<clip_idx>.mp4" is appended per clip
        """
        progress = tqdm(
            enumerate(motion_data),
            desc="Visualizing model predictions...",
            leave=False,
        )

        for clip_idx, motion_segment in progress:
            datamodule = self.trainer.datamodule
            clip_audio = datamodule.visualization_audio_paths[clip_idx]
            clip_caption = datamodule.visualization_text_transcripts[clip_idx]
            target_file = f"{output_file_path}_{clip_idx}.mp4"

            self.animator.create_video(
                upper_body_joint_rotations=motion_segment,
                output_file=target_file,
                audio_file=clip_audio,
                caption=clip_caption,
            )

    def validation_epoch_end(self, val_losses):
        loss_val = torch.Tensor(val_losses).mean(axis=0)
        self.log("val_loss/val_loss_generative", loss_val)

        if (
            self.current_epoch > 0
            and self.current_epoch % self.hparams.track_velocities_every_n_epochs == 0
        ):
            samples = self.sample_from_model(
                control_data=self.velocity_tracking_input,
                eps_std=self.hparams.sampling_temp,
            )

            self.log_velocity_stats(samples)

        if (
            self.current_epoch > 0
            and self.current_epoch % self.hparams.save_videos_every_n_epochs == 0
        ):
            samples = self.sample_from_model(
                control_data=self.validation_sequence_input,
                eps_std=self.hparams.sampling_temp,
            )

            self.save_videos(
                samples.numpy(),
                join(
                    self.trainer.logger.log_dir,
                    f"sampled_{self.current_epoch}_temp{self.hparams.sampling_temp*100}",
                ),
            )

    def compute_velocitites(self, motion_batch, bin_width=0.001):
        joint_positions = np.stack(
            [
                self.motion_converter.to_joint_positions(joint_rotations)
                for joint_rotations in motion_batch
            ]
        )

        coordinate_velocities = np.diff(joint_positions, n=1, axis=1)
        joint_velocities = np.linalg.norm(coordinate_velocities, axis=3)

        n_joints = joint_velocities.shape[2]
        joint_velocities = joint_velocities.reshape(-1, n_joints)

        return joint_velocities

    def log_velocity_stats(self, samples):
        """
        Log Wasserstein distances between sampled and ground-truth joint speeds.

        For every joint the distance between the speed values of `samples` and
        those of the datamodule's `velocity_histogram_output` is computed; the
        mean, max and min over joints and the per-joint values are logged.

        Args:
            samples:  tensor of sampled motion, as returned by sample_from_model()
        """
        samples = samples.detach().cpu()

        # Drop the trailing `n_lookahead` frames of the ground truth. An explicit
        # end index is used because `[:, :-0]` would select nothing when
        # n_lookahead is zero.
        gt_motion = self.trainer.datamodule.velocity_histogram_output
        n_gt_frames = max(gt_motion.shape[1] - self.hparams.n_lookahead, 0)
        ground_truth = gt_motion[:, :n_gt_frames]

        sample_joint_velocities = self.compute_velocitites(samples)
        gt_joint_velocities = self.compute_velocitites(ground_truth)

        wasserstein_distances_per_joint = [
            wasserstein_distance(
                sample_joint_velocities[:, i], gt_joint_velocities[:, i]
            )
            for i in range(gt_joint_velocities.shape[1])
        ]

        self.log_dict(
            {
                "velocity_wasserstein_dist/mean": np.mean(
                    wasserstein_distances_per_joint
                ),
                "velocity_wasserstein_dist/max": max(wasserstein_distances_per_joint),
                "velocity_wasserstein_dist/min": min(wasserstein_distances_per_joint),
            },
        )

        joint_names, _ = get_upper_body_joint_names_and_idxs()
        self.log_dict(
            {
                f"velocity_wasserstein_dist/{joint_names[i]}": joint_dist
                for i, joint_dist in enumerate(wasserstein_distances_per_joint)
            },
        )

    def sample_from_model(self, control_data, eps_std):
        """
        Autoregressively sample motion for every sequence in `control_data`.

        Sampling starts from `seqlen` all-zero poses; each step conditions the
        flow on the last `seqlen` poses and a window of
        `seqlen + 1 + n_lookahead` control frames.

        Args:
            control_data:  control features indexed as (sample, timestep, feature)
            eps_std:       sampling temperature passed on to the reverse flow

        Returns:
            CPU tensor built from the output of `inv_standardize`, computed from
            samples of shape (n_samples, n_timesteps - n_lookahead, x_channels).
        """
        self.init_lstm_hidden()
        control_data = control_data.to(self.device)

        seqlen = self.hparams.seqlen
        n_lookahead = self.hparams.n_lookahead
        n_motion_feats = self.hparams.x_channels
        n_samples, n_timesteps, _ = control_data.shape

        sampled_all = torch.zeros(
            (n_samples, n_timesteps - n_lookahead, n_motion_feats)
        ).to(self.device)

        # initialize from a mean pose
        autoreg = torch.zeros((n_samples, seqlen, n_motion_feats)).to(self.device)

        sampled_all[:, :seqlen, :] = autoreg

        # Number of control frames visible at each step (loop invariant)
        reserved = seqlen + 1 + n_lookahead

        # Temporarily change the expected batch size to the number of evaluation
        # seq's. A copy is kept and a new list is installed so that the original
        # shape really is restored afterwards, even if sampling raises.
        orig_z_shape = list(self.z_shape)
        self.z_shape = [n_samples] + orig_z_shape[1:]
        try:
            # Loop through control sequence and generate new data
            for i in tqdm(
                range(0, n_timesteps - seqlen - n_lookahead),
                leave=False,
                desc="Sampling from model...",
            ):
                control = control_data[:, i : i + reserved, :]
                # prepare conditioning for moglow (control + previous poses)
                cond = self.prepare_cond(autoreg, control)

                # sample from Moglow
                sampled = self(
                    z=None, cond=cond, eps_std=eps_std, reverse=True
                ).squeeze(-1)
                # store the sampled frame
                sampled_all[:, i + seqlen, :] = sampled

                # update saved pose sequence
                autoreg = torch.cat((autoreg[:, 1:, :], sampled[:, None, :]), dim=1)
        finally:
            # Reset the expected batch size
            self.z_shape = orig_z_shape

        # Unnormalize the samples
        sampled_all = torch.as_tensor(
            inv_standardize(sampled_all.cpu().numpy(), self.motion_scaler)
        )

        return sampled_all

    def init_actnorm(self, x, cond):
        self(
            x[: self.hparams.batch_size, ...],
            cond[: self.hparams.batch_size, ...] if cond is not None else None,
        )

        # Reinitialize the LSTM
        self.init_lstm_hidden()

    def forward(self, x=None, cond=None, z=None, eps_std=None, reverse=False):
        if not reverse:
            return self.normal_flow(x, cond)
        else:
            return self.reverse_flow(z, cond, eps_std)

    def normal_flow(self, x, cond):
        """
        Encode `x` into the latent space and compute its negative log-likelihood.

        The objective returned by the flow is increased in place by the prior's
        log-probability of `z`, then negated and divided by
        `log(2) * n_timesteps`.

        Returns:
            The pair (z, nll).
        """
        n_timesteps = glow.thops.timesteps(x)

        # One zero-initialised log-determinant entry per sample
        initial_logdet = torch.zeros_like(x[:, 0, 0])

        # encode
        z, log_likelihood = self.flow(x, cond, logdet=initial_logdet, reverse=False)

        # prior (kept as an in-place update, as the flow's output tensor is reused)
        log_likelihood += self.distribution.logp(z)

        normaliser = float(np.log(2.0) * n_timesteps)
        nll = (-log_likelihood) / normaliser
        return z, nll

    def reverse_flow(self, z, cond, eps_std):
        with torch.no_grad():

            z_shape = self.z_shape
            if z is None:
                z = self.distribution.sample(z_shape, eps_std, device=cond.device)

            x = self.flow(z, cond, eps_std=eps_std, reverse=True)
        return x

    def set_actnorm_init(self, inited=True):
        for name, m in self.named_modules():
            if m.__class__.__name__.find("ActNorm") >= 0:
                m.inited = inited

    @staticmethod
    def loss_generative(nll):
        # Generative loss
        return torch.mean(nll)