From c88aa4fe19836a142eee4c5f2190c6391c011db5 Mon Sep 17 00:00:00 2001 From: Payam Jome Yazdian Date: Tue, 6 Feb 2024 23:48:07 -0800 Subject: [PATCH] Add files via upload --- scripts/data_loader/data_preprocessor.py | 946 ++++---- scripts/data_loader/gesture_labels.txt | 822 +++---- scripts/data_loader/lmdb_data_loader.py | 2646 +++++++++++----------- 3 files changed, 2207 insertions(+), 2207 deletions(-) diff --git a/scripts/data_loader/data_preprocessor.py b/scripts/data_loader/data_preprocessor.py index 8194eb0..d4d0b89 100644 --- a/scripts/data_loader/data_preprocessor.py +++ b/scripts/data_loader/data_preprocessor.py @@ -1,473 +1,473 @@ -"""create data samples -""" - -import math -import pickle -import os -from typing import Tuple - -import lmdb -import numpy as np -import pyarrow -import torch -import librosa -from tqdm import tqdm -from configargparse import argparse -from model.vocab import Vocab - -import utils - -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - -class DataPreprocessor: - """Loads and extracts skeleton, audio and video data from Lmdb files and writes those entires into a separate new Lmdb file. - - Attributes: - src_lmdb_env: A Lmdb object containing the origin database environment (similar to PostgreSQL schema). - dst_lmdb_env: A Lmdb object containing the destination database environment. - n_videos: An integer number of entries in the database (equal to the number of videos in the training set). - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - audio_sample_length: An integer length of the audio clip in hertz (sampled at 16,000 Hz). - n_out_samples: An integer total number of database entries (audio, video and skeleton) that has been extracted from the original videos. - sentence_frame_length: An integer number of frames in each clip but for sentences rather than gestures. - audio_sampling_rate: An integer sampling rate for an audio signal. - DAE_frame_level: A DAE model only if args.name in the initialization method is not 'DAE'. - rnn_representation: A VQVAE model only if 'sentence_level' is True else None. - ckpt_path_DAE: A string filepath to a saved 'DAE' checkpoint model. - ckpt_path_Autoencode: A string filepath to a saved VQVAE checkpoint model. - """ - def __init__(self, args: argparse.Namespace, clip_lmdb_dir: str, out_lmdb_dir: str, n_poses: int, subdivision_stride: int, pose_resampling_fps: int, sentence_level: bool = False): - """Initialize with several dataset parameters. - - Initializes database connections to the lmdb files. Note that the connections are open until closed by the run method. - - The args argument must contain the following keys: - name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). - rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. - autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. - sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). - - Args: - args: A configargparser object with specific parameters (See above). - clip_lmdb_dir: A string filepath containing the lmdb dataset files. 
- out_lmdb_dir: A string filepath to save output as lmdb files. - n_poses: An integer number of frames per second in each clip in the dataset (ex. 30 in 30 fps). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (may overlap). - pose_resampling_fps: An integer frames per second for training (may differ from clip fps). - sentence_level: A boolean flag to add a language model. - """ - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.sentence_level = sentence_level - self.src_lmdb_env: lmdb.Environment = lmdb.open(clip_lmdb_dir, readonly=True, lock=False) - self.out_lmdb_dir = out_lmdb_dir - with self.src_lmdb_env.begin() as txn: - self.n_videos: int = txn.stat()['entries'] - - self.audio_sample_length = int(self.n_poses / self.skeleton_resampling_fps * 16000) - - self.ckpt_path_DAE: str = args.rep_learning_checkpoint - self.ckpt_path_Autoencode: str = args.autoencoder_checkpoint - - if args.name != "DAE": - self.DAE_frame_level: Tuple[argparse.Namespace, torch.nn.Module, torch.nn.MSELoss, Vocab, int] = utils.train_utils.load_checkpoint_and_model( - self.ckpt_path_DAE, device,'DAE') - - if self.sentence_level: - self.rnn_representation: Tuple[argparse.Namespace, torch.nn.Module, torch.nn.MSELoss, Vocab, int] = utils.train_utils.load_checkpoint_and_model( - self.ckpt_path_Autoencode, device, 'autoencoder_vq') - - # create db for samples - map_size = 1024 * 50 # in MB - map_size <<= 20 # in B - self.dst_lmdb_env: lmdb.Environment = lmdb.open(out_lmdb_dir, map_size=map_size) - self.n_out_samples = 0 - - self.sentence_frame_length = args.sentence_frame_length - self.audio_sampling_rate = 16000 - - - def run(self) -> None: - """Extract skeleton, audio, word data from source and write entries into a destination Lmdb file. - - Closes both src_lmdb_env and dst_lmdb_env database connections upon completion. - Does not return any values. Modifies internal state of the object (Close db connection). - """ - src_txn = self.src_lmdb_env.begin(write=False) - total_count = src_txn.stat()['entries'] - - # sampling and normalization - cursor = src_txn.cursor() - counter = 0 - for key, value in tqdm(cursor): - print("video ", counter, "of", total_count, '\n') - video = pyarrow.deserialize(value) - vid = video['vid'] - clips = video['clips'] - for clip_idx, clip in enumerate(clips): - self._sample_from_clip(vid, clip) - counter = counter + 1 - - # print number of samples - with self.dst_lmdb_env.begin() as txn: - print("Sample_counter", txn.stat()['entries']) - - # close db - self.src_lmdb_env.close() - self.dst_lmdb_env.sync() - self.dst_lmdb_env.close() - - def _sample_from_clip(self, vid: str, clip: dict) -> None: - """Internal function to extract and write skeleton, audio and word data from provided clip. - - Modifies internal state of the object (n_out_samples, n_poses, audio_sample_length). - #TODO - - Args: - vid: A string representing the name or id of the clip. - clip: A dictionary containing the following string keys: - 'poses': A Numpy array of pose/gesture data. - 'audio_raw': A Numpy array of audio data. - 'words': A list of lists. Each internal list contains 3 elements: - index 0: A float start time. - index 1: A float end time. - index 2: A string word. 
- """ - clip_skeleton: np.ndarray = clip['poses'] - clip_audio_raw: np.ndarray = clip['audio_raw'] - clip_word_list: list[list] = clip['words'] - - # divide - aux_info = [] - sample_skeletons_list = [] - sample_words_list = [] - - sample_audio_list_mels = [] - sample_audio_list_raws = [] - - sentence_leve_latents_list = [] - GPT_3_STR_list = [] - GPT_3_Embedding_list = [] - - if self.sentence_level: - self.n_poses = self.sentence_frame_length - self.audio_sample_length = int(self.n_poses / self.skeleton_resampling_fps * self.audio_sampling_rate) - - num_subdivision = math.floor( - (len(clip_skeleton) - self.n_poses) - / self.subdivision_stride) + 1 # floor((K - (N+M)) / S) + 1 - - # Sentence level preparation: - - ''' - self.sentence_level = False - if self.sentence_level: - print("Sentence level") - - str_transcript = "" - for i_t in range(len(clip_word_list)): - str_transcript = str_transcript + " " + clip_word_list[i_t][0] - - import nltk # library - # nltk.download() - sentences = nltk.sent_tokenize(str_transcript) # whole paragraph break into sentence. - - start_sentence_word_index = 0 - end_sentence_word_index = 0 - for sentence in sentences: - x = sentence.replace('.', '').strip().split(' ') - - end_sentence_word_index = start_sentence_word_index + len(sentence.replace('.', '').strip().split(' ')) - 1 # to index - # process - - reconstructed_sentence = clip_word_list[start_sentence_word_index:end_sentence_word_index+1] - - start_idx = int( clip_word_list[start_sentence_word_index][1] * self.skeleton_resampling_fps ) - try: - fin_idx = int( clip_word_list[end_sentence_word_index][2] * self.skeleton_resampling_fps ) - except: - - print() - # ........................ - sample_skeletons = clip_skeleton[start_idx:fin_idx] - subdivision_start_time = start_idx / self.skeleton_resampling_fps - subdivision_end_time = fin_idx / self.skeleton_resampling_fps - sample_words = self.get_words_in_time_range(word_list=clip_word_list, - start_time=subdivision_start_time, - end_time=subdivision_end_time) - if len(sample_words) < 2: - continue - - # raw audio - audio_start = math.floor(start_idx / len(clip_skeleton) * len(clip_audio_raw)) - audio_end = audio_start + self.audio_sample_length - sample_audio = clip_audio_raw[audio_start:audio_end] - - motion_info = {'vid': vid, - 'start_frame_no': start_idx, - 'end_frame_no': fin_idx, - 'start_time': subdivision_start_time, - 'end_time': subdivision_end_time} - - sample_skeletons_list.append(sample_skeletons) - sample_words_list.append(sample_words) - sample_audio_list.append(sample_audio) - aux_info.append(motion_info) - - - - start_sentence_word_index = end_sentence_word_index + 1 - print() - ''' - # Loading cached GP3 requestes - address = self.out_lmdb_dir + '_' + vid + '.gpt' - if os.path.exists(self.out_lmdb_dir + '_' + vid + '.gpt'): - loaded_gpt3 = pickle.load(open(address, 'rb')) - else: - loaded_gpt3 = None - - for i in tqdm(range(num_subdivision)): - - - - start_idx = i * self.subdivision_stride - fin_idx = start_idx + self.n_poses - - sample_skeletons = clip_skeleton[start_idx:fin_idx] - subdivision_start_time = start_idx / self.skeleton_resampling_fps - subdivision_end_time = fin_idx / self.skeleton_resampling_fps - sample_words = self.get_words_in_time_range(word_list=clip_word_list, - start_time=subdivision_start_time, - end_time=subdivision_end_time) - - - if len(sample_words) < 4: - continue - - # raw audio - audio_start = math.floor(start_idx / len(clip_skeleton) * len(clip_audio_raw)) - audio_end = audio_start + self.audio_sample_length 
- sample_audio = clip_audio_raw[audio_start:audio_end] - - mel_chunks = [] - raw_chunks = [] - for audio_sub in range(self.audio_sample_length//self.audio_sampling_rate): - audio_chunk = sample_audio[audio_sub*self.audio_sampling_rate: (audio_sub+1)*self.audio_sampling_rate] - signal = librosa.feature.melspectrogram(y=audio_chunk, sr=self.audio_sampling_rate) - signal = librosa.power_to_db(signal, ref=np.max) - mel_chunks.append(signal) - # raw_chunks.append(audio_chunk) - raw_chunks.append(0) - # signal = librosa.amplitude_to_db(signal) - - - - - - - - motion_info = {'vid': vid, - 'start_frame_no': start_idx, - 'end_frame_no': fin_idx, - 'start_time': subdivision_start_time, - 'end_time': subdivision_end_time} - - sample_skeletons_list.append(sample_skeletons) - sample_words_list.append(sample_words) - - sample_audio_list_mels.append(mel_chunks) - sample_audio_list_raws.append(raw_chunks) - - aux_info.append(motion_info) - if self.sentence_level: - - # For the input side: - # GPT-3 Embeddomg - str_input = "" - for word in sample_words: - str_input += ' ' + word[0] - - GPT_3_features = self.GPT_3_caller(str_input, loaded_gpt3) - GPT_3_Embedding_list.append(GPT_3_features) - GPT_3_STR_list.append(str_input) - # Discretization for the output side: - sentence_leve_latents_list.append(self.get_pose_latent(sample_skeletons)) - - # if i>100: - # break - - - - object_to_save = {'sample_words_list':GPT_3_STR_list, - 'GPT_3_Embedding_list': GPT_3_Embedding_list} - pickle.dump(object_to_save, open(self.out_lmdb_dir + '_' + vid + '.gpt','wb')) - print("Embedding Saved!") - if len(sample_skeletons_list) > 0: - with self.dst_lmdb_env.begin(write=True) as txn: - if self.sentence_level: - for words, poses, audio_raws, audio_mels, aux, sentence_leve_latents , GPT_3_Embedding in \ - zip(sample_words_list, sample_skeletons_list, - sample_audio_list_raws, sample_audio_list_mels, - aux_info, sentence_leve_latents_list, GPT_3_Embedding_list): - poses = np.asarray(poses) - GPT_3_Embedding = np.array(GPT_3_Embedding) - # save - k = '{:010}'.format(self.n_out_samples).encode('ascii') - v = [words, poses, audio_raws, audio_mels, aux, sentence_leve_latents, GPT_3_Embedding] - v = pyarrow.serialize(v).to_buffer() - txn.put(k, v) - self.n_out_samples += 1 - else: - for words, poses, audio, aux in zip(sample_words_list, sample_skeletons_list, - sample_audio_list, aux_info): - poses = np.asarray(poses) - - # save - k = '{:010}'.format(self.n_out_samples).encode('ascii') - v = [words, poses, audio, aux] - v = pyarrow.serialize(v).to_buffer() - txn.put(k, v) - self.n_out_samples += 1 - - @staticmethod - def get_words_in_time_range(word_list: list[list], start_time: float, end_time: float) -> list[list]: - """Retrieves words in the list that fall between the start_time and end_time provided. - - Args: - word_list: A list that each element contains a list with three elements: - index 0: A float start time. - index 1: A float end time. - index 2: A string word. - start_time: A float indicating when to start filtering. - end_time: A float indicating when to end filtering. - - Returns: - A list containing all elements in the word_list that fall between the start_time and end_time provided. - """ - words = [] - - for word in word_list: - _, word_s, word_e = word[0], word[1], word[2] - - if word_s >= end_time: - break - - if word_e <= start_time: - continue - - words.append(word) - - return words - - - def get_pose_latent(self, poses: np.ndarray) -> np.ndarray: - """TODO - """ - - # 1. 
Load models - args, DAE, loss_fn, lang_model, out_dim = self.DAE_frame_level - args, rnn, loss_fn, lang_model, out_dim = self.rnn_representation - DAE.train(False) - rnn.train(False) - - - # poses, poses_mirror = process_bvh(bvh_file) - - mean = np.array(args.data_mean).squeeze() - std = np.array(args.data_std).squeeze() - std = np.clip(std, a_min=0.01, a_max=None) - out_poses = (np.copy(poses) - mean) / std - - target = torch.from_numpy(out_poses) - # target = torch.unsqueeze(target,2) - target = target.to(device).float() - reconstructed = [] - # for i in range(len(out_poses)): - # input = torch.unsqueeze(target[i],0) - # current_out = pose_decoder(input) - # reconstructed.append(current_out) - - if DAE.encoder == None: - encoded = target - else: - encoded = DAE.encoder(target) - # encoded = torch.squeeze(encoded, 2) - # encoded = encoded.to('cpu') - # encoded = encoded.detach().numpy() - # all_frames_from_rnn = None - - result = np.zeros((len(encoded)//args.n_poses, rnn.decoder.n_layers, args.hidden_size)) - result_index = 0 - for i in range(0, len(encoded), args.n_poses): - current_dict = dict() - input_seq = encoded[i:i + args.n_poses] - - current_dict['original'] = poses[i: i + args.n_poses] - current_dict['latent_linear'] = encoded[i: i + args.n_poses].detach().clone().to('cpu').numpy() - - input_pre_seq = encoded[i] - output_seq = encoded[i:i + args.n_poses] - - input_seq = torch.unsqueeze(input_seq, 0) - input_seq = input_seq.transpose(0, 1) - use_drivitive = args.use_derivitive=='True' - if use_drivitive: - diff = [(input_seq[n, :] - input_seq[n - 1, :]) for n in range(1, input_seq.shape[0])] - diff.insert(0, torch.zeros_like(input_seq[0, :])) - input_seq = torch.cat((input_seq, torch.stack(diff)), dim=2) - - # output_seq = torch.unsqueeze(output_seq, 0) - # output_seq = output_seq.transpose(0, 1) - output_seq = input_seq.clone() - - - - - reconstructed_rnn = torch.zeros(args.n_poses, output_seq.size(1), rnn.decoder.output_size) \ - .to(output_seq.device) - # run words through encoder - encoder_outputs, encoder_hidden = rnn.encoder(input_seq, None) - - if rnn.VAE: - decoder_hidden = encoder_hidden[:rnn.decoder.n_layers] # use last hidden state from encoder - # [2, 128, 200] - # print("decoder_hidden!!! org", decoder_hidden.shape) - decoder_hidden = decoder_hidden.transpose(1, 0).contiguous() # [128, 2, 200] - decoder_hidden = torch.reshape(decoder_hidden, (decoder_hidden.shape[0], -1)) - mean = rnn.VAE_fc_mean(decoder_hidden) - logvar = rnn.VAE_fc_std(decoder_hidden) - z = rnn.reparameterize(mean, logvar, train=False) - z = rnn.VAE_fc_decoder(z) - decoder_hidden = z.reshape(decoder_hidden.shape[0], - rnn.decoder.n_layers, -1) - decoder_hidden = decoder_hidden.transpose(1, 0).contiguous() - # print("decoder_hidden!!! modified", decoder_hidden.shape) - decoder_first_hidden = decoder_hidden - else: - decoder_hidden = encoder_hidden[:rnn.decoder.n_layers] # use last hidden state from encoder - # print("decoder_hidden!!! 
not VAE ", decoder_hidden.shape) - - current_dict['latent_rnn'] = torch.squeeze(decoder_hidden.detach().clone(), 1).to('cpu').numpy() - result[result_index] = torch.squeeze(decoder_hidden.detach().clone(), 1).to('cpu').numpy() - result_index = result_index + 1 - - return result - - def GPT_3_caller(self, str_input, pickle_file_loaded): - - return 1 - - if pickle_file_loaded != None: - for i in range(len(pickle_file_loaded['sample_words_list'])): - if str_input == pickle_file_loaded['sample_words_list'][i]: - print("Embedding found:", str_input) - return pickle_file_loaded['GPT_3_Embedding_list'][i] - - request = openai.Embedding.create(input=str_input, engine="text-similarity-ada-001") - embedding = request['data'][0]['embedding'] - print("!!!!!!!!!!!!!!!!!!!Embedding requested:", str_input) - return embedding - +"""create data samples +""" + +import math +import pickle +import os +from typing import Tuple + +import lmdb +import numpy as np +import pyarrow +import torch +import librosa +from tqdm import tqdm +from configargparse import argparse +from model.vocab import Vocab + +import utils + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +class DataPreprocessor: + """Loads and extracts skeleton, audio and video data from Lmdb files and writes those entires into a separate new Lmdb file. + + Attributes: + src_lmdb_env: A Lmdb object containing the origin database environment (similar to PostgreSQL schema). + dst_lmdb_env: A Lmdb object containing the destination database environment. + n_videos: An integer number of entries in the database (equal to the number of videos in the training set). + n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + audio_sample_length: An integer length of the audio clip in hertz (sampled at 16,000 Hz). + n_out_samples: An integer total number of database entries (audio, video and skeleton) that has been extracted from the original videos. + sentence_frame_length: An integer number of frames in each clip but for sentences rather than gestures. + audio_sampling_rate: An integer sampling rate for an audio signal. + DAE_frame_level: A DAE model only if args.name in the initialization method is not 'DAE'. + rnn_representation: A VQVAE model only if 'sentence_level' is True else None. + ckpt_path_DAE: A string filepath to a saved 'DAE' checkpoint model. + ckpt_path_Autoencode: A string filepath to a saved VQVAE checkpoint model. + """ + def __init__(self, args: argparse.Namespace, clip_lmdb_dir: str, out_lmdb_dir: str, n_poses: int, subdivision_stride: int, pose_resampling_fps: int, sentence_level: bool = False): + """Initialize with several dataset parameters. + + Initializes database connections to the lmdb files. Note that the connections are open until closed by the run method. + + The args argument must contain the following keys: + name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). + rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. + autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. + sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). 
+ + Args: + args: A configargparser object with specific parameters (See above). + clip_lmdb_dir: A string filepath containing the lmdb dataset files. + out_lmdb_dir: A string filepath to save output as lmdb files. + n_poses: An integer number of frames per second in each clip in the dataset (ex. 30 in 30 fps). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (may overlap). + pose_resampling_fps: An integer frames per second for training (may differ from clip fps). + sentence_level: A boolean flag to add a language model. + """ + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.skeleton_resampling_fps = pose_resampling_fps + self.sentence_level = sentence_level + self.src_lmdb_env: lmdb.Environment = lmdb.open(clip_lmdb_dir, readonly=True, lock=False) + self.out_lmdb_dir = out_lmdb_dir + with self.src_lmdb_env.begin() as txn: + self.n_videos: int = txn.stat()['entries'] + + self.audio_sample_length = int(self.n_poses / self.skeleton_resampling_fps * 16000) + + self.ckpt_path_DAE: str = args.rep_learning_checkpoint + self.ckpt_path_Autoencode: str = args.autoencoder_checkpoint + + if args.name != "Frame_Level": + self.DAE_frame_level: Tuple[argparse.Namespace, torch.nn.Module, torch.nn.MSELoss, Vocab, int] = utils.train_utils.load_checkpoint_and_model( + self.ckpt_path_DAE, device,'DAE') + + if self.sentence_level: + self.rnn_representation: Tuple[argparse.Namespace, torch.nn.Module, torch.nn.MSELoss, Vocab, int] = utils.train_utils.load_checkpoint_and_model( + self.ckpt_path_Autoencode, device, 'autoencoder_vq') + + # create db for samples + map_size = 1024 * 50 # in MB + map_size <<= 20 # in B + self.dst_lmdb_env: lmdb.Environment = lmdb.open(out_lmdb_dir, map_size=map_size) + self.n_out_samples = 0 + + self.sentence_frame_length = args.sentence_frame_length + self.audio_sampling_rate = 16000 + + + def run(self) -> None: + """Extract skeleton, audio, word data from source and write entries into a destination Lmdb file. + + Closes both src_lmdb_env and dst_lmdb_env database connections upon completion. + Does not return any values. Modifies internal state of the object (Close db connection). + """ + src_txn = self.src_lmdb_env.begin(write=False) + total_count = src_txn.stat()['entries'] + + # sampling and normalization + cursor = src_txn.cursor() + counter = 0 + for key, value in tqdm(cursor): + print("video ", counter, "of", total_count, '\n') + video = pyarrow.deserialize(value) + vid = video['vid'] + clips = video['clips'] + for clip_idx, clip in enumerate(clips): + self._sample_from_clip(vid, clip) + counter = counter + 1 + + # print number of samples + with self.dst_lmdb_env.begin() as txn: + print("Sample_counter", txn.stat()['entries']) + + # close db + self.src_lmdb_env.close() + self.dst_lmdb_env.sync() + self.dst_lmdb_env.close() + + def _sample_from_clip(self, vid: str, clip: dict) -> None: + """Internal function to extract and write skeleton, audio and word data from provided clip. + + Modifies internal state of the object (n_out_samples, n_poses, audio_sample_length). + #TODO + + Args: + vid: A string representing the name or id of the clip. + clip: A dictionary containing the following string keys: + 'poses': A Numpy array of pose/gesture data. + 'audio_raw': A Numpy array of audio data. + 'words': A list of lists. Each internal list contains 3 elements: + index 0: A float start time. + index 1: A float end time. + index 2: A string word. 
+ """ + clip_skeleton: np.ndarray = clip['poses'] + clip_audio_raw: np.ndarray = clip['audio_raw'] + clip_word_list: list[list] = clip['words'] + + # divide + aux_info = [] + sample_skeletons_list = [] + sample_words_list = [] + + sample_audio_list_mels = [] + sample_audio_list_raws = [] + + sentence_leve_latents_list = [] + GPT_3_STR_list = [] + GPT_3_Embedding_list = [] + + if self.sentence_level: + self.n_poses = self.sentence_frame_length + self.audio_sample_length = int(self.n_poses / self.skeleton_resampling_fps * self.audio_sampling_rate) + + num_subdivision = math.floor( + (len(clip_skeleton) - self.n_poses) + / self.subdivision_stride) + 1 # floor((K - (N+M)) / S) + 1 + + # Sentence level preparation: + + ''' + self.sentence_level = False + if self.sentence_level: + print("Sentence level") + + str_transcript = "" + for i_t in range(len(clip_word_list)): + str_transcript = str_transcript + " " + clip_word_list[i_t][0] + + import nltk # library + # nltk.download() + sentences = nltk.sent_tokenize(str_transcript) # whole paragraph break into sentence. + + start_sentence_word_index = 0 + end_sentence_word_index = 0 + for sentence in sentences: + x = sentence.replace('.', '').strip().split(' ') + + end_sentence_word_index = start_sentence_word_index + len(sentence.replace('.', '').strip().split(' ')) - 1 # to index + # process + + reconstructed_sentence = clip_word_list[start_sentence_word_index:end_sentence_word_index+1] + + start_idx = int( clip_word_list[start_sentence_word_index][1] * self.skeleton_resampling_fps ) + try: + fin_idx = int( clip_word_list[end_sentence_word_index][2] * self.skeleton_resampling_fps ) + except: + + print() + # ........................ + sample_skeletons = clip_skeleton[start_idx:fin_idx] + subdivision_start_time = start_idx / self.skeleton_resampling_fps + subdivision_end_time = fin_idx / self.skeleton_resampling_fps + sample_words = self.get_words_in_time_range(word_list=clip_word_list, + start_time=subdivision_start_time, + end_time=subdivision_end_time) + if len(sample_words) < 2: + continue + + # raw audio + audio_start = math.floor(start_idx / len(clip_skeleton) * len(clip_audio_raw)) + audio_end = audio_start + self.audio_sample_length + sample_audio = clip_audio_raw[audio_start:audio_end] + + motion_info = {'vid': vid, + 'start_frame_no': start_idx, + 'end_frame_no': fin_idx, + 'start_time': subdivision_start_time, + 'end_time': subdivision_end_time} + + sample_skeletons_list.append(sample_skeletons) + sample_words_list.append(sample_words) + sample_audio_list.append(sample_audio) + aux_info.append(motion_info) + + + + start_sentence_word_index = end_sentence_word_index + 1 + print() + ''' + # Loading cached GP3 requestes + address = self.out_lmdb_dir + '_' + vid + '.gpt' + if os.path.exists(self.out_lmdb_dir + '_' + vid + '.gpt'): + loaded_gpt3 = pickle.load(open(address, 'rb')) + else: + loaded_gpt3 = None + + for i in tqdm(range(num_subdivision)): + + + + start_idx = i * self.subdivision_stride + fin_idx = start_idx + self.n_poses + + sample_skeletons = clip_skeleton[start_idx:fin_idx] + subdivision_start_time = start_idx / self.skeleton_resampling_fps + subdivision_end_time = fin_idx / self.skeleton_resampling_fps + sample_words = self.get_words_in_time_range(word_list=clip_word_list, + start_time=subdivision_start_time, + end_time=subdivision_end_time) + + + if len(sample_words) < 4: + continue + + # raw audio + audio_start = math.floor(start_idx / len(clip_skeleton) * len(clip_audio_raw)) + audio_end = audio_start + self.audio_sample_length 
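+            # Map the pose-frame window [start_idx, fin_idx) onto the raw waveform
+            # proportionally, then take audio_sample_length samples from that offset.
+            # The slice is split below into 1-second chunks (audio_sampling_rate samples
+            # each) and converted to log-scaled mel spectrograms; raw chunks are stubbed
+            # with 0 to keep the stored samples small.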
+ sample_audio = clip_audio_raw[audio_start:audio_end] + + mel_chunks = [] + raw_chunks = [] + for audio_sub in range(self.audio_sample_length//self.audio_sampling_rate): + audio_chunk = sample_audio[audio_sub*self.audio_sampling_rate: (audio_sub+1)*self.audio_sampling_rate] + signal = librosa.feature.melspectrogram(y=audio_chunk, sr=self.audio_sampling_rate) + signal = librosa.power_to_db(signal, ref=np.max) + mel_chunks.append(signal) + # raw_chunks.append(audio_chunk) + raw_chunks.append(0) + # signal = librosa.amplitude_to_db(signal) + + + + + + + + motion_info = {'vid': vid, + 'start_frame_no': start_idx, + 'end_frame_no': fin_idx, + 'start_time': subdivision_start_time, + 'end_time': subdivision_end_time} + + sample_skeletons_list.append(sample_skeletons) + sample_words_list.append(sample_words) + + sample_audio_list_mels.append(mel_chunks) + sample_audio_list_raws.append(raw_chunks) + + aux_info.append(motion_info) + if self.sentence_level: + + # For the input side: + # GPT-3 Embeddomg + str_input = "" + for word in sample_words: + str_input += ' ' + word[0] + + GPT_3_features = self.GPT_3_caller(str_input, loaded_gpt3) + GPT_3_Embedding_list.append(GPT_3_features) + GPT_3_STR_list.append(str_input) + # Discretization for the output side: + sentence_leve_latents_list.append(self.get_pose_latent(sample_skeletons)) + + # if i>100: + # break + + + + object_to_save = {'sample_words_list':GPT_3_STR_list, + 'GPT_3_Embedding_list': GPT_3_Embedding_list} + pickle.dump(object_to_save, open(self.out_lmdb_dir + '_' + vid + '.gpt','wb')) + print("Embedding Saved!") + if len(sample_skeletons_list) > 0: + with self.dst_lmdb_env.begin(write=True) as txn: + if self.sentence_level: + for words, poses, audio_raws, audio_mels, aux, sentence_leve_latents , GPT_3_Embedding in \ + zip(sample_words_list, sample_skeletons_list, + sample_audio_list_raws, sample_audio_list_mels, + aux_info, sentence_leve_latents_list, GPT_3_Embedding_list): + poses = np.asarray(poses) + GPT_3_Embedding = np.array(GPT_3_Embedding) + # save + k = '{:010}'.format(self.n_out_samples).encode('ascii') + v = [words, poses, audio_raws, audio_mels, aux, sentence_leve_latents, GPT_3_Embedding] + v = pyarrow.serialize(v).to_buffer() + txn.put(k, v) + self.n_out_samples += 1 + else: + for words, poses, audio, aux in zip(sample_words_list, sample_skeletons_list, + sample_audio_list_raws, aux_info): + poses = np.asarray(poses) + + # save + k = '{:010}'.format(self.n_out_samples).encode('ascii') + v = [words, poses, audio, aux] + v = pyarrow.serialize(v).to_buffer() + txn.put(k, v) + self.n_out_samples += 1 + + @staticmethod + def get_words_in_time_range(word_list, start_time: float, end_time: float): + """Retrieves words in the list that fall between the start_time and end_time provided. + + Args: + word_list: A list that each element contains a list with three elements: + index 0: A float start time. + index 1: A float end time. + index 2: A string word. + start_time: A float indicating when to start filtering. + end_time: A float indicating when to end filtering. + + Returns: + A list containing all elements in the word_list that fall between the start_time and end_time provided. + """ + words = [] + + for word in word_list: + _, word_s, word_e = word[0], word[1], word[2] + + if word_s >= end_time: + break + + if word_e <= start_time: + continue + + words.append(word) + + return words + + + def get_pose_latent(self, poses: np.ndarray) -> np.ndarray: + """TODO + """ + + # 1. 
Load models + args, DAE, loss_fn, lang_model, out_dim = self.DAE_frame_level + args, rnn, loss_fn, lang_model, out_dim = self.rnn_representation + DAE.train(False) + rnn.train(False) + + + # poses, poses_mirror = process_bvh(bvh_file) + + mean = np.array(args.data_mean).squeeze() + std = np.array(args.data_std).squeeze() + std = np.clip(std, a_min=0.01, a_max=None) + out_poses = (np.copy(poses) - mean) / std + + target = torch.from_numpy(out_poses) + # target = torch.unsqueeze(target,2) + target = target.to(device).float() + reconstructed = [] + # for i in range(len(out_poses)): + # input = torch.unsqueeze(target[i],0) + # current_out = pose_decoder(input) + # reconstructed.append(current_out) + + if DAE.encoder == None: + encoded = target + else: + encoded = DAE.encoder(target) + # encoded = torch.squeeze(encoded, 2) + # encoded = encoded.to('cpu') + # encoded = encoded.detach().numpy() + # all_frames_from_rnn = None + + result = np.zeros((len(encoded)//args.n_poses, rnn.decoder.n_layers, args.hidden_size)) + result_index = 0 + for i in range(0, len(encoded), args.n_poses): + current_dict = dict() + input_seq = encoded[i:i + args.n_poses] + + current_dict['original'] = poses[i: i + args.n_poses] + current_dict['latent_linear'] = encoded[i: i + args.n_poses].detach().clone().to('cpu').numpy() + + input_pre_seq = encoded[i] + output_seq = encoded[i:i + args.n_poses] + + input_seq = torch.unsqueeze(input_seq, 0) + input_seq = input_seq.transpose(0, 1) + use_drivitive = args.use_derivitive=='True' + if use_drivitive: + diff = [(input_seq[n, :] - input_seq[n - 1, :]) for n in range(1, input_seq.shape[0])] + diff.insert(0, torch.zeros_like(input_seq[0, :])) + input_seq = torch.cat((input_seq, torch.stack(diff)), dim=2) + + # output_seq = torch.unsqueeze(output_seq, 0) + # output_seq = output_seq.transpose(0, 1) + output_seq = input_seq.clone() + + + + + reconstructed_rnn = torch.zeros(args.n_poses, output_seq.size(1), rnn.decoder.output_size) \ + .to(output_seq.device) + # run words through encoder + encoder_outputs, encoder_hidden = rnn.encoder(input_seq, None) + + if rnn.VAE: + decoder_hidden = encoder_hidden[:rnn.decoder.n_layers] # use last hidden state from encoder + # [2, 128, 200] + # print("decoder_hidden!!! org", decoder_hidden.shape) + decoder_hidden = decoder_hidden.transpose(1, 0).contiguous() # [128, 2, 200] + decoder_hidden = torch.reshape(decoder_hidden, (decoder_hidden.shape[0], -1)) + mean = rnn.VAE_fc_mean(decoder_hidden) + logvar = rnn.VAE_fc_std(decoder_hidden) + z = rnn.reparameterize(mean, logvar, train=False) + z = rnn.VAE_fc_decoder(z) + decoder_hidden = z.reshape(decoder_hidden.shape[0], + rnn.decoder.n_layers, -1) + decoder_hidden = decoder_hidden.transpose(1, 0).contiguous() + # print("decoder_hidden!!! modified", decoder_hidden.shape) + decoder_first_hidden = decoder_hidden + else: + decoder_hidden = encoder_hidden[:rnn.decoder.n_layers] # use last hidden state from encoder + # print("decoder_hidden!!! 
not VAE ", decoder_hidden.shape) + + current_dict['latent_rnn'] = torch.squeeze(decoder_hidden.detach().clone(), 1).to('cpu').numpy() + result[result_index] = torch.squeeze(decoder_hidden.detach().clone(), 1).to('cpu').numpy() + result_index = result_index + 1 + + return result + + def GPT_3_caller(self, str_input, pickle_file_loaded): + + return 1 + + if pickle_file_loaded != None: + for i in range(len(pickle_file_loaded['sample_words_list'])): + if str_input == pickle_file_loaded['sample_words_list'][i]: + print("Embedding found:", str_input) + return pickle_file_loaded['GPT_3_Embedding_list'][i] + + request = openai.Embedding.create(input=str_input, engine="text-similarity-ada-001") + embedding = request['data'][0]['embedding'] + print("!!!!!!!!!!!!!!!!!!!Embedding requested:", str_input) + return embedding + diff --git a/scripts/data_loader/gesture_labels.txt b/scripts/data_loader/gesture_labels.txt index 016b546..ea700c0 100644 --- a/scripts/data_loader/gesture_labels.txt +++ b/scripts/data_loader/gesture_labels.txt @@ -1,411 +1,411 @@ -payam,8720,1210,2930,neither,24.5149699 -payam,960,6830,7040,right,19.103924 -payam,2400,7930,7770,right,12.2262586 -payam,4850,2520,2830,neither,15.1091679 -payam,740,2820,2750,neither,18.6786855 -payam,1840,1090,7750,left,7.826561 -payam,7290,6890,6760,left,18.6590548 -payam,910,7710,7630,neither,15.9142669 -payam,110,3080,3020,right,19.3148321 -payam,4310,8430,6020,right,18.3946849 -payam,2370,6450,6640,neither,8.0961591 -payam,3760,3670,8090,left,35.6075189 -payam,7290,6890,6760,left,6.1512555 -payam,2330,1610,4290,neither,9.3917529 -payam,4960,1360,6940,neither,17.0703494 -payam,4880,2220,4050,neither,6.66984 -payam,4150,6320,5120,left,11.1385341 -payam,2380,2290,3830,neither,8.175581 -payam,6990,4350,7850,right,24.8897172 -payam,1260,1300,1150,left,8.7496713 -payam,4040,8150,270,neither,6.5148909 -payam,8210,4200,540,neither,9.3981568 -payam,1840,1090,7750,left,3.4069544 -payam,80,1770,5440,neither,6.8139126 -payam,4740,6380,6320,left,15.6245139 -payam,350,1710,820,neither,6.9307515 -payam,7980,380,440,right,6.9549693 -payam,4710,4520,6560,left,15.8368305 -payam,4150,1030,5450,right,5.8812649 -payam,4090,8210,1650,left,9.3473628 -payam,3160,3050,3300,right,15.4507672 -payam,3700,5340,4990,neither,19.4590834 -payam,5790,3250,4560,right,5.8536474 -payam,2820,2730,410,left,8.6303164 -payam,4180,6270,6780,neither,10.6426271 -payam,3450,3010,3200,neither,9.5873883 -payam,1480,2740,2400,neither,7.4411407 -payam,5200,140,2280,neither,10.3224914 -payam,7920,5360,5370,right,12.6515 -payam,8140,6670,6500,neither,8.3602584 -payam,1880,1810,5430,right,23.2110811 -payam,3190,1380,3600,left,13.4339459 -payam,50,4340,5020,neither,21.3263438 -payam,8350,8660,720,neither,5.8660808 -payam,3380,3280,1440,neither,12.1587357 -payam,1990,1450,3180,right,11.0167687 -payam,5220,3470,2030,right,17.4336008 -payam,3030,1430,7950,neither,5.1681943 -payam,2070,2690,1080,right,7.0990855 -payam,5270,760,2620,right,7.8710579 -payam,1500,5160,5060,right,13.7366622 -payam,3720,5320,980,right,3.9487614 -payam,6360,5830,6430,right,16.9997513 -payam,8100,2890,2200,neither,11.3478833 -payam,8190,8240,5770,left,9.2111841 -payam,6660,4820,3920,right,7.2671531 -payam,1380,1480,6320,left,8.1044273 -payam,2920,6550,6590,right,4.2769958 -payam,8030,5080,8270,left,8.6034509 -payam,7140,7510,4590,neither,16.6477046 -payam,2820,6050,8100,left,24.6404202 -payam,8760,7440,5770,right,13.7314397 -payam,1500,3220,670,left,11.3452669 -payam,7760,7550,5210,neither,6.2482816 
-payam,1480,3220,1500,right,12.5323615 -payam,3370,5490,770,neither,7.1390339 -payam,7810,1860,900,right,10.9310926 -payam,1480,60,1000,left,22.48273 -payam,880,370,100,right,6.3977794 -payam,6530,6470,8270,left,23.1602921 -payam,7870,5570,5710,neither,10.2806256 -payam,6440,5560,1420,left,6.4094853 -payam,3080,3090,5170,left,21.2385869 -payam,2360,3630,3810,neither,8.7701795 -payam,490,1260,1220,neither,6.672139 -payam,2290,610,2460,left,6.049926 -payam,3380,3280,1440,neither,20.8177628 -payam,2660,6610,6450,left,42.8382073 -payam,5880,2790,2520,right,9.3886082 -payam,6520,5860,5480,neither,7.0841901 -payam,3640,2700,1130,left,32.19488 -payam,2800,6870,6710,neither,11.8798787 -payam,740,2820,2750,neither,9.5489998 -payam,7590,7770,6800,neither,19.0458094 -payam,4240,2560,270,right,12.5144577 -payam,4170,8630,3110,neither,9.0806503 -payam,4870,6970,6740,neither,16.2745591 -payam,6840,6060,6290,neither,6.9093279 -payam,90,3320,4740,neither,9.6742201 -payam,7140,7510,4590,left,22.7795439 -payam,5010,2290,7770,neither,19.3744291 -payam,5960,6240,8600,right,9.0793041 -payam,1250,6260,7880,right,4.93853 -payam,4270,1320,1350,right,17.2790887 -payam,4670,1250,3220,neither,10.618701 -payam,1480,2740,2400,neither,7.0960476 -payam,4270,1320,1350,right,4.4593416 -payam,7500,6670,450,neither,6.8258252 -payam,2330,3690,3800,neither,9.1139991 -payam,6890,3890,3700,neither,4.0416935 -payam,2160,8690,5990,neither,11.627078 -payam,5230,5350,5200,left,12.7237877 -payam,5410,5190,5000,neither,14.5101507 -payam,8310,8680,3050,left,7.9895167 -payam,1600,400,980,neither,10.7849002 -payam,170,2250,1110,right,8.0488411 -payam,540,2300,1470,left,15.8741313 -payam,6520,1530,2860,neither,7.0883511 -payam,5640,5810,4930,neither,7.4146026 -payam,3220,6620,5720,right,7.7339321 -test,30,3890,1900,right,22.9397707 -test,8530,4240,8100,neither,5.9101173 -test,7420,150,560,neither,137.2592382 -test,2130,5290,6180,neither,10.5838116 -test,1370,1510,1710,neither,6.7253406 -test,2750,1950,3380,right,17.9191102 -test,6460,2490,2850,neither,29.0007036 -test,3030,1430,7950,neither,3.9712144 -test,1390,7990,5990,right,42.9268688 -Payam,7010,7880,1480,neither,29.2939266 -Payam,3670,4380,1690,neither,13.5852305 -payam3,510,3840,2050,neither,19.60391 -payam3,3120,5240,400,neither,7.5961915 -payam3,6610,1980,5910,neither,32.5094664 -payam3,2790,6640,7410,neither,6.8419322 -payam3,2500,2680,520,neither,4.9135896 -payam3,3000,6310,6340,right,15.3105667 -payam3,3430,5560,80,neither,7.3489193 -payam3,5430,6460,570,neither,18.0880914 -payam3,5710,2830,2520,neither,11.3542066 -payam3,8740,1560,590,neither,9.5529691 -payam3,260,5430,4780,neither,8.8140994 -payam3,5380,5730,6680,neither,9.4738672 -payam3,990,190,3210,neither,6.6699878 -payam3,820,1910,4510,left,6.098936 -payam3,610,5890,7960,right,13.1935591 -payam3,4870,4910,6980,left,79.8618806 -payam3,120,1450,1240,left,23.4519851 -payam3,6940,7090,4180,left,7.9929938 -payam3,6360,5830,6430,neither,27.7130248 -payam,6980,8150,440,left,27.8841035 -payam,580,7490,7090,neither,10.9262192 -payam,2340,8460,6140,neither,26.7986099 -payam,5860,5670,4210,neither,18.7140217 -payam,1860,7700,7940,neither,14.8113515 -payam,8140,6110,4520,neither,24.3878234 -payam,2710,300,430,neither,21.5049204 -payam,3630,3590,2680,left,23.1192547 -payam,3740,820,3660,right,5.682812 -payam,1480,2740,2400,right,12.0348692 -payam,910,7710,7630,right,7.8845036 -payam,230,1540,4340,left,7.4580693 -payam,3170,1410,490,neither,10.9175666 -payam,500,2870,260,neither,12.8639616 -payam,7930,7780,5110,right,26.1968788 
-payam,2300,1100,1090,right,8.9982824 -payam,2960,8730,4210,neither,15.1996149 -payam,4500,5370,2490,neither,20.4606212 -payam,2440,1140,6330,left,8.8519273 -payam,3930,1500,2990,right,12.3335319 -payam,4960,7920,3680,left,8.5003094 -payam,8600,6700,7470,right,8.8673901 -payam,5580,6610,1700,left,23.4709146 -payam,1320,3160,1380,right,19.3742774 -payam,8220,8660,1510,left,4.5353338 -payam,3120,6550,1760,neither,11.938804 -payam,3120,6550,1760,neither,24.8215028 -payam,3120,6550,1760,neither,32.6924222 -payam,3120,6550,1760,neither,199.0291749 -aaa,7680,780,3320,neither,13.9752481 -aaa,1170,1680,8400,right,72.1732308 -aaa,190,1570,590,neither,23.8951871 -aaa,890,190,8790,neither,15.8835947 -aaa,1110,850,1120,neither,12.2388968 -aaa,6990,70,840,neither,10.1972333 -aaa,1570,600,1040,neither,58.0329853 -payam,2610,3470,8790,neither,23.478667 -payam,2820,970,2610,neither,17.2883431 -payam,5130,3330,1970,neither,25.0759505 -payam,200,970,3200,neither,10.951438 -payam,6160,890,870,neither,13.3928642 -payam,4030,870,190,right,21.7972779 -payam,4640,3260,4480,neither,15.6459803 -payam,3750,3400,250,left,15.7365793 -payam,5820,5110,260,neither,11.9040374 -payam,4960,7920,2720,left,13.7575275 -payam,3340,5410,4670,neither,11.0850437 -payam,3220,5500,3450,neither,12.7476892 -payam,4830,3340,1750,neither,8.2270027 -payam,3280,5500,6100,neither,11.6609682 -payam,5570,3380,4830,neither,12.5343834 -payam,2170,3490,5410,left,20.2327963 -payam,1310,7950,5980,right,17.9011993 -payam,4830,8250,5060,right,12.6581358 -payam,2410,1730,1420,left,8.4628379 -payam,1610,140,1040,neither,39.4259191 -payam,2720,290,660,neither,9.004165 -payam,290,1770,1500,neither,8.7271764 -payam,2590,1750,2280,left,21.5393153 -payam,940,4110,5100,neither,14.6486691 -payam,3880,5100,8100,right,6.654937 -payam,2820,4740,3760,neither,8.9366303 -payam,2620,3800,3640,neither,32.7218192 -payam,490,3630,50,neither,15.6962632 -payam,6500,4740,3950,neither,10.4569726 -payam,3860,3960,3670,neither,5.6011954 -payam,8160,4670,3320,neither,9.9990773 -payam,5130,4670,3960,neither,10.0627506 -payam,1970,3670,6110,neither,8.9863573 -payam,5240,3800,4670,neither,35.1511656 -pp,6520,3000,3730,neither,13.2667115 -pp,3950,4670,8530,neither,10.3694821 -pp,3960,3670,1100,right,14.2283085 -pp,1080,2690,4240,left,8.0165702 -pp,4060,8590,7870,right,39.0669929 -pp,2730,860,5670,neither,21.0611187 -pp,1970,4410,2750,neither,23.8645963 -pp,2520,4410,7870,neither,7.0013341 -pp,2280,2790,2800,right,10.0719624 -pp,5730,1590,1630,neither,60.7203313 -pp,2170,2630,970,right,15.679544 -pp,7170,4640,1940,neither,12.8070448 -pp,1910,3790,3370,right,10.4345838 -pp,1940,5050,4170,neither,15.4618189 -pp,4240,6180,5690,neither,11.6433038 -pp,1360,4960,4100,right,22.1847562 -pp,2620,3610,3280,neither,16.611427 -payam,1140,1770,140,neither,32.3426861 -payam,5410,2110,3480,neither,9.2396016 -payam,6350,4540,2160,neither,15.8730432 -payam,1090,3280,5400,right,7.9992393 -payam,6870,7500,6480,left,16.2428709 -payam,4890,7850,2720,left,11.0865865 -payam,6340,6250,450,left,7.1841906 -payam,2410,1240,1360,right,3.8704583 -payam,4010,8090,2620,left,11.8036291 -payam,1940,4910,4050,right,10.9399087 -payam,8580,6390,5740,neither,19.4242683 -payam,360,6430,6520,neither,39.232415 -payam,4330,5830,6420,neither,14.8946759 -payam,7070,6430,280,neither,9.1711564 -payam,440,6390,5570,neither,16.4782124 -payam,5830,6650,3080,neither,27.4215719 -payam,6620,5790,3630,neither,7.9312582 -payam,3520,6680,5860,neither,15.903332 -payam,6680,6600,1570,neither,7.0110227 
-payam,1050,5650,5570,neither,9.8728923 -payam,4540,5740,6430,right,7.6298997 -payam,2560,1730,1030,left,64.8227024 -payam,1910,6760,7300,right,5.3704937 -payam,5730,3020,3080,right,14.8289519 -payam,4440,1750,1640,neither,17.3421493 -payam,3160,910,140,neither,7.7735092 -payam,5280,80,2260,right,10.4654106 -p,8630,3910,1190,neither,12.2734672 -p,4610,8400,6030,neither,25.0126291 -p,4250,5200,8530,right,11.2253927 -p,370,660,6480,neither,8.051555 -p,5420,150,180,right,7.0392668 -p,3360,3600,560,right,12.5866966 -p,3900,3980,3760,right,60.7114951 -p,5010,8340,6970,neither,25.5606582 -p,6210,8650,2790,neither,14.1406681 -p,640,8210,5010,neither,11.7996506 -p,3200,8170,8030,neither,88.1912972 -p,7970,8110,7150,neither,9.3730532 -p,7980,4090,8290,right,14.0371127 -p,1070,430,410,neither,9.7275574 -p,310,410,4240,left,5.8301648 -p,7030,2460,2840,right,7.3928905 -p,3830,5260,3130,neither,7.6125024 -p,3850,4450,7780,neither,15.566987 -p,3830,2120,6260,neither,15.1221466 -p,5340,4510,1340,neither,10.4645626 -p,3830,3710,8580,neither,10.3860476 -p,7090,2180,3720,neither,13.3298749 -p,3410,2120,6400,neither,6.5504196 -p,1570,1990,3880,left,16.0690818 -p,8280,7720,7760,left,19.2019649 -p,180,4850,980,right,12.3407615 -payam,4460,2880,5160,neither,31.7953241 -payam,4990,4230,1360,left,26.7975901 -payam,3690,2640,2750,neither,13.5067165 -payam,2820,2520,5540,neither,14.3831229 -payam,640,2830,2670,neither,22.702336 -payam,2640,2520,1790,right,9.7770384 -payam,2220,3330,4770,neither,17.5712949 -payam,4000,2130,4350,neither,8.9440079 -payam,2470,5370,4710,neither,25.1191583 -payam,60,4790,3780,neither,6.8597862 -payam,5290,4760,4360,neither,39.3547437 -payam,2680,5470,4730,right,9.0767636 -payam,460,1700,4490,left,9.5737932 -payam,640,4640,3240,left,9.0991829 -payam,5360,8340,7920,neither,43.4682622 -payam,8370,8320,5650,left,13.9077435 -payam,8800,6350,2520,neither,23.3640544 -pp,1270,4440,2720,neither,15.9654004 -pp,6050,5870,8630,neither,64.0813694 -pp,4960,4100,1380,left,20.2915712 -pp,2410,2030,3340,right,5.8915843 -pp,7520,6470,3160,left,4.4095175 -pp,2490,7850,5920,right,19.7446139 -pp,7870,8190,6080,left,12.2258913 -payam,6910,8130,5200,left,22.1673536 -payam,1270,1310,8550,neither,4.9356615 -payam,1200,3010,8090,neither,6.2535121 -payam,1640,1200,3010,neither,7.2112379 -payam,2160,3180,1450,neither,12.2517686 -payam,2290,2920,1450,neither,11.0051224 -payam,4620,2970,1270,neither,6.0543814 -payam,3180,1510,360,neither,7.1203205 -payam,1110,3010,1200,neither,4.9222736 -payam,500,1240,2970,neither,244.7143959 -payam,1370,2970,1540,neither,69.5212346 -payam,3010,3200,5450,neither,12.7399181 -payam,3120,1270,1240,right,7.1447759 -payam,4860,8750,5670,neither,19.6800883 -payam,190,8410,8790,right,4.0843236 -payam,4620,7980,4870,neither,11.6172416 -payam,4870,5080,1040,neither,9.651124 -payam,590,4910,4880,neither,17.2420091 -payam,340,4880,5080,neither,11.3620724 -payam,8740,4050,4020,right,71.2654567 -payam,2310,1960,6000,neither,21.7536178 -payam,6620,5740,5500,neither,6.9649316 -payam,5670,5650,5300,right,7.7041501 -payam,3040,7940,7700,neither,25.42975 -payam,7710,7780,1620,left,195.092137 -payam,7430,7320,5660,right,13.8465693 -payam,7980,2240,2230,right,13.2360946 -payam,6790,7300,3100,neither,15.0350203 -payam,2650,7200,7300,right,6.4745868 -payam,1430,1440,2920,neither,7.7909788 -payam,3060,1290,2590,left,56.5645663 -payam,6500,790,2720,neither,12.4153127 -payam,1750,1740,7740,neither,45.0387307 -payam,3860,290,1750,right,127.7812023 -payam,2900,3120,3160,left,8.5753824 
-payam,560,2300,2550,right,18.1986405 -payam,1210,6030,8530,neither,12.6016999 -payam,5130,8400,4060,right,13.3131464 -payam,1060,320,250,right,11.4700255 -payam,3250,4560,7510,left,9.6844356 -payam,2140,1010,2570,neither,19.8406119 -payam,1000,400,1450,neither,20.5022202 -payam,930,900,5540,left,9.8994706 -payam,3760,3520,1860,right,47.1336049 -payam,6140,1230,1260,left,108.9202734 -payam,590,430,310,neither,11.3733376 -payam,7880,2690,2680,neither,5.1495561 -payam,3170,2680,310,neither,8.5025821 -payam,3050,1080,410,neither,8.527219 -payam,2770,1080,7490,neither,402.8720653 -payam,3190,410,2680,neither,10.4691005 -payam,2770,410,3720,left,4.3418474 -payam,1410,2460,340,neither,12.1507813 -payam,3150,2440,2840,neither,11.3710139 -payam,2820,800,2310,neither,30.0199888 -payam,2460,710,160,neither,9.4166814 -payam,2440,30,6480,neither,7.3646978 -payam,7870,1140,340,neither,14.3368254 -payam,2930,1720,2590,neither,18.4583712 -payam,240,2450,5190,left,8.7122048 -payam,3890,2120,6000,right,17.7283225 -payam,6190,4120,6640,left,12.888971 -payam,5240,3650,5320,right,4.7470391 -payam,4330,5950,6030,left,9.3768817 -payam,1110,590,430,left,11.9393439 -payam,3000,4830,8350,neither,17.2718661 -payam,1790,2730,2750,neither,9.0715062 -payam,2820,2760,5640,neither,136.1871848 -payam,2790,2830,0,left,7.9951019 -payam,6890,3350,5370,right,10.1746638 -payam,4100,460,850,neither,15.272502 -payam,2880,190,2630,neither,11.2373856 -payam,7290,1590,460,right,328.9525274 -payam,3270,2010,590,right,42.705704 -payam,500,6020,8370,neither,10.2211368 -payam,6160,8320,2700,neither,5.8553287 -payam,2910,8510,6260,neither,20.3620241 -payam,8660,5940,6070,left,10.4425546 -payam,6670,4170,4960,right,28.4874515 -payam,1350,2030,5410,neither,8.5564246 -payam,2160,2110,2680,left,5.1389665 -payam,7520,6530,4530,neither,13.91322 -payam,7230,6670,2250,neither,8.7192474 -payam,7500,7230,8500,neither,12.1839644 -payam,6530,7500,7660,neither,13.3554725 -payam,6670,7520,7950,left,7.8109235 -payam,4440,6070,1490,neither,22.8200555 -payam,6290,4350,530,left,30.2692709 -payam,7870,8190,6080,left,18.4469568 -payam,8520,3180,2920,neither,15.4442971 -payam,1450,3180,1960,left,14.0648741 -payam,1770,8490,4180,neither,15.9577787 -payam,5060,8790,3120,neither,19.0112589 -payam,4310,8230,3100,neither,18.039143 -payam,8720,7890,5100,right,27.7411957 -payam,2620,8740,8270,neither,10.0985898 -payam,5080,4870,8800,neither,285.7399943 -payam,390,4910,4880,neither,31.1095841 -payam,4880,8270,2720,neither,6.3076338 -payam,3030,4880,8740,neither,17.0361449 -payam,4910,7980,2910,neither,8.4894469 -payam,4050,7980,1710,neither,6.3573893 -payam,4910,4880,5450,neither,4.4998203 -payam,1750,4050,8740,right,96.8739301 -payam,6650,6620,5540,neither,11.0029129 -payam,5740,5830,3310,neither,12.8770997 -payam,6390,6560,1970,neither,40.2096949 +payam,8720,1210,2930,neither,24.5149699 +payam,960,6830,7040,right,19.103924 +payam,2400,7930,7770,right,12.2262586 +payam,4850,2520,2830,neither,15.1091679 +payam,740,2820,2750,neither,18.6786855 +payam,1840,1090,7750,left,7.826561 +payam,7290,6890,6760,left,18.6590548 +payam,910,7710,7630,neither,15.9142669 +payam,110,3080,3020,right,19.3148321 +payam,4310,8430,6020,right,18.3946849 +payam,2370,6450,6640,neither,8.0961591 +payam,3760,3670,8090,left,35.6075189 +payam,7290,6890,6760,left,6.1512555 +payam,2330,1610,4290,neither,9.3917529 +payam,4960,1360,6940,neither,17.0703494 +payam,4880,2220,4050,neither,6.66984 +payam,4150,6320,5120,left,11.1385341 +payam,2380,2290,3830,neither,8.175581 
+payam,6990,4350,7850,right,24.8897172 +payam,1260,1300,1150,left,8.7496713 +payam,4040,8150,270,neither,6.5148909 +payam,8210,4200,540,neither,9.3981568 +payam,1840,1090,7750,left,3.4069544 +payam,80,1770,5440,neither,6.8139126 +payam,4740,6380,6320,left,15.6245139 +payam,350,1710,820,neither,6.9307515 +payam,7980,380,440,right,6.9549693 +payam,4710,4520,6560,left,15.8368305 +payam,4150,1030,5450,right,5.8812649 +payam,4090,8210,1650,left,9.3473628 +payam,3160,3050,3300,right,15.4507672 +payam,3700,5340,4990,neither,19.4590834 +payam,5790,3250,4560,right,5.8536474 +payam,2820,2730,410,left,8.6303164 +payam,4180,6270,6780,neither,10.6426271 +payam,3450,3010,3200,neither,9.5873883 +payam,1480,2740,2400,neither,7.4411407 +payam,5200,140,2280,neither,10.3224914 +payam,7920,5360,5370,right,12.6515 +payam,8140,6670,6500,neither,8.3602584 +payam,1880,1810,5430,right,23.2110811 +payam,3190,1380,3600,left,13.4339459 +payam,50,4340,5020,neither,21.3263438 +payam,8350,8660,720,neither,5.8660808 +payam,3380,3280,1440,neither,12.1587357 +payam,1990,1450,3180,right,11.0167687 +payam,5220,3470,2030,right,17.4336008 +payam,3030,1430,7950,neither,5.1681943 +payam,2070,2690,1080,right,7.0990855 +payam,5270,760,2620,right,7.8710579 +payam,1500,5160,5060,right,13.7366622 +payam,3720,5320,980,right,3.9487614 +payam,6360,5830,6430,right,16.9997513 +payam,8100,2890,2200,neither,11.3478833 +payam,8190,8240,5770,left,9.2111841 +payam,6660,4820,3920,right,7.2671531 +payam,1380,1480,6320,left,8.1044273 +payam,2920,6550,6590,right,4.2769958 +payam,8030,5080,8270,left,8.6034509 +payam,7140,7510,4590,neither,16.6477046 +payam,2820,6050,8100,left,24.6404202 +payam,8760,7440,5770,right,13.7314397 +payam,1500,3220,670,left,11.3452669 +payam,7760,7550,5210,neither,6.2482816 +payam,1480,3220,1500,right,12.5323615 +payam,3370,5490,770,neither,7.1390339 +payam,7810,1860,900,right,10.9310926 +payam,1480,60,1000,left,22.48273 +payam,880,370,100,right,6.3977794 +payam,6530,6470,8270,left,23.1602921 +payam,7870,5570,5710,neither,10.2806256 +payam,6440,5560,1420,left,6.4094853 +payam,3080,3090,5170,left,21.2385869 +payam,2360,3630,3810,neither,8.7701795 +payam,490,1260,1220,neither,6.672139 +payam,2290,610,2460,left,6.049926 +payam,3380,3280,1440,neither,20.8177628 +payam,2660,6610,6450,left,42.8382073 +payam,5880,2790,2520,right,9.3886082 +payam,6520,5860,5480,neither,7.0841901 +payam,3640,2700,1130,left,32.19488 +payam,2800,6870,6710,neither,11.8798787 +payam,740,2820,2750,neither,9.5489998 +payam,7590,7770,6800,neither,19.0458094 +payam,4240,2560,270,right,12.5144577 +payam,4170,8630,3110,neither,9.0806503 +payam,4870,6970,6740,neither,16.2745591 +payam,6840,6060,6290,neither,6.9093279 +payam,90,3320,4740,neither,9.6742201 +payam,7140,7510,4590,left,22.7795439 +payam,5010,2290,7770,neither,19.3744291 +payam,5960,6240,8600,right,9.0793041 +payam,1250,6260,7880,right,4.93853 +payam,4270,1320,1350,right,17.2790887 +payam,4670,1250,3220,neither,10.618701 +payam,1480,2740,2400,neither,7.0960476 +payam,4270,1320,1350,right,4.4593416 +payam,7500,6670,450,neither,6.8258252 +payam,2330,3690,3800,neither,9.1139991 +payam,6890,3890,3700,neither,4.0416935 +payam,2160,8690,5990,neither,11.627078 +payam,5230,5350,5200,left,12.7237877 +payam,5410,5190,5000,neither,14.5101507 +payam,8310,8680,3050,left,7.9895167 +payam,1600,400,980,neither,10.7849002 +payam,170,2250,1110,right,8.0488411 +payam,540,2300,1470,left,15.8741313 +payam,6520,1530,2860,neither,7.0883511 +payam,5640,5810,4930,neither,7.4146026 +payam,3220,6620,5720,right,7.7339321 
+test,30,3890,1900,right,22.9397707 +test,8530,4240,8100,neither,5.9101173 +test,7420,150,560,neither,137.2592382 +test,2130,5290,6180,neither,10.5838116 +test,1370,1510,1710,neither,6.7253406 +test,2750,1950,3380,right,17.9191102 +test,6460,2490,2850,neither,29.0007036 +test,3030,1430,7950,neither,3.9712144 +test,1390,7990,5990,right,42.9268688 +Payam,7010,7880,1480,neither,29.2939266 +Payam,3670,4380,1690,neither,13.5852305 +payam3,510,3840,2050,neither,19.60391 +payam3,3120,5240,400,neither,7.5961915 +payam3,6610,1980,5910,neither,32.5094664 +payam3,2790,6640,7410,neither,6.8419322 +payam3,2500,2680,520,neither,4.9135896 +payam3,3000,6310,6340,right,15.3105667 +payam3,3430,5560,80,neither,7.3489193 +payam3,5430,6460,570,neither,18.0880914 +payam3,5710,2830,2520,neither,11.3542066 +payam3,8740,1560,590,neither,9.5529691 +payam3,260,5430,4780,neither,8.8140994 +payam3,5380,5730,6680,neither,9.4738672 +payam3,990,190,3210,neither,6.6699878 +payam3,820,1910,4510,left,6.098936 +payam3,610,5890,7960,right,13.1935591 +payam3,4870,4910,6980,left,79.8618806 +payam3,120,1450,1240,left,23.4519851 +payam3,6940,7090,4180,left,7.9929938 +payam3,6360,5830,6430,neither,27.7130248 +payam,6980,8150,440,left,27.8841035 +payam,580,7490,7090,neither,10.9262192 +payam,2340,8460,6140,neither,26.7986099 +payam,5860,5670,4210,neither,18.7140217 +payam,1860,7700,7940,neither,14.8113515 +payam,8140,6110,4520,neither,24.3878234 +payam,2710,300,430,neither,21.5049204 +payam,3630,3590,2680,left,23.1192547 +payam,3740,820,3660,right,5.682812 +payam,1480,2740,2400,right,12.0348692 +payam,910,7710,7630,right,7.8845036 +payam,230,1540,4340,left,7.4580693 +payam,3170,1410,490,neither,10.9175666 +payam,500,2870,260,neither,12.8639616 +payam,7930,7780,5110,right,26.1968788 +payam,2300,1100,1090,right,8.9982824 +payam,2960,8730,4210,neither,15.1996149 +payam,4500,5370,2490,neither,20.4606212 +payam,2440,1140,6330,left,8.8519273 +payam,3930,1500,2990,right,12.3335319 +payam,4960,7920,3680,left,8.5003094 +payam,8600,6700,7470,right,8.8673901 +payam,5580,6610,1700,left,23.4709146 +payam,1320,3160,1380,right,19.3742774 +payam,8220,8660,1510,left,4.5353338 +payam,3120,6550,1760,neither,11.938804 +payam,3120,6550,1760,neither,24.8215028 +payam,3120,6550,1760,neither,32.6924222 +payam,3120,6550,1760,neither,199.0291749 +aaa,7680,780,3320,neither,13.9752481 +aaa,1170,1680,8400,right,72.1732308 +aaa,190,1570,590,neither,23.8951871 +aaa,890,190,8790,neither,15.8835947 +aaa,1110,850,1120,neither,12.2388968 +aaa,6990,70,840,neither,10.1972333 +aaa,1570,600,1040,neither,58.0329853 +payam,2610,3470,8790,neither,23.478667 +payam,2820,970,2610,neither,17.2883431 +payam,5130,3330,1970,neither,25.0759505 +payam,200,970,3200,neither,10.951438 +payam,6160,890,870,neither,13.3928642 +payam,4030,870,190,right,21.7972779 +payam,4640,3260,4480,neither,15.6459803 +payam,3750,3400,250,left,15.7365793 +payam,5820,5110,260,neither,11.9040374 +payam,4960,7920,2720,left,13.7575275 +payam,3340,5410,4670,neither,11.0850437 +payam,3220,5500,3450,neither,12.7476892 +payam,4830,3340,1750,neither,8.2270027 +payam,3280,5500,6100,neither,11.6609682 +payam,5570,3380,4830,neither,12.5343834 +payam,2170,3490,5410,left,20.2327963 +payam,1310,7950,5980,right,17.9011993 +payam,4830,8250,5060,right,12.6581358 +payam,2410,1730,1420,left,8.4628379 +payam,1610,140,1040,neither,39.4259191 +payam,2720,290,660,neither,9.004165 +payam,290,1770,1500,neither,8.7271764 +payam,2590,1750,2280,left,21.5393153 +payam,940,4110,5100,neither,14.6486691 
+payam,3880,5100,8100,right,6.654937 +payam,2820,4740,3760,neither,8.9366303 +payam,2620,3800,3640,neither,32.7218192 +payam,490,3630,50,neither,15.6962632 +payam,6500,4740,3950,neither,10.4569726 +payam,3860,3960,3670,neither,5.6011954 +payam,8160,4670,3320,neither,9.9990773 +payam,5130,4670,3960,neither,10.0627506 +payam,1970,3670,6110,neither,8.9863573 +payam,5240,3800,4670,neither,35.1511656 +pp,6520,3000,3730,neither,13.2667115 +pp,3950,4670,8530,neither,10.3694821 +pp,3960,3670,1100,right,14.2283085 +pp,1080,2690,4240,left,8.0165702 +pp,4060,8590,7870,right,39.0669929 +pp,2730,860,5670,neither,21.0611187 +pp,1970,4410,2750,neither,23.8645963 +pp,2520,4410,7870,neither,7.0013341 +pp,2280,2790,2800,right,10.0719624 +pp,5730,1590,1630,neither,60.7203313 +pp,2170,2630,970,right,15.679544 +pp,7170,4640,1940,neither,12.8070448 +pp,1910,3790,3370,right,10.4345838 +pp,1940,5050,4170,neither,15.4618189 +pp,4240,6180,5690,neither,11.6433038 +pp,1360,4960,4100,right,22.1847562 +pp,2620,3610,3280,neither,16.611427 +payam,1140,1770,140,neither,32.3426861 +payam,5410,2110,3480,neither,9.2396016 +payam,6350,4540,2160,neither,15.8730432 +payam,1090,3280,5400,right,7.9992393 +payam,6870,7500,6480,left,16.2428709 +payam,4890,7850,2720,left,11.0865865 +payam,6340,6250,450,left,7.1841906 +payam,2410,1240,1360,right,3.8704583 +payam,4010,8090,2620,left,11.8036291 +payam,1940,4910,4050,right,10.9399087 +payam,8580,6390,5740,neither,19.4242683 +payam,360,6430,6520,neither,39.232415 +payam,4330,5830,6420,neither,14.8946759 +payam,7070,6430,280,neither,9.1711564 +payam,440,6390,5570,neither,16.4782124 +payam,5830,6650,3080,neither,27.4215719 +payam,6620,5790,3630,neither,7.9312582 +payam,3520,6680,5860,neither,15.903332 +payam,6680,6600,1570,neither,7.0110227 +payam,1050,5650,5570,neither,9.8728923 +payam,4540,5740,6430,right,7.6298997 +payam,2560,1730,1030,left,64.8227024 +payam,1910,6760,7300,right,5.3704937 +payam,5730,3020,3080,right,14.8289519 +payam,4440,1750,1640,neither,17.3421493 +payam,3160,910,140,neither,7.7735092 +payam,5280,80,2260,right,10.4654106 +p,8630,3910,1190,neither,12.2734672 +p,4610,8400,6030,neither,25.0126291 +p,4250,5200,8530,right,11.2253927 +p,370,660,6480,neither,8.051555 +p,5420,150,180,right,7.0392668 +p,3360,3600,560,right,12.5866966 +p,3900,3980,3760,right,60.7114951 +p,5010,8340,6970,neither,25.5606582 +p,6210,8650,2790,neither,14.1406681 +p,640,8210,5010,neither,11.7996506 +p,3200,8170,8030,neither,88.1912972 +p,7970,8110,7150,neither,9.3730532 +p,7980,4090,8290,right,14.0371127 +p,1070,430,410,neither,9.7275574 +p,310,410,4240,left,5.8301648 +p,7030,2460,2840,right,7.3928905 +p,3830,5260,3130,neither,7.6125024 +p,3850,4450,7780,neither,15.566987 +p,3830,2120,6260,neither,15.1221466 +p,5340,4510,1340,neither,10.4645626 +p,3830,3710,8580,neither,10.3860476 +p,7090,2180,3720,neither,13.3298749 +p,3410,2120,6400,neither,6.5504196 +p,1570,1990,3880,left,16.0690818 +p,8280,7720,7760,left,19.2019649 +p,180,4850,980,right,12.3407615 +payam,4460,2880,5160,neither,31.7953241 +payam,4990,4230,1360,left,26.7975901 +payam,3690,2640,2750,neither,13.5067165 +payam,2820,2520,5540,neither,14.3831229 +payam,640,2830,2670,neither,22.702336 +payam,2640,2520,1790,right,9.7770384 +payam,2220,3330,4770,neither,17.5712949 +payam,4000,2130,4350,neither,8.9440079 +payam,2470,5370,4710,neither,25.1191583 +payam,60,4790,3780,neither,6.8597862 +payam,5290,4760,4360,neither,39.3547437 +payam,2680,5470,4730,right,9.0767636 +payam,460,1700,4490,left,9.5737932 +payam,640,4640,3240,left,9.0991829 
+payam,5360,8340,7920,neither,43.4682622 +payam,8370,8320,5650,left,13.9077435 +payam,8800,6350,2520,neither,23.3640544 +pp,1270,4440,2720,neither,15.9654004 +pp,6050,5870,8630,neither,64.0813694 +pp,4960,4100,1380,left,20.2915712 +pp,2410,2030,3340,right,5.8915843 +pp,7520,6470,3160,left,4.4095175 +pp,2490,7850,5920,right,19.7446139 +pp,7870,8190,6080,left,12.2258913 +payam,6910,8130,5200,left,22.1673536 +payam,1270,1310,8550,neither,4.9356615 +payam,1200,3010,8090,neither,6.2535121 +payam,1640,1200,3010,neither,7.2112379 +payam,2160,3180,1450,neither,12.2517686 +payam,2290,2920,1450,neither,11.0051224 +payam,4620,2970,1270,neither,6.0543814 +payam,3180,1510,360,neither,7.1203205 +payam,1110,3010,1200,neither,4.9222736 +payam,500,1240,2970,neither,244.7143959 +payam,1370,2970,1540,neither,69.5212346 +payam,3010,3200,5450,neither,12.7399181 +payam,3120,1270,1240,right,7.1447759 +payam,4860,8750,5670,neither,19.6800883 +payam,190,8410,8790,right,4.0843236 +payam,4620,7980,4870,neither,11.6172416 +payam,4870,5080,1040,neither,9.651124 +payam,590,4910,4880,neither,17.2420091 +payam,340,4880,5080,neither,11.3620724 +payam,8740,4050,4020,right,71.2654567 +payam,2310,1960,6000,neither,21.7536178 +payam,6620,5740,5500,neither,6.9649316 +payam,5670,5650,5300,right,7.7041501 +payam,3040,7940,7700,neither,25.42975 +payam,7710,7780,1620,left,195.092137 +payam,7430,7320,5660,right,13.8465693 +payam,7980,2240,2230,right,13.2360946 +payam,6790,7300,3100,neither,15.0350203 +payam,2650,7200,7300,right,6.4745868 +payam,1430,1440,2920,neither,7.7909788 +payam,3060,1290,2590,left,56.5645663 +payam,6500,790,2720,neither,12.4153127 +payam,1750,1740,7740,neither,45.0387307 +payam,3860,290,1750,right,127.7812023 +payam,2900,3120,3160,left,8.5753824 +payam,560,2300,2550,right,18.1986405 +payam,1210,6030,8530,neither,12.6016999 +payam,5130,8400,4060,right,13.3131464 +payam,1060,320,250,right,11.4700255 +payam,3250,4560,7510,left,9.6844356 +payam,2140,1010,2570,neither,19.8406119 +payam,1000,400,1450,neither,20.5022202 +payam,930,900,5540,left,9.8994706 +payam,3760,3520,1860,right,47.1336049 +payam,6140,1230,1260,left,108.9202734 +payam,590,430,310,neither,11.3733376 +payam,7880,2690,2680,neither,5.1495561 +payam,3170,2680,310,neither,8.5025821 +payam,3050,1080,410,neither,8.527219 +payam,2770,1080,7490,neither,402.8720653 +payam,3190,410,2680,neither,10.4691005 +payam,2770,410,3720,left,4.3418474 +payam,1410,2460,340,neither,12.1507813 +payam,3150,2440,2840,neither,11.3710139 +payam,2820,800,2310,neither,30.0199888 +payam,2460,710,160,neither,9.4166814 +payam,2440,30,6480,neither,7.3646978 +payam,7870,1140,340,neither,14.3368254 +payam,2930,1720,2590,neither,18.4583712 +payam,240,2450,5190,left,8.7122048 +payam,3890,2120,6000,right,17.7283225 +payam,6190,4120,6640,left,12.888971 +payam,5240,3650,5320,right,4.7470391 +payam,4330,5950,6030,left,9.3768817 +payam,1110,590,430,left,11.9393439 +payam,3000,4830,8350,neither,17.2718661 +payam,1790,2730,2750,neither,9.0715062 +payam,2820,2760,5640,neither,136.1871848 +payam,2790,2830,0,left,7.9951019 +payam,6890,3350,5370,right,10.1746638 +payam,4100,460,850,neither,15.272502 +payam,2880,190,2630,neither,11.2373856 +payam,7290,1590,460,right,328.9525274 +payam,3270,2010,590,right,42.705704 +payam,500,6020,8370,neither,10.2211368 +payam,6160,8320,2700,neither,5.8553287 +payam,2910,8510,6260,neither,20.3620241 +payam,8660,5940,6070,left,10.4425546 +payam,6670,4170,4960,right,28.4874515 +payam,1350,2030,5410,neither,8.5564246 +payam,2160,2110,2680,left,5.1389665 
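The rows above and below follow the six-field layout that load_gesture_data (later in this patch, in lmdb_data_loader.py) consumes: an annotator/session tag, three sample indices (left, middle, right), a left/right/neither judgement of which side better matches the middle reference sample, and a trailing float that the loader does not read. A minimal sketch of that mapping, not part of the patch itself:

def row_to_pairs(line):
    # Mirrors the parsing in load_gesture_data: fields 1-3 are sample indices,
    # field 4 is the judgement; fields 0 and 5 are ignored by the loader.
    fields = line.strip().split(",")
    left, middle, right = int(fields[1]), int(fields[2]), int(fields[3])
    label = fields[4]
    if label == "neither":
        # neither side matches the reference: two dissimilar (0) pairs
        return [[right, middle, 0], [left, middle, 0]]
    if label == "right":
        return [[right, middle, 1]]   # right sample judged similar to the reference
    if label == "left":
        return [[left, middle, 1]]
    return []

# row_to_pairs("payam,6990,4350,7850,right,24.8897172") -> [[7850, 4350, 1]]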
+payam,7520,6530,4530,neither,13.91322 +payam,7230,6670,2250,neither,8.7192474 +payam,7500,7230,8500,neither,12.1839644 +payam,6530,7500,7660,neither,13.3554725 +payam,6670,7520,7950,left,7.8109235 +payam,4440,6070,1490,neither,22.8200555 +payam,6290,4350,530,left,30.2692709 +payam,7870,8190,6080,left,18.4469568 +payam,8520,3180,2920,neither,15.4442971 +payam,1450,3180,1960,left,14.0648741 +payam,1770,8490,4180,neither,15.9577787 +payam,5060,8790,3120,neither,19.0112589 +payam,4310,8230,3100,neither,18.039143 +payam,8720,7890,5100,right,27.7411957 +payam,2620,8740,8270,neither,10.0985898 +payam,5080,4870,8800,neither,285.7399943 +payam,390,4910,4880,neither,31.1095841 +payam,4880,8270,2720,neither,6.3076338 +payam,3030,4880,8740,neither,17.0361449 +payam,4910,7980,2910,neither,8.4894469 +payam,4050,7980,1710,neither,6.3573893 +payam,4910,4880,5450,neither,4.4998203 +payam,1750,4050,8740,right,96.8739301 +payam,6650,6620,5540,neither,11.0029129 +payam,5740,5830,3310,neither,12.8770997 +payam,6390,6560,1970,neither,40.2096949 diff --git a/scripts/data_loader/lmdb_data_loader.py b/scripts/data_loader/lmdb_data_loader.py index a11fd16..956782d 100644 --- a/scripts/data_loader/lmdb_data_loader.py +++ b/scripts/data_loader/lmdb_data_loader.py @@ -1,1323 +1,1323 @@ -""" -""" - - -from __future__ import annotations -import logging -import os -import pickle -import random -from typing import Tuple - -import numpy as np -import lmdb as lmdb -import torch -from torch.nn.utils.rnn import pad_sequence -from torch.utils.data import Dataset -from torch.utils.data.dataloader import default_collate -from data_loader.data_preprocessor import DataPreprocessor -import pyarrow -from configargparse import argparse -from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering - -from model.vocab import Vocab -import utils - -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - -def word_seq_collate_fn( - data: list, -) -> ( - Tuple[ - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - dict, - torch.Tensor, - torch.Tensor, - torch.Tensor, - ] - | Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, dict] -): - """Collate function for loading word sequences in variable lengths. - - This function returns extra values depending if sentence_leve is True or False. - sentence_leve is set to always True and must be manually changed. - - Args: - data: A list of samples including the following values in each element: - word_seq: A list of Tensors of word vector representations. - words_lengths: A list of Tensors of word lengths in word_seq. - poses_seq: A list of Tensors of poses/gestures. - audio: A list of Tensors of audio data. - - Returns: - A 5-Tuple or 8-Tuple: - - 8-Tuple (if sentence_level is set to True): - word_seq: A Tensor of word vector representations. - words_lengths: A Tensor of word lengths in word_seq. - poses_seq: A Tensor of poses/gestures. - audio: A Tensor of audio data. - aux_info: A dict containing info such as name, start/end frames and times. - sentence_leve_latents: #TODO - cluster_portion: #TODO - GPT3_Embedding: #TODO - - 5-Tuple: - The first five values in the 8-Tuple. 
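As a usage note (illustrative only, not part of the patch): word_seq_collate_fn is meant to be passed as the collate_fn of a PyTorch DataLoader so that variable-length word sequences are padded per batch. train_dataset below is a placeholder, assumed to be a TrinityDataset_sentencelevel instance (defined later in this file), which yields the seven-field samples this collate expects while sentence_leve is hard-coded to True.

from torch.utils.data import DataLoader

loader = DataLoader(
    train_dataset,                      # assumed: a TrinityDataset_sentencelevel instance
    batch_size=32,
    shuffle=True,
    collate_fn=word_seq_collate_fn,     # pads word_seq and collates the remaining fields
)
for (word_seq, words_lengths, poses_seq, audio, aux_info,
     sentence_leve_latents, cluster_portion, gpt3_embedding) in loader:
    break                               # one padded batch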
- """ - # sort a list by sequence length (descending order) to use pack_padded_sequence - data.sort(key=lambda x: len(x[0]), reverse=True) - - # separate source and target sequences - # Todo: fix this using args - sentence_leve = True - if not sentence_leve: - ( - word_seq, - poses_seq, - audio, - aux_info, - ) = zip(*data) - else: - ( - word_seq, - poses_seq, - audio, - aux_info, - sentence_leve_latents, - cluster_portion, - GPT3_Embedding, - ) = zip(*data) - - # merge sequences - words_lengths = torch.LongTensor([len(x) for x in word_seq]) - word_seq = pad_sequence(word_seq, batch_first=True).long() - - poses_seq = default_collate(poses_seq) - audio = default_collate(audio) - if sentence_leve: - sentence_leve_latents = default_collate(sentence_leve_latents) - cluster_portion = default_collate(cluster_portion) - GPT3_Embedding = default_collate(GPT3_Embedding) - - # audio = default_collate(audio) - aux_info = {key: default_collate([d[key] for d in aux_info]) for key in aux_info[0]} - - if sentence_leve: - return ( - word_seq, - words_lengths, - poses_seq, - audio, - aux_info, - sentence_leve_latents, - cluster_portion, - GPT3_Embedding, - ) - else: - return word_seq, words_lengths, poses_seq, audio, aux_info - - -class TrinityDataset(Dataset): - """Contains information and associated parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - Attributes: - lmdb_dir: A string representing the filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb ('Lightning Memory-Mapped' Database) object loaded from .mdb files located at the lmdb_dir. - n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An int representing the number of clips/entries in the original dataset. - lang_model: A pre-trained (English) language vector representation contained in the 'Vocab' custom class. - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video in the original dataset. - """ - - def __init__( - self, - args: argparse.Namespace, - lmdb_dir: str, - n_poses: int, - subdivision_stride: int, - pose_resampling_fps: int, - data_mean: list[float], - data_std: list[float], - ): - """Initialize with multiple dataset parameters. - - The args argument must contain the following keys: - name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). - rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. - autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. - sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). - - Args: - args: A configargparse object containing parameters (See above). - lmdb_dir: A string representing the filepath of the directory containing the actual dataset. - n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). 
- subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). - pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video in the original dataset. - """ - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - - logging.info("Reading data '{}'...".format(lmdb_dir)) - preloaded_dir = lmdb_dir + "_cache" - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - args, - lmdb_dir, - preloaded_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - sentence_level=False, - ) - data_sampler.run() - else: - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - def __len__(self) -> int: - """Get the size of the dataset. - - Returns: - The number of samples in the original dataset. - """ - return self.n_samples - - def __getitem__( - self, idx: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]: - """Get an item at a specific index in the dataset. - - Retrieves a specific entry in the original dataset and - associated information of that entry such as words, audio, poses, etc. - - Args: - idx: The index of the item to retrieve. - - Returns: - A 4-Tuple: - word_seq_tensor: A Tensor of word vector representations. - pose_seq: A Tensor of pose/gesture data. - audio: A Tensor of audio data. - aux_info: A dict containing the string keys: - 'vid': A string name for the information. - 'start_frame_no': An integer of the start index. - 'end_frame_no': An integer of the end index. - 'start_time': A float of the start time of the clip. - 'end_time': A float of the end time of the clip. - """ - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info = sample - - def words_to_tensor(lang, words, end_time=None): - indexes = [lang.SOS_token] - for word in words: - if end_time is not None and word[1] > end_time: - break - indexes.append(lang.get_word_index(word[0])) - indexes.append(lang.EOS_token) - return torch.Tensor(indexes).long() - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq: torch.Tensor = (pose_seq - self.data_mean) / std - - # to tensors - word_seq_tensor = words_to_tensor( - self.lang_model, word_seq, aux_info["end_time"] - ) - pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() - audio = torch.from_numpy(audio).float() - - return word_seq_tensor, pose_seq, audio, aux_info - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. 
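One practical consequence of the code above (a sketch under assumed inputs, not part of the patch): __getitem__ builds the word tensor through self.lang_model, which stays None until set_lang_model is called, so the vocabulary has to be attached before the dataset is indexed or wrapped in a DataLoader. args, lmdb_dir, data_mean, data_std and lang_model are placeholders here, and the numeric arguments are illustrative values only.

dataset = TrinityDataset(args, lmdb_dir,
                         n_poses=30, subdivision_stride=10, pose_resampling_fps=20,
                         data_mean=data_mean, data_std=data_std)
dataset.set_lang_model(lang_model)      # a model.vocab.Vocab instance; required before indexing
word_seq, pose_seq, audio, aux_info = dataset[0]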
- """ - self.lang_model = lang_model - - -class TrinityDataset_DAE(Dataset): - """Contains information and parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - This class is focussed on the pose/gesture data only. - - Attributes: - lmdb_dir: A string filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An integer number of clips/entries in the original dataset. - lang_model: A 'Vocab' pre-trained word vector representation. - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video. - all_poses: A list where each element is a dict containing string keys: - 'original': A Tensor of each pose/gesture data. - 'noisy': A Tensor with the same values as 'original'. - """ - - def __init__( - self, - args: argparse.Namespace, - lmdb_dir: str, - n_poses: int, - subdivision_stride: int, - pose_resampling_fps: int, - data_mean: list[float], - data_std: list[float], - ): - """Initialize with dataset location and several parameters. - - The args argument must contain the following keys: - name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). - rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. - autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. - sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). - - Args: - args: A configargparse object containing parameters (See above). - lmdb_dir: A string representing the filepath of the directory containing the actual dataset. - n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). - pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video in the original dataset. 
- """ - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - - logging.info("Reading data '{}'...".format(lmdb_dir)) - preloaded_dir = lmdb_dir + "_cache" - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - args, - lmdb_dir, - preloaded_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - ) - data_sampler.run() - else: - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - # Initiate all poses - self.all_poses = [] - self.create_all_poses() - print("data init finished!") - - def __len__(self) -> int: - """Get the size of the dataset. - - Returns: - The number of samples in the dataset. - """ - # return 500 - # return (self.n_samples * self.n_poses) //5 - return self.n_samples - - def create_all_poses(self) -> None: - """Move all poses/gestures from database into object. - - Modifies the internal state of the object at attribute 'all_poses'. - """ - with self.lmdb_env.begin(write=False) as txn: - for i in range(self.n_samples): - key = "{:010}".format(i).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info = sample - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - for j in range(0, len(pose_seq)): - original = pose_seq[j, :] - var_coef = 1 # std - sigma = 1 - # noisy = self.add_noise(original, 0.0, std) - noisy = original # dropout layer adds noise instead - self.all_poses.append({"original": original, "noisy": noisy}) - - def get_item_Memory_Efficient(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Get the item at a specific index in the dataset. - - This method improves memory performance by returning - the pose/gesture data of the item only. - - Args: - idx: An integer index of the item to get. - - Returns: - A 2-Tuple: - noisy: A Tensor of pose/gesture data. - original: A Tensor with the same values as 'noisy'. - """ - idx_lmdb = idx // self.n_poses - - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx_lmdb).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info = sample - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - original = pose_seq[idx % self.n_poses, :] - noisy = original - - original = ( - torch.from_numpy(original).reshape((original.shape[0], -1)).float() - ) - noisy = torch.from_numpy(noisy).reshape((noisy.shape[0], -1)).float() - - return noisy, original - - def add_noise( - self, x: np.ndarray, variance_multiplier: np.ndarray, sigma: np.ndarray - ) -> np.ndarray: - """Add Gaussian noise to the input data. - - The noise added is based on the variance_multiplier and sigma array values. - - Args: - x: A numeric numpy array of data. - variance_multiplier: A numeric numpy array of coefficients to multiple variance of the noise on - sigma: A numeric numpy array of the variance of the dataset - - Returns: - A numeric numpy array of the data with noise added. 
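For reference (not part of the patch), the index arithmetic used by get_item_Memory_Efficient above: the dataset is treated as a flat sequence of frames, and a flat frame index is split into an lmdb clip key and a frame offset inside that clip. The numbers are hypothetical.

n_poses = 30                 # frames per clip, as in the docstrings above
idx = 95                     # hypothetical flat frame index
idx_lmdb = idx // n_poses    # 3 -> the 4th clip stored in the lmdb cache
frame = idx % n_poses        # 5 -> the 6th frame of that clip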
- """ - eps = 1e-15 - noise = np.random.normal( - 0.0, np.multiply(sigma, variance_multiplier) + eps, x.shape - ) - x = x + noise - return x - - def add_noise2(self, x: np.ndarray, prob: float) -> np.ndarray: - """Add Gaussian noise to the input data. - - The noise is added by zeroing the specific elements. - - Args: - x: A numeric numpy array of data. - prob: A float percentage of adding noise to a single element. - - Returns: - A numeric numpy array of the data with noise added. - """ - for i in range(len(x)): - rnd = random.random() - if rnd < prob: - x[i] = 0 - return x - - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Get the item at a specific index in the dataset. - - This method returns the data exactly as stored. The 'noisy' data does - not have noise added but assumes that noise has previously been added. - - Args: - idx: An integer index of the item to get. - - Returns: - A 2-Tuple: - noisy: A Tensor of pose/gesture data (with noise). - original: A Tensor of pose/gesture data. - """ - # Keep and document for only large datasets - # Todo: I think this way is more efficient than the prev. approach - # I need to double check and make sure everything works as before. - # return self.get_item_Memory_Efficient(idx) - # original, noisy = self.get_item_Memory_Efficient(idx) - # return noisy, original - - original = self.all_poses[idx]["original"] - noisy = self.all_poses[idx]["noisy"] - - original = torch.from_numpy(original).reshape((original.shape[0], -1)).float() - noisy = torch.from_numpy(noisy).reshape((noisy.shape[0], -1)).float() - - return noisy, original - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. - """ - self.lang_model = lang_model - - -class TrinityDataset_DAEed_Autoencoder(Dataset): - """Contains information and parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - This class is #TODO. - - Attributes: - lmdb_dir: A string filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An integer number of clips/entries in the original dataset. - lang_model: A 'Vocab' pre-trained word vector representation or None. - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video. - pairwise_enabled: #TODO - use_derivative: Boolean to stack gradients to data during training. - encoded_labeled_poses: #TODO - rep_learning_dim: An integer dimension of the model (unused). - rep_learning_checkpoint: A string filepath to saved DAE model checkpoints. - rep_model: A DAE neural net model loaded from the above checkpoint. - Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. 
- """ - - def __init__( - self, - args: argparse.Namespace, - lmdb_dir: str, - n_poses: int, - subdivision_stride: int, - pose_resampling_fps: int, - data_mean: list[float], - data_std: list[float], - ): - """Initialize with dataset location and several parameters. - - The args argument must contain the following keys: - name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). - rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. - autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. - sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). - rep_learning_checkpoint: A string filepath to saved model checkpoints. - use_derivative: A boolean whether to use derivatives. - - Args: - args: A configargparse object containing parameters (See above). - lmdb_dir: A string representing the filepath of the directory containing the actual dataset. - n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). - pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video in the original dataset. - """ - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model: Vocab | None = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - - # We will change it to true once we call the creat_similarity_dataset - self.pairwise_enabled: bool = False - self.use_derivative: bool = args.use_derivative == "True" - - logging.info("Reading data '{}'...".format(lmdb_dir)) - preloaded_dir = lmdb_dir + "_cache" - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - args, - lmdb_dir, - preloaded_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - ) - data_sampler.run() - else: - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - # Todo: we need to initiate pre-trained representation learning model - checkpoint_path: str = args.rep_learning_checkpoint - self.rep_learning_dim: int = args.rep_learning_dim - ( - rep_learning_args, - rep_model, - rep_loss_fn, - rep_lang_model, - rep_out_dim, - ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") - self.rep_model: torch.nn.Module = rep_model.to("cpu") - self.rep_model.train(False) - - def __len__(self) -> int: - """Get the number of samples in the dataset. - - Returns: - The integer size of samples in the dataset. - """ - return self.n_samples - - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Get the item at a specific index in the dataset. - - Args: - idx: An integer index of the item to get. - - Returns: - A 2-Tuple: - encoded_poses: A Tensor of pose/gesture data. - encoded_poses: The same tensor as above. 
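A minimal sketch (not part of the patch) of the use_derivative branch in __getitem__, which continues below: frame-wise differences are prepended with a zero row and stacked next to the DAE latents, doubling the feature dimension. The tensor sizes are assumed for illustration.

import torch

encoded_poses = torch.randn(30, 40)                  # (n_frames, dae_latent_dim), assumed sizes
diff = encoded_poses[1:] - encoded_poses[:-1]        # same values as the list comprehension below
diff = torch.cat([torch.zeros(1, encoded_poses.shape[1]), diff], dim=0)
encoded_poses = torch.hstack((encoded_poses, diff))  # shape becomes (30, 80)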
- """ - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx).encode("ascii") - sample = txn.get(key) - - sample: Tuple[ - np.ndarray, np.ndarray, np.ndarray, dict - ] = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info = sample - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - # Todo: Here we should apply rep_learning - target = torch.from_numpy(pose_seq) - # target = torch.unsqueeze(target, 2) - target = target.float() - # target = target.to(device) - with torch.no_grad(): - if self.rep_model.encoder == None: # for ablation study - encoded_poses = target - else: - encoded_poses: torch.Tensor = self.rep_model.encoder(target) - - # encoded_poses = torch.squeeze(encoded_poses, 2) - # encoded_poses = encoded_poses.to('cpu') - # reconstructed = encoded_poses.detach().numpy() - - # to tensors - # word_seq_tensor = words_to_tensor(self.lang_model, word_seq, aux_info['end_time']) - # pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() - encoded_poses = encoded_poses.reshape((encoded_poses.shape[0], -1)).float() - # audio = torch.from_numpy(audio).float() - - if self.use_derivative: - diff = [ - (encoded_poses[n, :] - encoded_poses[n - 1, :]) - for n in range(1, encoded_poses.shape[0]) - ] - diff.insert(0, torch.zeros_like(encoded_poses[0, :])) - encoded_poses = torch.hstack((encoded_poses, torch.stack(diff))) - - # return word_seq_tensor, pose_seq, audio, aux_info - return encoded_poses, encoded_poses - - def create_similarity_dataset(self, pickle_file: str, labelstxt_file: str) -> None: - """TODO""" - # Todo: 1. Thos function gets the pickle file that I made in the clustering.py(or flowgmm) process as well - # Todo: as the labels text file that I annotated in the Unity application. - # Todo: 2. Then I will creat those pairs of similarity and dissimilarity - # Todo: 3. Finally, we store the pairs into the class. - # Todo: We will use pairwise label and an extra loss in backpropagation process later. - - # 1. 
call preprocess load - ( - self.data_rnn, - self.labels, - self.pairwise_labels, - self.data_original, - ) = self.load_gesture_data(pickle_file, labelstxt_file) - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - self.data_original = (self.data_original - self.data_mean) / std - - target = torch.from_numpy(self.data_original) - - target = target.float() - # target = target.to(device) - with torch.no_grad(): - self.encoded_labeled_poses = self.rep_model.encoder(target) - if self.use_derivative: - diff = [ - ( - self.encoded_labeled_poses[n, :] - - self.encoded_labeled_poses[n - 1, :] - ) - for n in range(1, self.encoded_labeled_poses.shape[0]) - ] - diff.insert(0, torch.zeros_like(self.encoded_labeled_poses[0, :])) - self.encoded_labeled_poses = torch.cat( - (self.encoded_labeled_poses, torch.stack(diff)), dim=2 - ) - self.pairwise_enabled = True - pass - - def get_labeled_( - self, count: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """TODO""" - stack_pairs1 = torch.zeros( - count, - self.encoded_labeled_poses.shape[1], - self.encoded_labeled_poses.shape[2], - ) - stack_pairs2 = torch.zeros( - count, - self.encoded_labeled_poses.shape[1], - self.encoded_labeled_poses.shape[2], - ) - stack_labels = torch.zeros(count) - rnds = random.sample(range(1, len(self.pairwise_labels)), 3) - k = 0 - for rnd in rnds: - current_pair = self.pairwise_labels[rnd] - s1_ = self.encoded_labeled_poses[current_pair[0]] - s2_ = self.encoded_labeled_poses[current_pair[1]] - ss_label = current_pair[2] - stack_pairs1[k, :, :] = s1_ - stack_pairs2[k, :, :] = s2_ - stack_labels[k] = ss_label - k = k + 1 - - return stack_pairs1, stack_pairs2, stack_labels - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. - """ - self.lang_model = lang_model - - def load_gesture_data( - self, pre_processed_pickle_adress: str, labelstxt_file: str - ) -> Tuple[np.ndarray, np.ndarray, list, list]: - """TODO""" - loaded = pickle.load(open(pre_processed_pickle_adress, "rb")) - liaded_len = len(loaded) - loaded = np.hstack(loaded) - print("len loaded", len(loaded)) - print("Loaded successfully") - - data_latent_rnn = [] - data_latent_linear = np.zeros( - [ - len(loaded), - loaded[0]["latent_linear"].shape[0], - loaded[0]["latent_linear"].shape[1], - ] - ) - data_latent_linear_listwise = [] - data_original = [] - count = len(loaded) - # count = 4000 - for i in range(count): - # 1 - current_latent_linear = loaded[i]["latent_linear"] - current_latent_rnn = loaded[i]["latent_rnn"] - current_original = loaded[i]["original"] - - if len(current_original) != len(loaded[0]["original"]): - continue - - # 2 - # current_latent_linear = np.hstack(current_latent_linear) - current_latent_rnn = np.hstack(current_latent_rnn) - # current_original = np.hstack(current_original) - - # 3 - data_latent_linear[i] = current_latent_linear - data_latent_linear_listwise.append(current_latent_linear) - data_latent_rnn.append(current_latent_rnn) - data_original.append(current_original) - - # Should be constructed here since it is not obvious how many we will have at the end. 
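Sketch of the intended calling sequence for the pairwise-similarity extension above (not part of the patch; dataset and the pickle path are placeholders): create_similarity_dataset attaches the annotated pairs, after which get_labeled_ draws encoded pose pairs plus 0/1 labels for an auxiliary similarity loss.

dataset.create_similarity_dataset(
    "clustering_results.pk",    # hypothetical pickle produced by the clustering step
    "gesture_labels.txt",       # the annotation file added earlier in this patch
)
pairs_a, pairs_b, labels = dataset.get_labeled_(3)   # note: the sampler currently draws exactly 3 random pairs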
- data_latent_linear = np.zeros( - [ - len(data_latent_linear_listwise), - loaded[0]["latent_linear"].shape[0], - loaded[0]["latent_linear"].shape[1], - ] - ) - for i in range(len(data_latent_linear_listwise)): - data_latent_linear[i] = data_latent_linear_listwise[i] - - data_latent_rnn_ndarray = np.array(data_latent_rnn) - - first_order_labels = np.ones(len(data_latent_rnn_ndarray)) * -1 - - labels_list = [] - label_file = open(labelstxt_file, "r") - - for line in label_file: - str = line.split(",") - lbl = str[4] - left = int(str[1]) - middle = int(str[2]) - right = int(str[3]) - chance = random.random() - # if chance > 0.1: - # continue - if lbl == "neither": - # continue - labels_list.append([right, middle, 0]) - labels_list.append([left, middle, 0]) - first_order_labels[right] = 1 - first_order_labels[left] = 1 - first_order_labels[middle] = 1 - if lbl == "right": - labels_list.append([right, middle, 1]) - first_order_labels[right] = 1 - # first_order_labels[left] = 1 - first_order_labels[middle] = 1 - if lbl == "left": - labels_list.append([left, middle, 1]) - # first_order_labels[right] = 1 - first_order_labels[left] = 1 - first_order_labels[middle] = 1 - - # Todo: read from file - - return ( - data_latent_rnn_ndarray[:, 0:200], - first_order_labels, - (labels_list), - data_original, - ) - - -class TrinityDataset_with_cluster(Dataset): - """Contains information and parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - This class is #TODO - - Attributes: - lmdb_dir: A string filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An integer number of clips/entries in the original dataset. - lang_model: A 'Vocab' pre-trained word vector representation or None. - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video. - pairwise_enabled: #TODO - use_derivative: Boolean to stack gradients onto data during training. - encoded_labeled_poses: #TODO - sentence_level: #TODO - RNNautoencoder_model: #TODO - rep_learning_dim: An integer dimension of the model (unused). - rep_model: A DAE neural net model loaded from the above checkpoint. - Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. 
- """ - - def __init__( - self, - args, - lmdb_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - data_mean, - data_std, - ): - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - self.use_derivative = args.use_derivative == "True" - self.kmeanmodel: DBSCAN | KMeans | AgglomerativeClustering = pickle.load( - open("../output/clustering_results/kmeans_model.pk", "rb") - ) - - if args.sentence_level == "True": - self.sentence_level = True - else: - self.sentence_level = False - - logging.info("Reading data '{}'...".format(lmdb_dir)) - if self.sentence_level: - preloaded_dir = lmdb_dir + "_sentence_level" + "_cache" - else: - preloaded_dir = lmdb_dir + "_cache" - - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - lmdb_dir, - preloaded_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - ) - data_sampler.run() - else: - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - # Representation model - checkpoint_path: str = args.rep_learning_checkpoint - self.rep_learning_dim: int = args.rep_learning_dim - ( - rep_learning_args, - rep_model, - rep_loss_fn, - rep_lang_model, - rep_out_dim, - ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") - self.rep_model: torch.nn.Module = rep_model.to("cpu") - self.rep_model.train(False) - # RNN autoencoder - checkpoint_path: str = args.autoencoder_checkpoint - - # RNNAutoencoder_args, RNNAutoencoder_model, RNNAutoencoder_fn,\ - # RNNAutoencoder_model, RNNAutoencoder_dim = utils.train_utils.load_checkpoint_and_model( - # checkpoint_path, device) - - ( - args, - rnn, - loss_fn, - lang_model, - out_dim, - ) = utils.train_utils.load_checkpoint_and_model( - checkpoint_path, device, "autoencoder" - ) - self.RNNAutoencoder_model: torch.nn.Module = rnn.to("cpu") - self.RNNAutoencoder_model.train(False) - - def __len__(self) -> int: - """Get the size of the dataset. - - Returns: - The number of samples in the dataset. 
- """ - return self.n_samples - - def __getitem__( - self, idx: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict, torch.Tensor]: - """TODO""" - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info, portion = sample - - def words_to_tensor( - lang: Vocab | None, words: list[list], end_time: float | None = None - ): - indexes = [lang.SOS_token] - for word in words: - if end_time is not None and word[1] > end_time: - break - indexes.append(lang.get_word_index(word[0])) - indexes.append(lang.EOS_token) - return torch.Tensor(indexes).long() - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - # Todo: Here we should apply rep_learning - target = torch.from_numpy(pose_seq) - target = target.float() - with torch.no_grad(): - encoded_poses = self.rep_model.encoder(target) - - encoded_poses = encoded_poses.reshape((encoded_poses.shape[0], -1)).float() - - if self.use_derivative: - diff = [ - (encoded_poses[n, :] - encoded_poses[n - 1, :]) - for n in range(1, encoded_poses.shape[0]) - ] - diff.insert(0, torch.zeros_like(encoded_poses[0, :])) - encoded_poses = torch.hstack((encoded_poses, torch.stack(diff))) - with torch.no_grad(): - out_pose, latent, mue, logvar = self.RNNAutoencoder_model( - encoded_poses.unsqueeze(0), encoded_poses.unsqueeze(0) - ) - latent = latent[: self.RNNAutoencoder_model.decoder.n_layers] - latent = latent.squeeze(1) - latent = latent.reshape((1, -1)).float() - # latent = latent.squeeze(0) - cluster_id = self.kmeanmodel.predict(latent.cpu().detach().numpy()) - cluster_id = torch.Tensor(cluster_id).long() - # to tensors - word_seq_tensor = words_to_tensor( - self.lang_model, word_seq, aux_info["end_time"] - ) - pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() - - # audio = torch.from_numpy(audio).float() - - # return cluster_id - return word_seq_tensor, encoded_poses, audio, aux_info, cluster_id - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. - """ - self.lang_model = lang_model - - -class TrinityDataset_sentencelevel(Dataset): - """Contains information and parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - This class is #TODO. - - Attributes: - lmdb_dir: A string filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An integer number of clips/entries in the original dataset. - lang_model: A 'Vocab' pre-trained word vector representation or None. - data_mean: A mean calculated from each video in the original dataset. 
- data_std: A standard deviation calculcated from each video. - args: #TODO - kmeanmodel: #TODO - sentence_level: #TODO - RNNAutoencoder_model: #TODO - use_derivative: #TODO - rep_learning_dim: An integer dimension of the model (unused). - rep_model: A DAE neural net model loaded from the above checkpoint. - Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. - """ - - def __init__( - self, - args: argparse.Namespace, - lmdb_dir: str, - n_poses: int, - subdivision_stride: int, - pose_resampling_fps: int, - data_mean: list[float], - data_std: list[float], - ): - """ """ - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.subdivision_stride_sentence = args.subdivision_stride_sentence - self.args = args - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - self.use_derivative = args.use_derivative == "True" - - # Todo: fix adress -- Fixed - test = os.path.dirname(args.autoencoder_checkpoint) - self.kmeanmodel = pickle.load( - open( - os.path.dirname(args.autoencoder_checkpoint) - + "/clusters/kmeans_model.pk", - "rb", - ) - ) - # self.clustering_transforms = pickle.load(open(os.path.dirname(args.autoencoder_checkpoint) - # + '/clusters/transforms.pkl', 'rb')) - - if args.sentence_level == "True": - self.sentence_level = True - else: - self.sentence_level = False - - logging.info("Reading data '{}'...".format(lmdb_dir)) - - if self.sentence_level: - # preloaded_dir = lmdb_dir + '_sentence_level' + '_cache' - preloaded_dir = ( - args.model_save_path - + "lmdb/" - + os.path.basename((lmdb_dir)) - + "_sentence_level" - + "_cache" - ) - else: - # preloaded_dir = lmdb_dir + '_cache' - preloaded_dir = ( - args.model_save_path + "lmdb/" + os.path.basename((lmdb_dir)) + "_cache" - ) - - if not os.path.exists(args.model_save_path + "lmdb"): - os.mkdir(args.model_save_path + "lmdb") - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - args, - lmdb_dir, - preloaded_dir, - n_poses, - self.subdivision_stride_sentence, - pose_resampling_fps, - sentence_level=self.sentence_level, - ) - data_sampler.run() - else: - # Todo: Remove later - - # data_sampler = DataPreprocessor(args, lmdb_dir, preloaded_dir, n_poses, - # subdivision_stride, pose_resampling_fps, - # sentence_level=self.sentence_level) - # data_sampler.run() - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - # Representation model - checkpoint_path: str = args.rep_learning_checkpoint - self.rep_learning_dim: int = args.rep_learning_dim - ( - rep_learning_args, - rep_model, - rep_loss_fn, - rep_lang_model, - rep_out_dim, - ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") - self.rep_model = rep_model.to("cpu") - self.rep_model.train(False) - # RNN autoencoder - checkpoint_path: str = args.autoencoder_checkpoint - - # RNNAutoencoder_args, RNNAutoencoder_model, RNNAutoencoder_fn,\ - # RNNAutoencoder_model, RNNAutoencoder_dim = utils.train_utils.load_checkpoint_and_model( - # checkpoint_path, device) - - ( - args_rnn, - rnn, - loss_fn, - lang_model, - out_dim, - ) = utils.train_utils.load_checkpoint_and_model( - checkpoint_path, device, "autoencoder_vq" - ) - self.RNNAutoencoder_model: torch.nn.Module = 
rnn.to("cpu") - self.RNNAutoencoder_model.train(False) - - def __len__(self) -> int: - """Get the size of the dataset. - - Returns: - The number of samples in the dataset. - """ - return self.n_samples - - def __getitem__( - self, idx: int - ) -> Tuple[ - torch.Tensor, - torch.Tensor, - torch.Tensor, - dict, - torch.Tensor, - torch.Tensor, - torch.Tensor, - ]: - """TODO""" - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - ( - word_seq, - pose_seq, - audio_raws, - audio_mels, - aux_info, - sentence_leve_latents, - GP3_Embedding, - ) = sample - - def words_to_tensor( - lang: Vocab | None, words: list[list], end_time: float | None = None - ) -> torch.Tensor: - # indexes = [lang.SOS_token] - indexes = [] - if len(words) <= 0: - print() - for word in words: - if end_time is not None and word[1] > end_time: - break - indexes.append(lang.get_word_index(word[0])) - # indexes.append(lang.EOS_token) - return torch.Tensor(indexes).long() - - def poseIndex_to_tensor(sop, eop, poses_indicies): - indexes = [sop] - for pose_index in poses_indicies: - indexes.append(pose_index) - indexes.append(eop) - return torch.Tensor(indexes).long() - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - # to tensors - word_seq_tensor = words_to_tensor( - self.lang_model, word_seq, aux_info["end_time"] - ) - pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() - - sentence_leve_latents = torch.from_numpy(sentence_leve_latents) - sentence_leve_latents = sentence_leve_latents.reshape( - [sentence_leve_latents.shape[0], -1] - ).float() - - # Todo: we may need do some preprocessing. (e.g. padding, resampling, etc.) - # Todo: Move list-->ndarray to the preprocessing. - audio_raw_for_now = False - if audio_raw_for_now: - audio = audio_raws - else: - audio = audio_mels - audio = np.array(audio) - audio = torch.from_numpy(audio).float() - # audio = torch.reshape(audio, (sentence_leve_latents.shape[0], -1)) - # audio = torch.reshape(audio, (4, -1)) - - # return cluster_id - cluster_ids = np.zeros(sentence_leve_latents.shape[0]) - if self.RNNAutoencoder_model.vq == True: - ( - loss_vq, - quantized, - perplexity_vq, - encodings, - ) = self.RNNAutoencoder_model.vq_layer(sentence_leve_latents) - cluster_ids = torch.argmax(encodings, dim=1) - - # q = poseIndex_to_tensor(sop=512, eop=513, - # poses_indicies=cluster_ids.cpu().detach().numpy()) - # cluster_ids = q - # print(cluster_ids) - else: - cluster_ids = self.kmeanmodel.predict( - # self.clustering_transforms['scalar'].transform - (sentence_leve_latents.cpu().detach().numpy()) - ) - cluster_ids = torch.from_numpy(cluster_ids).long() - # cluster_id = torch.Tensor(cluster_id).long() - # print(cluster_ids) - - # clusters_portion = self.clustering_transforms['portion'][cluster_ids] - # clusters_portion = torch.from_numpy(clusters_portion) - - # print((cluster_ids)) - try: - GP3_Embedding = torch.from_numpy(GP3_Embedding).float() - except: - pass - - return ( - word_seq_tensor, - pose_seq, - audio, - aux_info, - sentence_leve_latents, - cluster_ids, - GP3_Embedding, - ) - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. 
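A condensed sketch (not part of the patch) of how __getitem__ above discretises the sentence-level latents into cluster ids: either from the VQ codebook assignments or, when the autoencoder has no VQ layer, from the saved k-means model. rnn_model, kmeans_model and sentence_leve_latents stand in for the attributes loaded in __init__.

import torch

if rnn_model.vq:
    _, _, _, encodings = rnn_model.vq_layer(sentence_leve_latents)
    cluster_ids = torch.argmax(encodings, dim=1)          # one codebook index per latent
else:
    cluster_ids = torch.from_numpy(
        kmeans_model.predict(sentence_leve_latents.cpu().detach().numpy())
    ).long()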
- """ - self.lang_model = lang_model +""" +""" + + +from __future__ import annotations +import logging +import os +import pickle +import random +from typing import Tuple + +import numpy as np +import lmdb as lmdb +import torch +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import Dataset +from torch.utils.data.dataloader import default_collate +from data_loader.data_preprocessor import DataPreprocessor +import pyarrow +from configargparse import argparse +from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering + +from model.vocab import Vocab +import utils + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +def word_seq_collate_fn( + data: list, +) -> ( + Tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict, + torch.Tensor, + torch.Tensor, + torch.Tensor, + ] + | Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, dict] +): + """Collate function for loading word sequences in variable lengths. + + This function returns extra values depending if sentence_leve is True or False. + sentence_leve is set to always True and must be manually changed. + + Args: + data: A list of samples including the following values in each element: + word_seq: A list of Tensors of word vector representations. + words_lengths: A list of Tensors of word lengths in word_seq. + poses_seq: A list of Tensors of poses/gestures. + audio: A list of Tensors of audio data. + + Returns: + A 5-Tuple or 8-Tuple: + + 8-Tuple (if sentence_level is set to True): + word_seq: A Tensor of word vector representations. + words_lengths: A Tensor of word lengths in word_seq. + poses_seq: A Tensor of poses/gestures. + audio: A Tensor of audio data. + aux_info: A dict containing info such as name, start/end frames and times. + sentence_leve_latents: #TODO + cluster_portion: #TODO + GPT3_Embedding: #TODO + + 5-Tuple: + The first five values in the 8-Tuple. + """ + # sort a list by sequence length (descending order) to use pack_padded_sequence + data.sort(key=lambda x: len(x[0]), reverse=True) + + # separate source and target sequences + # Todo: fix this using args + sentence_leve = True + if not sentence_leve: + ( + word_seq, + poses_seq, + audio, + aux_info, + ) = zip(*data) + else: + ( + word_seq, + poses_seq, + audio, + aux_info, + sentence_leve_latents, + cluster_portion, + GPT3_Embedding, + ) = zip(*data) + + # merge sequences + words_lengths = torch.LongTensor([len(x) for x in word_seq]) + word_seq = pad_sequence(word_seq, batch_first=True).long() + + poses_seq = default_collate(poses_seq) + audio = default_collate(audio) + if sentence_leve: + sentence_leve_latents = default_collate(sentence_leve_latents) + cluster_portion = default_collate(cluster_portion) + GPT3_Embedding = default_collate(GPT3_Embedding) + + # audio = default_collate(audio) + aux_info = {key: default_collate([d[key] for d in aux_info]) for key in aux_info[0]} + + if sentence_leve: + return ( + word_seq, + words_lengths, + poses_seq, + audio, + aux_info, + sentence_leve_latents, + cluster_portion, + GPT3_Embedding, + ) + else: + return word_seq, words_lengths, poses_seq, audio, aux_info + + +class TrinityDataset(Dataset): + """Contains information and associated parameters of a (Trinity) dataset. + + This is a PyTorch Dataset subclass containing information of a Trinity dataset. 
+ https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ + + Attributes: + lmdb_dir: A string representing the filepath of the directory containing the actual dataset. + lmdb_env: A Lmdb ('Lightning Memory-Mapped' Database) object loaded from .mdb files located at the lmdb_dir. + n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + n_samples: An int representing the number of clips/entries in the original dataset. + lang_model: A pre-trained (English) language vector representation contained in the 'Vocab' custom class. + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video in the original dataset. + """ + + def __init__( + self, + args: argparse.Namespace, + lmdb_dir: str, + n_poses: int, + subdivision_stride: int, + pose_resampling_fps: int, + data_mean: list[float], + data_std: list[float], + ): + """Initialize with multiple dataset parameters. + + The args argument must contain the following keys: + name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). + rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. + autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. + sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). + + Args: + args: A configargparse object containing parameters (See above). + lmdb_dir: A string representing the filepath of the directory containing the actual dataset. + n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). + pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video in the original dataset. + """ + self.lmdb_dir = lmdb_dir + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.skeleton_resampling_fps = pose_resampling_fps + self.lang_model = None + self.data_mean = np.array(data_mean).squeeze() + self.data_std = np.array(data_std).squeeze() + + logging.info("Reading data '{}'...".format(lmdb_dir)) + preloaded_dir = lmdb_dir + "_cache" + if not os.path.exists(preloaded_dir): + data_sampler = DataPreprocessor( + args, + lmdb_dir, + preloaded_dir, + n_poses, + subdivision_stride, + pose_resampling_fps, + sentence_level=False, + ) + data_sampler.run() + else: + logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) + + # init lmdb + self.lmdb_env: lmdb.Environment = lmdb.open( + preloaded_dir, readonly=True, lock=False + ) + with self.lmdb_env.begin() as txn: + self.n_samples = txn.stat()["entries"] + + def __len__(self) -> int: + """Get the size of the dataset. + + Returns: + The number of samples in the original dataset. 
+ """ + return self.n_samples + + def __getitem__( + self, idx: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]: + """Get an item at a specific index in the dataset. + + Retrieves a specific entry in the original dataset and + associated information of that entry such as words, audio, poses, etc. + + Args: + idx: The index of the item to retrieve. + + Returns: + A 4-Tuple: + word_seq_tensor: A Tensor of word vector representations. + pose_seq: A Tensor of pose/gesture data. + audio: A Tensor of audio data. + aux_info: A dict containing the string keys: + 'vid': A string name for the information. + 'start_frame_no': An integer of the start index. + 'end_frame_no': An integer of the end index. + 'start_time': A float of the start time of the clip. + 'end_time': A float of the end time of the clip. + """ + with self.lmdb_env.begin(write=False) as txn: + key = "{:010}".format(idx).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + word_seq, pose_seq, audio, aux_info = sample + + def words_to_tensor(lang, words, end_time=None): + indexes = [lang.SOS_token] + for word in words: + if end_time is not None and word[1] > end_time: + break + indexes.append(lang.get_word_index(word[0])) + indexes.append(lang.EOS_token) + return torch.Tensor(indexes).long() + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq: torch.Tensor = (pose_seq - self.data_mean) / std + + # to tensors + word_seq_tensor = words_to_tensor( + self.lang_model, word_seq, aux_info["end_time"] + ) + pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() + audio = torch.from_numpy(audio).float() + + return word_seq_tensor, pose_seq, audio, aux_info + + def set_lang_model(self, lang_model: Vocab) -> None: + """Set the word vector representation to be used with the dataset. + + Modifies the internal state of the object at the attribute 'lang_model'. + + Args: + lang_model: A pre-trained word vector representation contained in the 'Vocab' class. + """ + self.lang_model = lang_model + + +class TrinityDataset_DAE(Dataset): + """Contains information and parameters of a (Trinity) dataset. + + This is a PyTorch Dataset subclass containing information of a Trinity dataset. + https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ + + This class is focussed on the pose/gesture data only. + + Attributes: + lmdb_dir: A string filepath of the directory containing the actual dataset. + lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. + n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + n_samples: An integer number of clips/entries in the original dataset. + lang_model: A 'Vocab' pre-trained word vector representation. + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video. + all_poses: A list where each element is a dict containing string keys: + 'original': A Tensor of each pose/gesture data. + 'noisy': A Tensor with the same values as 'original'. 
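+                (Noise is not added here; the DAE's dropout layer injects noise during training instead.)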
+ """ + + def __init__( + self, + args: argparse.Namespace, + lmdb_dir: str, + n_poses: int, + subdivision_stride: int, + pose_resampling_fps: int, + data_mean: list[float], + data_std: list[float], + ): + """Initialize with dataset location and several parameters. + + The args argument must contain the following keys: + name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). + rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. + autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. + sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). + + Args: + args: A configargparse object containing parameters (See above). + lmdb_dir: A string representing the filepath of the directory containing the actual dataset. + n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). + pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video in the original dataset. + """ + self.lmdb_dir = lmdb_dir + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.skeleton_resampling_fps = pose_resampling_fps + self.lang_model = None + self.data_mean = np.array(data_mean).squeeze() + self.data_std = np.array(data_std).squeeze() + + logging.info("Reading data '{}'...".format(lmdb_dir)) + preloaded_dir = lmdb_dir + "_cache" + if not os.path.exists(preloaded_dir): + data_sampler = DataPreprocessor( + args, + lmdb_dir, + preloaded_dir, + n_poses, + subdivision_stride, + pose_resampling_fps, + ) + data_sampler.run() + else: + logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) + + # init lmdb + self.lmdb_env: lmdb.Environment = lmdb.open( + preloaded_dir, readonly=True, lock=False + ) + with self.lmdb_env.begin() as txn: + self.n_samples = txn.stat()["entries"] + + # Initiate all poses + self.all_poses = [] + self.create_all_poses() + print("data init finished!") + + def __len__(self) -> int: + """Get the size of the dataset. + + Returns: + The number of samples in the dataset. + """ + # return 500 + # return (self.n_samples * self.n_poses) //5 + return self.n_samples + + def create_all_poses(self) -> None: + """Move all poses/gestures from database into object. + + Modifies the internal state of the object at attribute 'all_poses'. + """ + with self.lmdb_env.begin(write=False) as txn: + for i in range(self.n_samples): + key = "{:010}".format(i).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + word_seq, pose_seq, audio, aux_info = sample + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq = (pose_seq - self.data_mean) / std + + for j in range(0, len(pose_seq)): + original = pose_seq[j, :] + var_coef = 1 # std + sigma = 1 + # noisy = self.add_noise(original, 0.0, std) + noisy = original # dropout layer adds noise instead + self.all_poses.append({"original": original, "noisy": noisy}) + + def get_item_Memory_Efficient(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Get the item at a specific index in the dataset. 
+ + This method improves memory performance by returning + the pose/gesture data of the item only. + + Args: + idx: An integer index of the item to get. + + Returns: + A 2-Tuple: + noisy: A Tensor of pose/gesture data. + original: A Tensor with the same values as 'noisy'. + """ + idx_lmdb = idx // self.n_poses + + with self.lmdb_env.begin(write=False) as txn: + key = "{:010}".format(idx_lmdb).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + word_seq, pose_seq, audio, aux_info = sample + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq = (pose_seq - self.data_mean) / std + + original = pose_seq[idx % self.n_poses, :] + noisy = original + + original = ( + torch.from_numpy(original).reshape((original.shape[0], -1)).float() + ) + noisy = torch.from_numpy(noisy).reshape((noisy.shape[0], -1)).float() + + return noisy, original + + def add_noise( + self, x: np.ndarray, variance_multiplier: np.ndarray, sigma: np.ndarray + ) -> np.ndarray: + """Add Gaussian noise to the input data. + + The noise added is based on the variance_multiplier and sigma array values. + + Args: + x: A numeric numpy array of data. + variance_multiplier: A numeric numpy array of coefficients to multiple variance of the noise on + sigma: A numeric numpy array of the variance of the dataset + + Returns: + A numeric numpy array of the data with noise added. + """ + eps = 1e-15 + noise = np.random.normal( + 0.0, np.multiply(sigma, variance_multiplier) + eps, x.shape + ) + x = x + noise + return x + + def add_noise2(self, x: np.ndarray, prob: float) -> np.ndarray: + """Add Gaussian noise to the input data. + + The noise is added by zeroing the specific elements. + + Args: + x: A numeric numpy array of data. + prob: A float percentage of adding noise to a single element. + + Returns: + A numeric numpy array of the data with noise added. + """ + for i in range(len(x)): + rnd = random.random() + if rnd < prob: + x[i] = 0 + return x + + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Get the item at a specific index in the dataset. + + This method returns the data exactly as stored. The 'noisy' data does + not have noise added but assumes that noise has previously been added. + + Args: + idx: An integer index of the item to get. + + Returns: + A 2-Tuple: + noisy: A Tensor of pose/gesture data (with noise). + original: A Tensor of pose/gesture data. + """ + # Keep and document for only large datasets + # Todo: I think this way is more efficient than the prev. approach + # I need to double check and make sure everything works as before. + # return self.get_item_Memory_Efficient(idx) + # original, noisy = self.get_item_Memory_Efficient(idx) + # return noisy, original + + original = self.all_poses[idx]["original"] + noisy = self.all_poses[idx]["noisy"] + + original = torch.from_numpy(original).reshape((original.shape[0], -1)).float() + noisy = torch.from_numpy(noisy).reshape((noisy.shape[0], -1)).float() + + return noisy, original + + def set_lang_model(self, lang_model: Vocab) -> None: + """Set the word vector representation to be used with the dataset. + + Modifies the internal state of the object at the attribute 'lang_model'. + + Args: + lang_model: A pre-trained word vector representation contained in the 'Vocab' class. + """ + self.lang_model = lang_model + + +class TrinityDataset_DAEed_Autoencoder(Dataset): + """Contains information and parameters of a (Trinity) dataset. 
+ + This is a PyTorch Dataset subclass containing information of a Trinity dataset. + https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ + + This class is #TODO. + + Attributes: + lmdb_dir: A string filepath of the directory containing the actual dataset. + lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. + n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + n_samples: An integer number of clips/entries in the original dataset. + lang_model: A 'Vocab' pre-trained word vector representation or None. + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video. + pairwise_enabled: #TODO + use_derivative: Boolean to stack gradients to data during training. + encoded_labeled_poses: #TODO + rep_learning_dim: An integer dimension of the model (unused). + rep_learning_checkpoint: A string filepath to saved DAE model checkpoints. + rep_model: A DAE neural net model loaded from the above checkpoint. + Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. + """ + + def __init__( + self, + args: argparse.Namespace, + lmdb_dir: str, + n_poses: int, + subdivision_stride: int, + pose_resampling_fps: int, + data_mean: list[float], + data_std: list[float], + ): + """Initialize with dataset location and several parameters. + + The args argument must contain the following keys: + name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). + rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. + autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. + sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). + rep_learning_checkpoint: A string filepath to saved model checkpoints. + use_derivative: A boolean whether to use derivatives. + + Args: + args: A configargparse object containing parameters (See above). + lmdb_dir: A string representing the filepath of the directory containing the actual dataset. + n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). + pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video in the original dataset. 
+ """ + self.lmdb_dir = lmdb_dir + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.skeleton_resampling_fps = pose_resampling_fps + self.lang_model: Vocab | None = None + self.data_mean = np.array(data_mean).squeeze() + self.data_std = np.array(data_std).squeeze() + + # We will change it to true once we call the creat_similarity_dataset + self.pairwise_enabled: bool = False + self.use_derivative: bool = args.use_derivative == "True" + + logging.info("Reading data '{}'...".format(lmdb_dir)) + preloaded_dir = lmdb_dir + "_cache" + if not os.path.exists(preloaded_dir): + data_sampler = DataPreprocessor( + args, + lmdb_dir, + preloaded_dir, + n_poses, + subdivision_stride, + pose_resampling_fps, + ) + data_sampler.run() + else: + logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) + + # init lmdb + self.lmdb_env: lmdb.Environment = lmdb.open( + preloaded_dir, readonly=True, lock=False + ) + with self.lmdb_env.begin() as txn: + self.n_samples = txn.stat()["entries"] + + # Todo: we need to initiate pre-trained representation learning model + checkpoint_path: str = args.rep_learning_checkpoint + self.rep_learning_dim: int = args.rep_learning_dim + ( + rep_learning_args, + rep_model, + rep_loss_fn, + rep_lang_model, + rep_out_dim, + ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") + self.rep_model: torch.nn.Module = rep_model.to("cpu") + self.rep_model.train(False) + + def __len__(self) -> int: + """Get the number of samples in the dataset. + + Returns: + The integer size of samples in the dataset. + """ + return self.n_samples + + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Get the item at a specific index in the dataset. + + Args: + idx: An integer index of the item to get. + + Returns: + A 2-Tuple: + encoded_poses: A Tensor of pose/gesture data. + encoded_poses: The same tensor as above. 
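+
+        Both returned elements are identical because the autoencoder is trained
+        to reconstruct its own (DAE-encoded) input.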
+        """
+        with self.lmdb_env.begin(write=False) as txn:
+            key = "{:010}".format(idx).encode("ascii")
+            sample = txn.get(key)
+
+            sample: Tuple[
+                np.ndarray, np.ndarray, np.ndarray, dict
+            ] = pyarrow.deserialize(sample)
+            word_seq, pose_seq, audio, aux_info = sample
+
+        # normalize
+        std = np.clip(self.data_std, a_min=0.01, a_max=None)
+        pose_seq = (pose_seq - self.data_mean) / std
+
+        # Todo: Here we should apply rep_learning
+        target = torch.from_numpy(pose_seq)
+        # target = torch.unsqueeze(target, 2)
+        target = target.float()
+        # target = target.to(device)
+        with torch.no_grad():
+            if self.rep_model.encoder is None:  # for ablation study
+                encoded_poses = target
+            else:
+                encoded_poses: torch.Tensor = self.rep_model.encoder(target)
+
+        # encoded_poses = torch.squeeze(encoded_poses, 2)
+        # encoded_poses = encoded_poses.to('cpu')
+        # reconstructed = encoded_poses.detach().numpy()
+
+        # to tensors
+        # word_seq_tensor = words_to_tensor(self.lang_model, word_seq, aux_info['end_time'])
+        # pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float()
+        encoded_poses = encoded_poses.reshape((encoded_poses.shape[0], -1)).float()
+        # audio = torch.from_numpy(audio).float()
+
+        if self.use_derivative:
+            diff = [
+                (encoded_poses[n, :] - encoded_poses[n - 1, :])
+                for n in range(1, encoded_poses.shape[0])
+            ]
+            diff.insert(0, torch.zeros_like(encoded_poses[0, :]))
+            encoded_poses = torch.hstack((encoded_poses, torch.stack(diff)))
+
+        # return word_seq_tensor, pose_seq, audio, aux_info
+        return encoded_poses, encoded_poses
+
+    def create_similarity_dataset(self, pickle_file: str, labelstxt_file: str) -> None:
+        """Create similarity/dissimilarity gesture pairs for a pairwise loss.
+
+        Loads the pickle file produced by the clustering step together with the
+        labels text file annotated in the Unity application, builds pairs of
+        similar and dissimilar gestures, and stores them on the object so they
+        can be used as an extra pairwise loss during backpropagation.
+
+        Modifies the internal state of the object ('encoded_labeled_poses',
+        'pairwise_enabled').
+
+        Args:
+            pickle_file: A string filepath to the pre-processed clustering pickle file.
+            labelstxt_file: A string filepath to the annotated labels text file.
+        """
+        # Todo: 1. This function gets the pickle file that I made in the clustering.py (or flowgmm) process as well
+        # Todo: as the labels text file that I annotated in the Unity application.
+        # Todo: 2. Then I will create those pairs of similarity and dissimilarity
+        # Todo: 3. Finally, we store the pairs into the class.
+        # Todo: We will use pairwise labels and an extra loss in the backpropagation process later.
+
+        # 1. Load the preprocessed gesture data and pairwise labels.
+        (
+            self.data_rnn,
+            self.labels,
+            self.pairwise_labels,
+            self.data_original,
+        ) = self.load_gesture_data(pickle_file, labelstxt_file)
+
+        # normalize
+        std = np.clip(self.data_std, a_min=0.01, a_max=None)
+        self.data_original = (self.data_original - self.data_mean) / std
+
+        target = torch.from_numpy(self.data_original)
+
+        target = target.float()
+        # target = target.to(device)
+        with torch.no_grad():
+            self.encoded_labeled_poses = self.rep_model.encoder(target)
+            if self.use_derivative:
+                diff = [
+                    (
+                        self.encoded_labeled_poses[n, :]
+                        - self.encoded_labeled_poses[n - 1, :]
+                    )
+                    for n in range(1, self.encoded_labeled_poses.shape[0])
+                ]
+                diff.insert(0, torch.zeros_like(self.encoded_labeled_poses[0, :]))
+                self.encoded_labeled_poses = torch.cat(
+                    (self.encoded_labeled_poses, torch.stack(diff)), dim=2
+                )
+        self.pairwise_enabled = True
+
+    def get_labeled_(
+        self, count: int
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Randomly sample labeled gesture pairs for the pairwise similarity loss.
+
+        Args:
+            count: An integer number of pairs to sample.
+
+        Returns:
+            A 3-Tuple:
+                stack_pairs1: A Tensor of the first gesture of each sampled pair.
+                stack_pairs2: A Tensor of the second gesture of each sampled pair.
+                stack_labels: A Tensor of the pairwise similarity labels.
+        """
+        stack_pairs1 = torch.zeros(
+            count,
+            self.encoded_labeled_poses.shape[1],
+            self.encoded_labeled_poses.shape[2],
+        )
+        stack_pairs2 = torch.zeros(
+            count,
+            self.encoded_labeled_poses.shape[1],
+            self.encoded_labeled_poses.shape[2],
+        )
+        stack_labels = torch.zeros(count)
+        # sample 'count' distinct pairwise labels
+        rnds = random.sample(range(1, len(self.pairwise_labels)), count)
+        k = 0
+        for rnd in rnds:
+            current_pair = self.pairwise_labels[rnd]
+            s1_ = self.encoded_labeled_poses[current_pair[0]]
+            s2_ = self.encoded_labeled_poses[current_pair[1]]
+            ss_label = current_pair[2]
+            stack_pairs1[k, :, :] = s1_
+            stack_pairs2[k, :, :] = s2_
+            stack_labels[k] = ss_label
+            k = k + 1
+
+        return stack_pairs1, stack_pairs2, stack_labels
+
+    def set_lang_model(self, lang_model: Vocab) -> None:
+        """Set the word vector representation to be used with the dataset.
+
+        Modifies the internal state of the object at the attribute 'lang_model'.
+
+        Args:
+            lang_model: A pre-trained word vector representation contained in the 'Vocab' class.
+        """
+        self.lang_model = lang_model
+
+    def load_gesture_data(
+        self, pre_processed_pickle_address: str, labelstxt_file: str
+    ) -> Tuple[np.ndarray, np.ndarray, list, list]:
+        """Load clustered gesture data and pairwise similarity annotations.
+
+        Args:
+            pre_processed_pickle_address: A string filepath to the pickle file produced by the clustering step.
+            labelstxt_file: A string filepath to the annotated labels text file.
+
+        Returns:
+            A 4-Tuple:
+                data_latent_rnn_ndarray: A numpy array of RNN latent representations (first 200 dimensions).
+                first_order_labels: A numpy array flagging which samples appear in a labeled pair.
+                labels_list: A list of [index_a, index_b, label] pairwise similarity annotations.
+                data_original: A list of the original pose sequences.
+        """
+        loaded = pickle.load(open(pre_processed_pickle_address, "rb"))
+        loaded_len = len(loaded)
+        loaded = np.hstack(loaded)
+        print("len loaded", len(loaded))
+        print("Loaded successfully")
+
+        data_latent_rnn = []
+        data_latent_linear = np.zeros(
+            [
+                len(loaded),
+                loaded[0]["latent_linear"].shape[0],
+                loaded[0]["latent_linear"].shape[1],
+            ]
+        )
+        data_latent_linear_listwise = []
+        data_original = []
+        count = len(loaded)
+        # count = 4000
+        for i in range(count):
+            # 1
+            current_latent_linear = loaded[i]["latent_linear"]
+            current_latent_rnn = loaded[i]["latent_rnn"]
+            current_original = loaded[i]["original"]
+
+            if len(current_original) != len(loaded[0]["original"]):
+                continue
+
+            # 2
+            # current_latent_linear = np.hstack(current_latent_linear)
+            current_latent_rnn = np.hstack(current_latent_rnn)
+            # current_original = np.hstack(current_original)
+
+            # 3
+            data_latent_linear[i] = current_latent_linear
+            data_latent_linear_listwise.append(current_latent_linear)
+            data_latent_rnn.append(current_latent_rnn)
+            data_original.append(current_original)
+
+        # Should be constructed here since it is not obvious how many we will have at the end.
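+        # Clips whose 'original' length differs from the first clip were skipped
+        # above, so the preallocated array may contain unused rows; rebuild it
+        # from the filtered listwise copy instead.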
+        data_latent_linear = np.zeros(
+            [
+                len(data_latent_linear_listwise),
+                loaded[0]["latent_linear"].shape[0],
+                loaded[0]["latent_linear"].shape[1],
+            ]
+        )
+        for i in range(len(data_latent_linear_listwise)):
+            data_latent_linear[i] = data_latent_linear_listwise[i]
+
+        data_latent_rnn_ndarray = np.array(data_latent_rnn)
+
+        first_order_labels = np.ones(len(data_latent_rnn_ndarray)) * -1
+
+        labels_list = []
+        with open(labelstxt_file, "r") as label_file:
+            for line in label_file:
+                parts = line.split(",")
+                lbl = parts[4]
+                left = int(parts[1])
+                middle = int(parts[2])
+                right = int(parts[3])
+                chance = random.random()
+                # if chance > 0.1:
+                #     continue
+                if lbl == "neither":
+                    # continue
+                    labels_list.append([right, middle, 0])
+                    labels_list.append([left, middle, 0])
+                    first_order_labels[right] = 1
+                    first_order_labels[left] = 1
+                    first_order_labels[middle] = 1
+                if lbl == "right":
+                    labels_list.append([right, middle, 1])
+                    first_order_labels[right] = 1
+                    # first_order_labels[left] = 1
+                    first_order_labels[middle] = 1
+                if lbl == "left":
+                    labels_list.append([left, middle, 1])
+                    # first_order_labels[right] = 1
+                    first_order_labels[left] = 1
+                    first_order_labels[middle] = 1
+
+        # Todo: read from file
+
+        return (
+            data_latent_rnn_ndarray[:, 0:200],
+            first_order_labels,
+            labels_list,
+            data_original,
+        )
+
+
+class TrinityDataset_with_cluster(Dataset):
+    """Contains information and parameters of a (Trinity) dataset.
+
+    This is a PyTorch Dataset subclass containing information of a Trinity dataset.
+    https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/
+
+    This class is #TODO
+
+    Attributes:
+        lmdb_dir: A string filepath of the directory containing the actual dataset.
+        lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir.
+        n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)).
+        subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap).
+        skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps).
+        n_samples: An integer number of clips/entries in the original dataset.
+        lang_model: A 'Vocab' pre-trained word vector representation or None.
+        data_mean: A mean calculated from each video in the original dataset.
+        data_std: A standard deviation calculated from each video.
+        pairwise_enabled: #TODO
+        use_derivative: Boolean flag to stack frame-to-frame differences (derivatives) onto the data during training.
+        encoded_labeled_poses: #TODO
+        sentence_level: #TODO
+        RNNAutoencoder_model: #TODO
+        rep_learning_dim: An integer dimension of the model (unused).
+        rep_model: A DAE neural net model loaded from the above checkpoint.
+            Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py.
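+        kmeanmodel: A clustering model (e.g. KMeans) loaded from
+            '../output/clustering_results/kmeans_model.pk', used to assign cluster ids to gesture latents.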
+    """
+
+    def __init__(
+        self,
+        args,
+        lmdb_dir,
+        n_poses,
+        subdivision_stride,
+        pose_resampling_fps,
+        data_mean,
+        data_std,
+    ):
+        """Initialize with dataset location and several parameters.
+
+        See TrinityDataset_DAEed_Autoencoder.__init__ for the meaning of the
+        parameters; args must additionally provide 'sentence_level',
+        'use_derivative', 'rep_learning_checkpoint', 'rep_learning_dim' and
+        'autoencoder_checkpoint'.
+        """
+        self.lmdb_dir = lmdb_dir
+        self.n_poses = n_poses
+        self.subdivision_stride = subdivision_stride
+        self.skeleton_resampling_fps = pose_resampling_fps
+        self.lang_model = None
+        self.data_mean = np.array(data_mean).squeeze()
+        self.data_std = np.array(data_std).squeeze()
+        self.use_derivative = args.use_derivative == "True"
+        self.kmeanmodel: DBSCAN | KMeans | AgglomerativeClustering = pickle.load(
+            open("../output/clustering_results/kmeans_model.pk", "rb")
+        )
+
+        if args.sentence_level == "True":
+            self.sentence_level = True
+        else:
+            self.sentence_level = False
+
+        logging.info("Reading data '{}'...".format(lmdb_dir))
+        if self.sentence_level:
+            preloaded_dir = lmdb_dir + "_sentence_level" + "_cache"
+        else:
+            preloaded_dir = lmdb_dir + "_cache"
+
+        if not os.path.exists(preloaded_dir):
+            data_sampler = DataPreprocessor(
+                args,
+                lmdb_dir,
+                preloaded_dir,
+                n_poses,
+                subdivision_stride,
+                pose_resampling_fps,
+            )
+            data_sampler.run()
+        else:
+            logging.info("Found pre-loaded samples from {}".format(preloaded_dir))
+
+        # init lmdb
+        self.lmdb_env: lmdb.Environment = lmdb.open(
+            preloaded_dir, readonly=True, lock=False
+        )
+        with self.lmdb_env.begin() as txn:
+            self.n_samples = txn.stat()["entries"]
+
+        # Representation model
+        checkpoint_path: str = args.rep_learning_checkpoint
+        self.rep_learning_dim: int = args.rep_learning_dim
+        (
+            rep_learning_args,
+            rep_model,
+            rep_loss_fn,
+            rep_lang_model,
+            rep_out_dim,
+        ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE")
+        self.rep_model: torch.nn.Module = rep_model.to("cpu")
+        self.rep_model.train(False)
+        # RNN autoencoder
+        checkpoint_path: str = args.autoencoder_checkpoint
+
+        # RNNAutoencoder_args, RNNAutoencoder_model, RNNAutoencoder_fn,\
+        #     RNNAutoencoder_model, RNNAutoencoder_dim = utils.train_utils.load_checkpoint_and_model(
+        #     checkpoint_path, device)
+
+        (
+            args_rnn,
+            rnn,
+            loss_fn,
+            lang_model,
+            out_dim,
+        ) = utils.train_utils.load_checkpoint_and_model(
+            checkpoint_path, device, "autoencoder"
+        )
+        self.RNNAutoencoder_model: torch.nn.Module = rnn.to("cpu")
+        self.RNNAutoencoder_model.train(False)
+
+    def __len__(self) -> int:
+        """Get the size of the dataset.
+
+        Returns:
+            The number of samples in the dataset.
+ """ + return self.n_samples + + def __getitem__( + self, idx: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict, torch.Tensor]: + """TODO""" + with self.lmdb_env.begin(write=False) as txn: + key = "{:010}".format(idx).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + word_seq, pose_seq, audio, aux_info, portion = sample + + def words_to_tensor( + lang: Vocab | None, words: list[list], end_time: float | None = None + ): + indexes = [lang.SOS_token] + for word in words: + if end_time is not None and word[1] > end_time: + break + indexes.append(lang.get_word_index(word[0])) + indexes.append(lang.EOS_token) + return torch.Tensor(indexes).long() + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq = (pose_seq - self.data_mean) / std + + # Todo: Here we should apply rep_learning + target = torch.from_numpy(pose_seq) + target = target.float() + with torch.no_grad(): + encoded_poses = self.rep_model.encoder(target) + + encoded_poses = encoded_poses.reshape((encoded_poses.shape[0], -1)).float() + + if self.use_derivative: + diff = [ + (encoded_poses[n, :] - encoded_poses[n - 1, :]) + for n in range(1, encoded_poses.shape[0]) + ] + diff.insert(0, torch.zeros_like(encoded_poses[0, :])) + encoded_poses = torch.hstack((encoded_poses, torch.stack(diff))) + with torch.no_grad(): + out_pose, latent, mue, logvar = self.RNNAutoencoder_model( + encoded_poses.unsqueeze(0), encoded_poses.unsqueeze(0) + ) + latent = latent[: self.RNNAutoencoder_model.decoder.n_layers] + latent = latent.squeeze(1) + latent = latent.reshape((1, -1)).float() + # latent = latent.squeeze(0) + cluster_id = self.kmeanmodel.predict(latent.cpu().detach().numpy()) + cluster_id = torch.Tensor(cluster_id).long() + # to tensors + word_seq_tensor = words_to_tensor( + self.lang_model, word_seq, aux_info["end_time"] + ) + pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() + + # audio = torch.from_numpy(audio).float() + + # return cluster_id + return word_seq_tensor, encoded_poses, audio, aux_info, cluster_id + + def set_lang_model(self, lang_model: Vocab) -> None: + """Set the word vector representation to be used with the dataset. + + Modifies the internal state of the object at the attribute 'lang_model'. + + Args: + lang_model: A pre-trained word vector representation contained in the 'Vocab' class. + """ + self.lang_model = lang_model + + +class TrinityDataset_sentencelevel(Dataset): + """Contains information and parameters of a (Trinity) dataset. + + This is a PyTorch Dataset subclass containing information of a Trinity dataset. + https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ + + This class is #TODO. + + Attributes: + lmdb_dir: A string filepath of the directory containing the actual dataset. + lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. + n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + n_samples: An integer number of clips/entries in the original dataset. + lang_model: A 'Vocab' pre-trained word vector representation or None. + data_mean: A mean calculated from each video in the original dataset. 
+ data_std: A standard deviation calculcated from each video. + args: #TODO + kmeanmodel: #TODO + sentence_level: #TODO + RNNAutoencoder_model: #TODO + use_derivative: #TODO + rep_learning_dim: An integer dimension of the model (unused). + rep_model: A DAE neural net model loaded from the above checkpoint. + Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. + """ + + def __init__( + self, + args: argparse.Namespace, + lmdb_dir: str, + n_poses: int, + subdivision_stride: int, + pose_resampling_fps: int, + data_mean: list[float], + data_std: list[float], + ): + """ """ + self.lmdb_dir = lmdb_dir + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.subdivision_stride_sentence = args.subdivision_stride_sentence + self.args = args + self.skeleton_resampling_fps = pose_resampling_fps + self.lang_model = None + self.data_mean = np.array(data_mean).squeeze() + self.data_std = np.array(data_std).squeeze() + self.use_derivative = args.use_derivative == "True" + + # Todo: fix adress -- Fixed + test = os.path.dirname(args.autoencoder_checkpoint) + self.kmeanmodel = pickle.load( + open( + os.path.dirname(args.autoencoder_checkpoint) + + "/clusters/kmeans_model.pk", + "rb", + ) + ) + # self.clustering_transforms = pickle.load(open(os.path.dirname(args.autoencoder_checkpoint) + # + '/clusters/transforms.pkl', 'rb')) + + if args.sentence_level == "True": + self.sentence_level = True + else: + self.sentence_level = False + + logging.info("Reading data '{}'...".format(lmdb_dir)) + + if self.sentence_level: + # preloaded_dir = lmdb_dir + '_sentence_level' + '_cache' + preloaded_dir = ( + args.model_save_path + + "lmdb/" + + os.path.basename((lmdb_dir)) + + "_sentence_level" + + "_cache" + ) + else: + # preloaded_dir = lmdb_dir + '_cache' + preloaded_dir = ( + args.model_save_path + "lmdb/" + os.path.basename((lmdb_dir)) + "_cache" + ) + + if not os.path.exists(args.model_save_path + "lmdb"): + os.mkdir(args.model_save_path + "lmdb") + if not os.path.exists(preloaded_dir): + data_sampler = DataPreprocessor( + args, + lmdb_dir, + preloaded_dir, + n_poses, + self.subdivision_stride_sentence, + pose_resampling_fps, + sentence_level=self.sentence_level, + ) + data_sampler.run() + else: + # Todo: Remove later + + # data_sampler = DataPreprocessor(args, lmdb_dir, preloaded_dir, n_poses, + # subdivision_stride, pose_resampling_fps, + # sentence_level=self.sentence_level) + # data_sampler.run() + logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) + + # init lmdb + self.lmdb_env: lmdb.Environment = lmdb.open( + preloaded_dir, readonly=True, lock=False + ) + with self.lmdb_env.begin() as txn: + self.n_samples = txn.stat()["entries"] + + # Representation model + checkpoint_path: str = args.rep_learning_checkpoint + self.rep_learning_dim: int = args.rep_learning_dim + ( + rep_learning_args, + rep_model, + rep_loss_fn, + rep_lang_model, + rep_out_dim, + ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") + self.rep_model = rep_model.to("cpu") + self.rep_model.train(False) + # RNN autoencoder + checkpoint_path: str = args.autoencoder_checkpoint + + # RNNAutoencoder_args, RNNAutoencoder_model, RNNAutoencoder_fn,\ + # RNNAutoencoder_model, RNNAutoencoder_dim = utils.train_utils.load_checkpoint_and_model( + # checkpoint_path, device) + + ( + args_rnn, + rnn, + loss_fn, + lang_model, + out_dim, + ) = utils.train_utils.load_checkpoint_and_model( + checkpoint_path, device, "autoencoder_vq" + ) + self.RNNAutoencoder_model: torch.nn.Module = 
rnn.to("cpu") + self.RNNAutoencoder_model.train(False) + + def __len__(self) -> int: + """Get the size of the dataset. + + Returns: + The number of samples in the dataset. + """ + return self.n_samples + + def __getitem__( + self, idx: int + ) -> Tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict, + torch.Tensor, + torch.Tensor, + torch.Tensor, + ]: + """TODO""" + with self.lmdb_env.begin(write=False) as txn: + key = "{:010}".format(idx).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + ( + word_seq, + pose_seq, + audio_raws, + audio_mels, + aux_info, + sentence_leve_latents, + GP3_Embedding, + ) = sample + + def words_to_tensor( + lang: Vocab | None, words: list[list], end_time: float | None = None + ) -> torch.Tensor: + # indexes = [lang.SOS_token] + indexes = [] + if len(words) <= 0: + print() + for word in words: + if end_time is not None and word[1] > end_time: + break + indexes.append(lang.get_word_index(word[0])) + # indexes.append(lang.EOS_token) + return torch.Tensor(indexes).long() + + def poseIndex_to_tensor(sop, eop, poses_indicies): + indexes = [sop] + for pose_index in poses_indicies: + indexes.append(pose_index) + indexes.append(eop) + return torch.Tensor(indexes).long() + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq = (pose_seq - self.data_mean) / std + + # to tensors + word_seq_tensor = words_to_tensor( + self.lang_model, word_seq, aux_info["end_time"] + ) + pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() + + sentence_leve_latents = torch.from_numpy(sentence_leve_latents) + sentence_leve_latents = sentence_leve_latents.reshape( + [sentence_leve_latents.shape[0], -1] + ).float() + + # Todo: we may need do some preprocessing. (e.g. padding, resampling, etc.) + # Todo: Move list-->ndarray to the preprocessing. + audio_raw_for_now = False + if audio_raw_for_now: + audio = audio_raws + else: + audio = audio_mels + audio = np.array(audio) + audio = torch.from_numpy(audio).float() + # audio = torch.reshape(audio, (sentence_leve_latents.shape[0], -1)) + # audio = torch.reshape(audio, (4, -1)) + + # return cluster_id + cluster_ids = np.zeros(sentence_leve_latents.shape[0]) + if self.RNNAutoencoder_model.vq == True: + ( + loss_vq, + quantized, + perplexity_vq, + encodings, + ) = self.RNNAutoencoder_model.vq_layer(sentence_leve_latents) + cluster_ids = torch.argmax(encodings, dim=1) + + # q = poseIndex_to_tensor(sop=512, eop=513, + # poses_indicies=cluster_ids.cpu().detach().numpy()) + # cluster_ids = q + # print(cluster_ids) + else: + cluster_ids = self.kmeanmodel.predict( + # self.clustering_transforms['scalar'].transform + (sentence_leve_latents.cpu().detach().numpy()) + ) + cluster_ids = torch.from_numpy(cluster_ids).long() + # cluster_id = torch.Tensor(cluster_id).long() + # print(cluster_ids) + + # clusters_portion = self.clustering_transforms['portion'][cluster_ids] + # clusters_portion = torch.from_numpy(clusters_portion) + + # print((cluster_ids)) + try: + GP3_Embedding = torch.from_numpy(GP3_Embedding).float() + except: + pass + + return ( + word_seq_tensor, + pose_seq, + audio, + aux_info, + sentence_leve_latents, + cluster_ids, + GP3_Embedding, + ) + + def set_lang_model(self, lang_model: Vocab) -> None: + """Set the word vector representation to be used with the dataset. + + Modifies the internal state of the object at the attribute 'lang_model'. + + Args: + lang_model: A pre-trained word vector representation contained in the 'Vocab' class. 
+ """ + self.lang_model = lang_model
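+
+
+# ---------------------------------------------------------------------------
+# Minimal usage sketch (illustrative only).  The lmdb path, normalization
+# statistics, checkpoint paths and the fields on `example_args` below are
+# placeholder assumptions, not values shipped with this repository; a real
+# run needs the preprocessed lmdb data, a trained Vocab and the referenced
+# checkpoints.  Note: `word_seq_collate_fn` currently hard-codes
+# `sentence_leve = True`, which matches the 7-item samples of
+# `TrinityDataset_sentencelevel`; for the 4-item samples of `TrinityDataset`
+# the flag must be switched to False.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    from torch.utils.data import DataLoader
+
+    example_args = argparse.Namespace(
+        name="DAE",                                               # placeholder
+        rep_learning_checkpoint="<path/to/DAE_checkpoint.bin>",   # placeholder
+        autoencoder_checkpoint="<path/to/VQVAE_checkpoint.bin>",  # placeholder
+        sentence_frame_length=120,                                # placeholder
+    )
+
+    train_dataset = TrinityDataset(
+        example_args,
+        lmdb_dir="<path/to/lmdb_train>",
+        n_poses=30,
+        subdivision_stride=10,
+        pose_resampling_fps=20,
+        data_mean=[0.0],
+        data_std=[1.0],
+    )
+
+    trained_vocab: Vocab = ...  # placeholder: the Vocab built during preprocessing
+    train_dataset.set_lang_model(trained_vocab)
+
+    loader = DataLoader(
+        train_dataset,
+        batch_size=32,
+        shuffle=True,
+        collate_fn=word_seq_collate_fn,  # with sentence_leve set to False
+    )
+    for word_seq, words_lengths, poses, audio, aux_info in loader:
+        print(word_seq.shape, poses.shape, audio.shape)
+        break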