From c88aa4fe19836a142eee4c5f2190c6391c011db5 Mon Sep 17 00:00:00 2001 From: Payam Jome Yazdian Date: Tue, 6 Feb 2024 23:48:07 -0800 Subject: [PATCH] Add files via upload --- scripts/data_loader/data_preprocessor.py | 946 ++++---- scripts/data_loader/gesture_labels.txt | 822 +++---- scripts/data_loader/lmdb_data_loader.py | 2646 +++++++++++----------- 3 files changed, 2207 insertions(+), 2207 deletions(-) diff --git a/scripts/data_loader/data_preprocessor.py b/scripts/data_loader/data_preprocessor.py index 8194eb0..d4d0b89 100644 --- a/scripts/data_loader/data_preprocessor.py +++ b/scripts/data_loader/data_preprocessor.py @@ -1,473 +1,473 @@ -"""create data samples -""" - -import math -import pickle -import os -from typing import Tuple - -import lmdb -import numpy as np -import pyarrow -import torch -import librosa -from tqdm import tqdm -from configargparse import argparse -from model.vocab import Vocab - -import utils - -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - -class DataPreprocessor: - """Loads and extracts skeleton, audio and video data from Lmdb files and writes those entires into a separate new Lmdb file. - - Attributes: - src_lmdb_env: A Lmdb object containing the origin database environment (similar to PostgreSQL schema). - dst_lmdb_env: A Lmdb object containing the destination database environment. - n_videos: An integer number of entries in the database (equal to the number of videos in the training set). - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - audio_sample_length: An integer length of the audio clip in hertz (sampled at 16,000 Hz). - n_out_samples: An integer total number of database entries (audio, video and skeleton) that has been extracted from the original videos. - sentence_frame_length: An integer number of frames in each clip but for sentences rather than gestures. - audio_sampling_rate: An integer sampling rate for an audio signal. - DAE_frame_level: A DAE model only if args.name in the initialization method is not 'DAE'. - rnn_representation: A VQVAE model only if 'sentence_level' is True else None. - ckpt_path_DAE: A string filepath to a saved 'DAE' checkpoint model. - ckpt_path_Autoencode: A string filepath to a saved VQVAE checkpoint model. - """ - def __init__(self, args: argparse.Namespace, clip_lmdb_dir: str, out_lmdb_dir: str, n_poses: int, subdivision_stride: int, pose_resampling_fps: int, sentence_level: bool = False): - """Initialize with several dataset parameters. - - Initializes database connections to the lmdb files. Note that the connections are open until closed by the run method. - - The args argument must contain the following keys: - name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). - rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. - autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. - sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). - - Args: - args: A configargparser object with specific parameters (See above). - clip_lmdb_dir: A string filepath containing the lmdb dataset files. 
- out_lmdb_dir: A string filepath to save output as lmdb files. - n_poses: An integer number of frames per second in each clip in the dataset (ex. 30 in 30 fps). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (may overlap). - pose_resampling_fps: An integer frames per second for training (may differ from clip fps). - sentence_level: A boolean flag to add a language model. - """ - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.sentence_level = sentence_level - self.src_lmdb_env: lmdb.Environment = lmdb.open(clip_lmdb_dir, readonly=True, lock=False) - self.out_lmdb_dir = out_lmdb_dir - with self.src_lmdb_env.begin() as txn: - self.n_videos: int = txn.stat()['entries'] - - self.audio_sample_length = int(self.n_poses / self.skeleton_resampling_fps * 16000) - - self.ckpt_path_DAE: str = args.rep_learning_checkpoint - self.ckpt_path_Autoencode: str = args.autoencoder_checkpoint - - if args.name != "DAE": - self.DAE_frame_level: Tuple[argparse.Namespace, torch.nn.Module, torch.nn.MSELoss, Vocab, int] = utils.train_utils.load_checkpoint_and_model( - self.ckpt_path_DAE, device,'DAE') - - if self.sentence_level: - self.rnn_representation: Tuple[argparse.Namespace, torch.nn.Module, torch.nn.MSELoss, Vocab, int] = utils.train_utils.load_checkpoint_and_model( - self.ckpt_path_Autoencode, device, 'autoencoder_vq') - - # create db for samples - map_size = 1024 * 50 # in MB - map_size <<= 20 # in B - self.dst_lmdb_env: lmdb.Environment = lmdb.open(out_lmdb_dir, map_size=map_size) - self.n_out_samples = 0 - - self.sentence_frame_length = args.sentence_frame_length - self.audio_sampling_rate = 16000 - - - def run(self) -> None: - """Extract skeleton, audio, word data from source and write entries into a destination Lmdb file. - - Closes both src_lmdb_env and dst_lmdb_env database connections upon completion. - Does not return any values. Modifies internal state of the object (Close db connection). - """ - src_txn = self.src_lmdb_env.begin(write=False) - total_count = src_txn.stat()['entries'] - - # sampling and normalization - cursor = src_txn.cursor() - counter = 0 - for key, value in tqdm(cursor): - print("video ", counter, "of", total_count, '\n') - video = pyarrow.deserialize(value) - vid = video['vid'] - clips = video['clips'] - for clip_idx, clip in enumerate(clips): - self._sample_from_clip(vid, clip) - counter = counter + 1 - - # print number of samples - with self.dst_lmdb_env.begin() as txn: - print("Sample_counter", txn.stat()['entries']) - - # close db - self.src_lmdb_env.close() - self.dst_lmdb_env.sync() - self.dst_lmdb_env.close() - - def _sample_from_clip(self, vid: str, clip: dict) -> None: - """Internal function to extract and write skeleton, audio and word data from provided clip. - - Modifies internal state of the object (n_out_samples, n_poses, audio_sample_length). - #TODO - - Args: - vid: A string representing the name or id of the clip. - clip: A dictionary containing the following string keys: - 'poses': A Numpy array of pose/gesture data. - 'audio_raw': A Numpy array of audio data. - 'words': A list of lists. Each internal list contains 3 elements: - index 0: A float start time. - index 1: A float end time. - index 2: A string word. 
- """ - clip_skeleton: np.ndarray = clip['poses'] - clip_audio_raw: np.ndarray = clip['audio_raw'] - clip_word_list: list[list] = clip['words'] - - # divide - aux_info = [] - sample_skeletons_list = [] - sample_words_list = [] - - sample_audio_list_mels = [] - sample_audio_list_raws = [] - - sentence_leve_latents_list = [] - GPT_3_STR_list = [] - GPT_3_Embedding_list = [] - - if self.sentence_level: - self.n_poses = self.sentence_frame_length - self.audio_sample_length = int(self.n_poses / self.skeleton_resampling_fps * self.audio_sampling_rate) - - num_subdivision = math.floor( - (len(clip_skeleton) - self.n_poses) - / self.subdivision_stride) + 1 # floor((K - (N+M)) / S) + 1 - - # Sentence level preparation: - - ''' - self.sentence_level = False - if self.sentence_level: - print("Sentence level") - - str_transcript = "" - for i_t in range(len(clip_word_list)): - str_transcript = str_transcript + " " + clip_word_list[i_t][0] - - import nltk # library - # nltk.download() - sentences = nltk.sent_tokenize(str_transcript) # whole paragraph break into sentence. - - start_sentence_word_index = 0 - end_sentence_word_index = 0 - for sentence in sentences: - x = sentence.replace('.', '').strip().split(' ') - - end_sentence_word_index = start_sentence_word_index + len(sentence.replace('.', '').strip().split(' ')) - 1 # to index - # process - - reconstructed_sentence = clip_word_list[start_sentence_word_index:end_sentence_word_index+1] - - start_idx = int( clip_word_list[start_sentence_word_index][1] * self.skeleton_resampling_fps ) - try: - fin_idx = int( clip_word_list[end_sentence_word_index][2] * self.skeleton_resampling_fps ) - except: - - print() - # ........................ - sample_skeletons = clip_skeleton[start_idx:fin_idx] - subdivision_start_time = start_idx / self.skeleton_resampling_fps - subdivision_end_time = fin_idx / self.skeleton_resampling_fps - sample_words = self.get_words_in_time_range(word_list=clip_word_list, - start_time=subdivision_start_time, - end_time=subdivision_end_time) - if len(sample_words) < 2: - continue - - # raw audio - audio_start = math.floor(start_idx / len(clip_skeleton) * len(clip_audio_raw)) - audio_end = audio_start + self.audio_sample_length - sample_audio = clip_audio_raw[audio_start:audio_end] - - motion_info = {'vid': vid, - 'start_frame_no': start_idx, - 'end_frame_no': fin_idx, - 'start_time': subdivision_start_time, - 'end_time': subdivision_end_time} - - sample_skeletons_list.append(sample_skeletons) - sample_words_list.append(sample_words) - sample_audio_list.append(sample_audio) - aux_info.append(motion_info) - - - - start_sentence_word_index = end_sentence_word_index + 1 - print() - ''' - # Loading cached GP3 requestes - address = self.out_lmdb_dir + '_' + vid + '.gpt' - if os.path.exists(self.out_lmdb_dir + '_' + vid + '.gpt'): - loaded_gpt3 = pickle.load(open(address, 'rb')) - else: - loaded_gpt3 = None - - for i in tqdm(range(num_subdivision)): - - - - start_idx = i * self.subdivision_stride - fin_idx = start_idx + self.n_poses - - sample_skeletons = clip_skeleton[start_idx:fin_idx] - subdivision_start_time = start_idx / self.skeleton_resampling_fps - subdivision_end_time = fin_idx / self.skeleton_resampling_fps - sample_words = self.get_words_in_time_range(word_list=clip_word_list, - start_time=subdivision_start_time, - end_time=subdivision_end_time) - - - if len(sample_words) < 4: - continue - - # raw audio - audio_start = math.floor(start_idx / len(clip_skeleton) * len(clip_audio_raw)) - audio_end = audio_start + self.audio_sample_length 
- sample_audio = clip_audio_raw[audio_start:audio_end] - - mel_chunks = [] - raw_chunks = [] - for audio_sub in range(self.audio_sample_length//self.audio_sampling_rate): - audio_chunk = sample_audio[audio_sub*self.audio_sampling_rate: (audio_sub+1)*self.audio_sampling_rate] - signal = librosa.feature.melspectrogram(y=audio_chunk, sr=self.audio_sampling_rate) - signal = librosa.power_to_db(signal, ref=np.max) - mel_chunks.append(signal) - # raw_chunks.append(audio_chunk) - raw_chunks.append(0) - # signal = librosa.amplitude_to_db(signal) - - - - - - - - motion_info = {'vid': vid, - 'start_frame_no': start_idx, - 'end_frame_no': fin_idx, - 'start_time': subdivision_start_time, - 'end_time': subdivision_end_time} - - sample_skeletons_list.append(sample_skeletons) - sample_words_list.append(sample_words) - - sample_audio_list_mels.append(mel_chunks) - sample_audio_list_raws.append(raw_chunks) - - aux_info.append(motion_info) - if self.sentence_level: - - # For the input side: - # GPT-3 Embeddomg - str_input = "" - for word in sample_words: - str_input += ' ' + word[0] - - GPT_3_features = self.GPT_3_caller(str_input, loaded_gpt3) - GPT_3_Embedding_list.append(GPT_3_features) - GPT_3_STR_list.append(str_input) - # Discretization for the output side: - sentence_leve_latents_list.append(self.get_pose_latent(sample_skeletons)) - - # if i>100: - # break - - - - object_to_save = {'sample_words_list':GPT_3_STR_list, - 'GPT_3_Embedding_list': GPT_3_Embedding_list} - pickle.dump(object_to_save, open(self.out_lmdb_dir + '_' + vid + '.gpt','wb')) - print("Embedding Saved!") - if len(sample_skeletons_list) > 0: - with self.dst_lmdb_env.begin(write=True) as txn: - if self.sentence_level: - for words, poses, audio_raws, audio_mels, aux, sentence_leve_latents , GPT_3_Embedding in \ - zip(sample_words_list, sample_skeletons_list, - sample_audio_list_raws, sample_audio_list_mels, - aux_info, sentence_leve_latents_list, GPT_3_Embedding_list): - poses = np.asarray(poses) - GPT_3_Embedding = np.array(GPT_3_Embedding) - # save - k = '{:010}'.format(self.n_out_samples).encode('ascii') - v = [words, poses, audio_raws, audio_mels, aux, sentence_leve_latents, GPT_3_Embedding] - v = pyarrow.serialize(v).to_buffer() - txn.put(k, v) - self.n_out_samples += 1 - else: - for words, poses, audio, aux in zip(sample_words_list, sample_skeletons_list, - sample_audio_list, aux_info): - poses = np.asarray(poses) - - # save - k = '{:010}'.format(self.n_out_samples).encode('ascii') - v = [words, poses, audio, aux] - v = pyarrow.serialize(v).to_buffer() - txn.put(k, v) - self.n_out_samples += 1 - - @staticmethod - def get_words_in_time_range(word_list: list[list], start_time: float, end_time: float) -> list[list]: - """Retrieves words in the list that fall between the start_time and end_time provided. - - Args: - word_list: A list that each element contains a list with three elements: - index 0: A float start time. - index 1: A float end time. - index 2: A string word. - start_time: A float indicating when to start filtering. - end_time: A float indicating when to end filtering. - - Returns: - A list containing all elements in the word_list that fall between the start_time and end_time provided. - """ - words = [] - - for word in word_list: - _, word_s, word_e = word[0], word[1], word[2] - - if word_s >= end_time: - break - - if word_e <= start_time: - continue - - words.append(word) - - return words - - - def get_pose_latent(self, poses: np.ndarray) -> np.ndarray: - """TODO - """ - - # 1. 
Load models - args, DAE, loss_fn, lang_model, out_dim = self.DAE_frame_level - args, rnn, loss_fn, lang_model, out_dim = self.rnn_representation - DAE.train(False) - rnn.train(False) - - - # poses, poses_mirror = process_bvh(bvh_file) - - mean = np.array(args.data_mean).squeeze() - std = np.array(args.data_std).squeeze() - std = np.clip(std, a_min=0.01, a_max=None) - out_poses = (np.copy(poses) - mean) / std - - target = torch.from_numpy(out_poses) - # target = torch.unsqueeze(target,2) - target = target.to(device).float() - reconstructed = [] - # for i in range(len(out_poses)): - # input = torch.unsqueeze(target[i],0) - # current_out = pose_decoder(input) - # reconstructed.append(current_out) - - if DAE.encoder == None: - encoded = target - else: - encoded = DAE.encoder(target) - # encoded = torch.squeeze(encoded, 2) - # encoded = encoded.to('cpu') - # encoded = encoded.detach().numpy() - # all_frames_from_rnn = None - - result = np.zeros((len(encoded)//args.n_poses, rnn.decoder.n_layers, args.hidden_size)) - result_index = 0 - for i in range(0, len(encoded), args.n_poses): - current_dict = dict() - input_seq = encoded[i:i + args.n_poses] - - current_dict['original'] = poses[i: i + args.n_poses] - current_dict['latent_linear'] = encoded[i: i + args.n_poses].detach().clone().to('cpu').numpy() - - input_pre_seq = encoded[i] - output_seq = encoded[i:i + args.n_poses] - - input_seq = torch.unsqueeze(input_seq, 0) - input_seq = input_seq.transpose(0, 1) - use_drivitive = args.use_derivitive=='True' - if use_drivitive: - diff = [(input_seq[n, :] - input_seq[n - 1, :]) for n in range(1, input_seq.shape[0])] - diff.insert(0, torch.zeros_like(input_seq[0, :])) - input_seq = torch.cat((input_seq, torch.stack(diff)), dim=2) - - # output_seq = torch.unsqueeze(output_seq, 0) - # output_seq = output_seq.transpose(0, 1) - output_seq = input_seq.clone() - - - - - reconstructed_rnn = torch.zeros(args.n_poses, output_seq.size(1), rnn.decoder.output_size) \ - .to(output_seq.device) - # run words through encoder - encoder_outputs, encoder_hidden = rnn.encoder(input_seq, None) - - if rnn.VAE: - decoder_hidden = encoder_hidden[:rnn.decoder.n_layers] # use last hidden state from encoder - # [2, 128, 200] - # print("decoder_hidden!!! org", decoder_hidden.shape) - decoder_hidden = decoder_hidden.transpose(1, 0).contiguous() # [128, 2, 200] - decoder_hidden = torch.reshape(decoder_hidden, (decoder_hidden.shape[0], -1)) - mean = rnn.VAE_fc_mean(decoder_hidden) - logvar = rnn.VAE_fc_std(decoder_hidden) - z = rnn.reparameterize(mean, logvar, train=False) - z = rnn.VAE_fc_decoder(z) - decoder_hidden = z.reshape(decoder_hidden.shape[0], - rnn.decoder.n_layers, -1) - decoder_hidden = decoder_hidden.transpose(1, 0).contiguous() - # print("decoder_hidden!!! modified", decoder_hidden.shape) - decoder_first_hidden = decoder_hidden - else: - decoder_hidden = encoder_hidden[:rnn.decoder.n_layers] # use last hidden state from encoder - # print("decoder_hidden!!! 
not VAE ", decoder_hidden.shape) - - current_dict['latent_rnn'] = torch.squeeze(decoder_hidden.detach().clone(), 1).to('cpu').numpy() - result[result_index] = torch.squeeze(decoder_hidden.detach().clone(), 1).to('cpu').numpy() - result_index = result_index + 1 - - return result - - def GPT_3_caller(self, str_input, pickle_file_loaded): - - return 1 - - if pickle_file_loaded != None: - for i in range(len(pickle_file_loaded['sample_words_list'])): - if str_input == pickle_file_loaded['sample_words_list'][i]: - print("Embedding found:", str_input) - return pickle_file_loaded['GPT_3_Embedding_list'][i] - - request = openai.Embedding.create(input=str_input, engine="text-similarity-ada-001") - embedding = request['data'][0]['embedding'] - print("!!!!!!!!!!!!!!!!!!!Embedding requested:", str_input) - return embedding - +"""create data samples +""" + +import math +import pickle +import os +from typing import Tuple + +import lmdb +import numpy as np +import pyarrow +import torch +import librosa +from tqdm import tqdm +from configargparse import argparse +from model.vocab import Vocab + +import utils + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +class DataPreprocessor: + """Loads and extracts skeleton, audio and video data from Lmdb files and writes those entires into a separate new Lmdb file. + + Attributes: + src_lmdb_env: A Lmdb object containing the origin database environment (similar to PostgreSQL schema). + dst_lmdb_env: A Lmdb object containing the destination database environment. + n_videos: An integer number of entries in the database (equal to the number of videos in the training set). + n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + audio_sample_length: An integer length of the audio clip in hertz (sampled at 16,000 Hz). + n_out_samples: An integer total number of database entries (audio, video and skeleton) that has been extracted from the original videos. + sentence_frame_length: An integer number of frames in each clip but for sentences rather than gestures. + audio_sampling_rate: An integer sampling rate for an audio signal. + DAE_frame_level: A DAE model only if args.name in the initialization method is not 'DAE'. + rnn_representation: A VQVAE model only if 'sentence_level' is True else None. + ckpt_path_DAE: A string filepath to a saved 'DAE' checkpoint model. + ckpt_path_Autoencode: A string filepath to a saved VQVAE checkpoint model. + """ + def __init__(self, args: argparse.Namespace, clip_lmdb_dir: str, out_lmdb_dir: str, n_poses: int, subdivision_stride: int, pose_resampling_fps: int, sentence_level: bool = False): + """Initialize with several dataset parameters. + + Initializes database connections to the lmdb files. Note that the connections are open until closed by the run method. + + The args argument must contain the following keys: + name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). + rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. + autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. + sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). 
+ + Args: + args: A configargparser object with specific parameters (See above). + clip_lmdb_dir: A string filepath containing the lmdb dataset files. + out_lmdb_dir: A string filepath to save output as lmdb files. + n_poses: An integer number of frames per second in each clip in the dataset (ex. 30 in 30 fps). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (may overlap). + pose_resampling_fps: An integer frames per second for training (may differ from clip fps). + sentence_level: A boolean flag to add a language model. + """ + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.skeleton_resampling_fps = pose_resampling_fps + self.sentence_level = sentence_level + self.src_lmdb_env: lmdb.Environment = lmdb.open(clip_lmdb_dir, readonly=True, lock=False) + self.out_lmdb_dir = out_lmdb_dir + with self.src_lmdb_env.begin() as txn: + self.n_videos: int = txn.stat()['entries'] + + self.audio_sample_length = int(self.n_poses / self.skeleton_resampling_fps * 16000) + + self.ckpt_path_DAE: str = args.rep_learning_checkpoint + self.ckpt_path_Autoencode: str = args.autoencoder_checkpoint + + if args.name != "Frame_Level": + self.DAE_frame_level: Tuple[argparse.Namespace, torch.nn.Module, torch.nn.MSELoss, Vocab, int] = utils.train_utils.load_checkpoint_and_model( + self.ckpt_path_DAE, device,'DAE') + + if self.sentence_level: + self.rnn_representation: Tuple[argparse.Namespace, torch.nn.Module, torch.nn.MSELoss, Vocab, int] = utils.train_utils.load_checkpoint_and_model( + self.ckpt_path_Autoencode, device, 'autoencoder_vq') + + # create db for samples + map_size = 1024 * 50 # in MB + map_size <<= 20 # in B + self.dst_lmdb_env: lmdb.Environment = lmdb.open(out_lmdb_dir, map_size=map_size) + self.n_out_samples = 0 + + self.sentence_frame_length = args.sentence_frame_length + self.audio_sampling_rate = 16000 + + + def run(self) -> None: + """Extract skeleton, audio, word data from source and write entries into a destination Lmdb file. + + Closes both src_lmdb_env and dst_lmdb_env database connections upon completion. + Does not return any values. Modifies internal state of the object (Close db connection). + """ + src_txn = self.src_lmdb_env.begin(write=False) + total_count = src_txn.stat()['entries'] + + # sampling and normalization + cursor = src_txn.cursor() + counter = 0 + for key, value in tqdm(cursor): + print("video ", counter, "of", total_count, '\n') + video = pyarrow.deserialize(value) + vid = video['vid'] + clips = video['clips'] + for clip_idx, clip in enumerate(clips): + self._sample_from_clip(vid, clip) + counter = counter + 1 + + # print number of samples + with self.dst_lmdb_env.begin() as txn: + print("Sample_counter", txn.stat()['entries']) + + # close db + self.src_lmdb_env.close() + self.dst_lmdb_env.sync() + self.dst_lmdb_env.close() + + def _sample_from_clip(self, vid: str, clip: dict) -> None: + """Internal function to extract and write skeleton, audio and word data from provided clip. + + Modifies internal state of the object (n_out_samples, n_poses, audio_sample_length). + #TODO + + Args: + vid: A string representing the name or id of the clip. + clip: A dictionary containing the following string keys: + 'poses': A Numpy array of pose/gesture data. + 'audio_raw': A Numpy array of audio data. + 'words': A list of lists. Each internal list contains 3 elements: + index 0: A float start time. + index 1: A float end time. + index 2: A string word. 
+ """ + clip_skeleton: np.ndarray = clip['poses'] + clip_audio_raw: np.ndarray = clip['audio_raw'] + clip_word_list: list[list] = clip['words'] + + # divide + aux_info = [] + sample_skeletons_list = [] + sample_words_list = [] + + sample_audio_list_mels = [] + sample_audio_list_raws = [] + + sentence_leve_latents_list = [] + GPT_3_STR_list = [] + GPT_3_Embedding_list = [] + + if self.sentence_level: + self.n_poses = self.sentence_frame_length + self.audio_sample_length = int(self.n_poses / self.skeleton_resampling_fps * self.audio_sampling_rate) + + num_subdivision = math.floor( + (len(clip_skeleton) - self.n_poses) + / self.subdivision_stride) + 1 # floor((K - (N+M)) / S) + 1 + + # Sentence level preparation: + + ''' + self.sentence_level = False + if self.sentence_level: + print("Sentence level") + + str_transcript = "" + for i_t in range(len(clip_word_list)): + str_transcript = str_transcript + " " + clip_word_list[i_t][0] + + import nltk # library + # nltk.download() + sentences = nltk.sent_tokenize(str_transcript) # whole paragraph break into sentence. + + start_sentence_word_index = 0 + end_sentence_word_index = 0 + for sentence in sentences: + x = sentence.replace('.', '').strip().split(' ') + + end_sentence_word_index = start_sentence_word_index + len(sentence.replace('.', '').strip().split(' ')) - 1 # to index + # process + + reconstructed_sentence = clip_word_list[start_sentence_word_index:end_sentence_word_index+1] + + start_idx = int( clip_word_list[start_sentence_word_index][1] * self.skeleton_resampling_fps ) + try: + fin_idx = int( clip_word_list[end_sentence_word_index][2] * self.skeleton_resampling_fps ) + except: + + print() + # ........................ + sample_skeletons = clip_skeleton[start_idx:fin_idx] + subdivision_start_time = start_idx / self.skeleton_resampling_fps + subdivision_end_time = fin_idx / self.skeleton_resampling_fps + sample_words = self.get_words_in_time_range(word_list=clip_word_list, + start_time=subdivision_start_time, + end_time=subdivision_end_time) + if len(sample_words) < 2: + continue + + # raw audio + audio_start = math.floor(start_idx / len(clip_skeleton) * len(clip_audio_raw)) + audio_end = audio_start + self.audio_sample_length + sample_audio = clip_audio_raw[audio_start:audio_end] + + motion_info = {'vid': vid, + 'start_frame_no': start_idx, + 'end_frame_no': fin_idx, + 'start_time': subdivision_start_time, + 'end_time': subdivision_end_time} + + sample_skeletons_list.append(sample_skeletons) + sample_words_list.append(sample_words) + sample_audio_list.append(sample_audio) + aux_info.append(motion_info) + + + + start_sentence_word_index = end_sentence_word_index + 1 + print() + ''' + # Loading cached GP3 requestes + address = self.out_lmdb_dir + '_' + vid + '.gpt' + if os.path.exists(self.out_lmdb_dir + '_' + vid + '.gpt'): + loaded_gpt3 = pickle.load(open(address, 'rb')) + else: + loaded_gpt3 = None + + for i in tqdm(range(num_subdivision)): + + + + start_idx = i * self.subdivision_stride + fin_idx = start_idx + self.n_poses + + sample_skeletons = clip_skeleton[start_idx:fin_idx] + subdivision_start_time = start_idx / self.skeleton_resampling_fps + subdivision_end_time = fin_idx / self.skeleton_resampling_fps + sample_words = self.get_words_in_time_range(word_list=clip_word_list, + start_time=subdivision_start_time, + end_time=subdivision_end_time) + + + if len(sample_words) < 4: + continue + + # raw audio + audio_start = math.floor(start_idx / len(clip_skeleton) * len(clip_audio_raw)) + audio_end = audio_start + self.audio_sample_length 
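+            # Map the pose-frame window [start_idx, fin_idx) onto the raw waveform
+            # proportionally, then take audio_sample_length samples from that offset.
+            # The slice is split below into 1-second chunks (audio_sampling_rate samples
+            # each) and converted to log-scaled mel spectrograms; raw chunks are stubbed
+            # with 0 to keep the stored samples small.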
+ sample_audio = clip_audio_raw[audio_start:audio_end] + + mel_chunks = [] + raw_chunks = [] + for audio_sub in range(self.audio_sample_length//self.audio_sampling_rate): + audio_chunk = sample_audio[audio_sub*self.audio_sampling_rate: (audio_sub+1)*self.audio_sampling_rate] + signal = librosa.feature.melspectrogram(y=audio_chunk, sr=self.audio_sampling_rate) + signal = librosa.power_to_db(signal, ref=np.max) + mel_chunks.append(signal) + # raw_chunks.append(audio_chunk) + raw_chunks.append(0) + # signal = librosa.amplitude_to_db(signal) + + + + + + + + motion_info = {'vid': vid, + 'start_frame_no': start_idx, + 'end_frame_no': fin_idx, + 'start_time': subdivision_start_time, + 'end_time': subdivision_end_time} + + sample_skeletons_list.append(sample_skeletons) + sample_words_list.append(sample_words) + + sample_audio_list_mels.append(mel_chunks) + sample_audio_list_raws.append(raw_chunks) + + aux_info.append(motion_info) + if self.sentence_level: + + # For the input side: + # GPT-3 Embeddomg + str_input = "" + for word in sample_words: + str_input += ' ' + word[0] + + GPT_3_features = self.GPT_3_caller(str_input, loaded_gpt3) + GPT_3_Embedding_list.append(GPT_3_features) + GPT_3_STR_list.append(str_input) + # Discretization for the output side: + sentence_leve_latents_list.append(self.get_pose_latent(sample_skeletons)) + + # if i>100: + # break + + + + object_to_save = {'sample_words_list':GPT_3_STR_list, + 'GPT_3_Embedding_list': GPT_3_Embedding_list} + pickle.dump(object_to_save, open(self.out_lmdb_dir + '_' + vid + '.gpt','wb')) + print("Embedding Saved!") + if len(sample_skeletons_list) > 0: + with self.dst_lmdb_env.begin(write=True) as txn: + if self.sentence_level: + for words, poses, audio_raws, audio_mels, aux, sentence_leve_latents , GPT_3_Embedding in \ + zip(sample_words_list, sample_skeletons_list, + sample_audio_list_raws, sample_audio_list_mels, + aux_info, sentence_leve_latents_list, GPT_3_Embedding_list): + poses = np.asarray(poses) + GPT_3_Embedding = np.array(GPT_3_Embedding) + # save + k = '{:010}'.format(self.n_out_samples).encode('ascii') + v = [words, poses, audio_raws, audio_mels, aux, sentence_leve_latents, GPT_3_Embedding] + v = pyarrow.serialize(v).to_buffer() + txn.put(k, v) + self.n_out_samples += 1 + else: + for words, poses, audio, aux in zip(sample_words_list, sample_skeletons_list, + sample_audio_list_raws, aux_info): + poses = np.asarray(poses) + + # save + k = '{:010}'.format(self.n_out_samples).encode('ascii') + v = [words, poses, audio, aux] + v = pyarrow.serialize(v).to_buffer() + txn.put(k, v) + self.n_out_samples += 1 + + @staticmethod + def get_words_in_time_range(word_list, start_time: float, end_time: float): + """Retrieves words in the list that fall between the start_time and end_time provided. + + Args: + word_list: A list that each element contains a list with three elements: + index 0: A float start time. + index 1: A float end time. + index 2: A string word. + start_time: A float indicating when to start filtering. + end_time: A float indicating when to end filtering. + + Returns: + A list containing all elements in the word_list that fall between the start_time and end_time provided. + """ + words = [] + + for word in word_list: + _, word_s, word_e = word[0], word[1], word[2] + + if word_s >= end_time: + break + + if word_e <= start_time: + continue + + words.append(word) + + return words + + + def get_pose_latent(self, poses: np.ndarray) -> np.ndarray: + """TODO + """ + + # 1. 
Load models + args, DAE, loss_fn, lang_model, out_dim = self.DAE_frame_level + args, rnn, loss_fn, lang_model, out_dim = self.rnn_representation + DAE.train(False) + rnn.train(False) + + + # poses, poses_mirror = process_bvh(bvh_file) + + mean = np.array(args.data_mean).squeeze() + std = np.array(args.data_std).squeeze() + std = np.clip(std, a_min=0.01, a_max=None) + out_poses = (np.copy(poses) - mean) / std + + target = torch.from_numpy(out_poses) + # target = torch.unsqueeze(target,2) + target = target.to(device).float() + reconstructed = [] + # for i in range(len(out_poses)): + # input = torch.unsqueeze(target[i],0) + # current_out = pose_decoder(input) + # reconstructed.append(current_out) + + if DAE.encoder == None: + encoded = target + else: + encoded = DAE.encoder(target) + # encoded = torch.squeeze(encoded, 2) + # encoded = encoded.to('cpu') + # encoded = encoded.detach().numpy() + # all_frames_from_rnn = None + + result = np.zeros((len(encoded)//args.n_poses, rnn.decoder.n_layers, args.hidden_size)) + result_index = 0 + for i in range(0, len(encoded), args.n_poses): + current_dict = dict() + input_seq = encoded[i:i + args.n_poses] + + current_dict['original'] = poses[i: i + args.n_poses] + current_dict['latent_linear'] = encoded[i: i + args.n_poses].detach().clone().to('cpu').numpy() + + input_pre_seq = encoded[i] + output_seq = encoded[i:i + args.n_poses] + + input_seq = torch.unsqueeze(input_seq, 0) + input_seq = input_seq.transpose(0, 1) + use_drivitive = args.use_derivitive=='True' + if use_drivitive: + diff = [(input_seq[n, :] - input_seq[n - 1, :]) for n in range(1, input_seq.shape[0])] + diff.insert(0, torch.zeros_like(input_seq[0, :])) + input_seq = torch.cat((input_seq, torch.stack(diff)), dim=2) + + # output_seq = torch.unsqueeze(output_seq, 0) + # output_seq = output_seq.transpose(0, 1) + output_seq = input_seq.clone() + + + + + reconstructed_rnn = torch.zeros(args.n_poses, output_seq.size(1), rnn.decoder.output_size) \ + .to(output_seq.device) + # run words through encoder + encoder_outputs, encoder_hidden = rnn.encoder(input_seq, None) + + if rnn.VAE: + decoder_hidden = encoder_hidden[:rnn.decoder.n_layers] # use last hidden state from encoder + # [2, 128, 200] + # print("decoder_hidden!!! org", decoder_hidden.shape) + decoder_hidden = decoder_hidden.transpose(1, 0).contiguous() # [128, 2, 200] + decoder_hidden = torch.reshape(decoder_hidden, (decoder_hidden.shape[0], -1)) + mean = rnn.VAE_fc_mean(decoder_hidden) + logvar = rnn.VAE_fc_std(decoder_hidden) + z = rnn.reparameterize(mean, logvar, train=False) + z = rnn.VAE_fc_decoder(z) + decoder_hidden = z.reshape(decoder_hidden.shape[0], + rnn.decoder.n_layers, -1) + decoder_hidden = decoder_hidden.transpose(1, 0).contiguous() + # print("decoder_hidden!!! modified", decoder_hidden.shape) + decoder_first_hidden = decoder_hidden + else: + decoder_hidden = encoder_hidden[:rnn.decoder.n_layers] # use last hidden state from encoder + # print("decoder_hidden!!! 
not VAE ", decoder_hidden.shape) + + current_dict['latent_rnn'] = torch.squeeze(decoder_hidden.detach().clone(), 1).to('cpu').numpy() + result[result_index] = torch.squeeze(decoder_hidden.detach().clone(), 1).to('cpu').numpy() + result_index = result_index + 1 + + return result + + def GPT_3_caller(self, str_input, pickle_file_loaded): + + return 1 + + if pickle_file_loaded != None: + for i in range(len(pickle_file_loaded['sample_words_list'])): + if str_input == pickle_file_loaded['sample_words_list'][i]: + print("Embedding found:", str_input) + return pickle_file_loaded['GPT_3_Embedding_list'][i] + + request = openai.Embedding.create(input=str_input, engine="text-similarity-ada-001") + embedding = request['data'][0]['embedding'] + print("!!!!!!!!!!!!!!!!!!!Embedding requested:", str_input) + return embedding + diff --git a/scripts/data_loader/gesture_labels.txt b/scripts/data_loader/gesture_labels.txt index 016b546..ea700c0 100644 --- a/scripts/data_loader/gesture_labels.txt +++ b/scripts/data_loader/gesture_labels.txt @@ -1,411 +1,411 @@ -payam,8720,1210,2930,neither,24.5149699 -payam,960,6830,7040,right,19.103924 -payam,2400,7930,7770,right,12.2262586 -payam,4850,2520,2830,neither,15.1091679 -payam,740,2820,2750,neither,18.6786855 -payam,1840,1090,7750,left,7.826561 -payam,7290,6890,6760,left,18.6590548 -payam,910,7710,7630,neither,15.9142669 -payam,110,3080,3020,right,19.3148321 -payam,4310,8430,6020,right,18.3946849 -payam,2370,6450,6640,neither,8.0961591 -payam,3760,3670,8090,left,35.6075189 -payam,7290,6890,6760,left,6.1512555 -payam,2330,1610,4290,neither,9.3917529 -payam,4960,1360,6940,neither,17.0703494 -payam,4880,2220,4050,neither,6.66984 -payam,4150,6320,5120,left,11.1385341 -payam,2380,2290,3830,neither,8.175581 -payam,6990,4350,7850,right,24.8897172 -payam,1260,1300,1150,left,8.7496713 -payam,4040,8150,270,neither,6.5148909 -payam,8210,4200,540,neither,9.3981568 -payam,1840,1090,7750,left,3.4069544 -payam,80,1770,5440,neither,6.8139126 -payam,4740,6380,6320,left,15.6245139 -payam,350,1710,820,neither,6.9307515 -payam,7980,380,440,right,6.9549693 -payam,4710,4520,6560,left,15.8368305 -payam,4150,1030,5450,right,5.8812649 -payam,4090,8210,1650,left,9.3473628 -payam,3160,3050,3300,right,15.4507672 -payam,3700,5340,4990,neither,19.4590834 -payam,5790,3250,4560,right,5.8536474 -payam,2820,2730,410,left,8.6303164 -payam,4180,6270,6780,neither,10.6426271 -payam,3450,3010,3200,neither,9.5873883 -payam,1480,2740,2400,neither,7.4411407 -payam,5200,140,2280,neither,10.3224914 -payam,7920,5360,5370,right,12.6515 -payam,8140,6670,6500,neither,8.3602584 -payam,1880,1810,5430,right,23.2110811 -payam,3190,1380,3600,left,13.4339459 -payam,50,4340,5020,neither,21.3263438 -payam,8350,8660,720,neither,5.8660808 -payam,3380,3280,1440,neither,12.1587357 -payam,1990,1450,3180,right,11.0167687 -payam,5220,3470,2030,right,17.4336008 -payam,3030,1430,7950,neither,5.1681943 -payam,2070,2690,1080,right,7.0990855 -payam,5270,760,2620,right,7.8710579 -payam,1500,5160,5060,right,13.7366622 -payam,3720,5320,980,right,3.9487614 -payam,6360,5830,6430,right,16.9997513 -payam,8100,2890,2200,neither,11.3478833 -payam,8190,8240,5770,left,9.2111841 -payam,6660,4820,3920,right,7.2671531 -payam,1380,1480,6320,left,8.1044273 -payam,2920,6550,6590,right,4.2769958 -payam,8030,5080,8270,left,8.6034509 -payam,7140,7510,4590,neither,16.6477046 -payam,2820,6050,8100,left,24.6404202 -payam,8760,7440,5770,right,13.7314397 -payam,1500,3220,670,left,11.3452669 -payam,7760,7550,5210,neither,6.2482816 
-payam,1480,3220,1500,right,12.5323615 -payam,3370,5490,770,neither,7.1390339 -payam,7810,1860,900,right,10.9310926 -payam,1480,60,1000,left,22.48273 -payam,880,370,100,right,6.3977794 -payam,6530,6470,8270,left,23.1602921 -payam,7870,5570,5710,neither,10.2806256 -payam,6440,5560,1420,left,6.4094853 -payam,3080,3090,5170,left,21.2385869 -payam,2360,3630,3810,neither,8.7701795 -payam,490,1260,1220,neither,6.672139 -payam,2290,610,2460,left,6.049926 -payam,3380,3280,1440,neither,20.8177628 -payam,2660,6610,6450,left,42.8382073 -payam,5880,2790,2520,right,9.3886082 -payam,6520,5860,5480,neither,7.0841901 -payam,3640,2700,1130,left,32.19488 -payam,2800,6870,6710,neither,11.8798787 -payam,740,2820,2750,neither,9.5489998 -payam,7590,7770,6800,neither,19.0458094 -payam,4240,2560,270,right,12.5144577 -payam,4170,8630,3110,neither,9.0806503 -payam,4870,6970,6740,neither,16.2745591 -payam,6840,6060,6290,neither,6.9093279 -payam,90,3320,4740,neither,9.6742201 -payam,7140,7510,4590,left,22.7795439 -payam,5010,2290,7770,neither,19.3744291 -payam,5960,6240,8600,right,9.0793041 -payam,1250,6260,7880,right,4.93853 -payam,4270,1320,1350,right,17.2790887 -payam,4670,1250,3220,neither,10.618701 -payam,1480,2740,2400,neither,7.0960476 -payam,4270,1320,1350,right,4.4593416 -payam,7500,6670,450,neither,6.8258252 -payam,2330,3690,3800,neither,9.1139991 -payam,6890,3890,3700,neither,4.0416935 -payam,2160,8690,5990,neither,11.627078 -payam,5230,5350,5200,left,12.7237877 -payam,5410,5190,5000,neither,14.5101507 -payam,8310,8680,3050,left,7.9895167 -payam,1600,400,980,neither,10.7849002 -payam,170,2250,1110,right,8.0488411 -payam,540,2300,1470,left,15.8741313 -payam,6520,1530,2860,neither,7.0883511 -payam,5640,5810,4930,neither,7.4146026 -payam,3220,6620,5720,right,7.7339321 -test,30,3890,1900,right,22.9397707 -test,8530,4240,8100,neither,5.9101173 -test,7420,150,560,neither,137.2592382 -test,2130,5290,6180,neither,10.5838116 -test,1370,1510,1710,neither,6.7253406 -test,2750,1950,3380,right,17.9191102 -test,6460,2490,2850,neither,29.0007036 -test,3030,1430,7950,neither,3.9712144 -test,1390,7990,5990,right,42.9268688 -Payam,7010,7880,1480,neither,29.2939266 -Payam,3670,4380,1690,neither,13.5852305 -payam3,510,3840,2050,neither,19.60391 -payam3,3120,5240,400,neither,7.5961915 -payam3,6610,1980,5910,neither,32.5094664 -payam3,2790,6640,7410,neither,6.8419322 -payam3,2500,2680,520,neither,4.9135896 -payam3,3000,6310,6340,right,15.3105667 -payam3,3430,5560,80,neither,7.3489193 -payam3,5430,6460,570,neither,18.0880914 -payam3,5710,2830,2520,neither,11.3542066 -payam3,8740,1560,590,neither,9.5529691 -payam3,260,5430,4780,neither,8.8140994 -payam3,5380,5730,6680,neither,9.4738672 -payam3,990,190,3210,neither,6.6699878 -payam3,820,1910,4510,left,6.098936 -payam3,610,5890,7960,right,13.1935591 -payam3,4870,4910,6980,left,79.8618806 -payam3,120,1450,1240,left,23.4519851 -payam3,6940,7090,4180,left,7.9929938 -payam3,6360,5830,6430,neither,27.7130248 -payam,6980,8150,440,left,27.8841035 -payam,580,7490,7090,neither,10.9262192 -payam,2340,8460,6140,neither,26.7986099 -payam,5860,5670,4210,neither,18.7140217 -payam,1860,7700,7940,neither,14.8113515 -payam,8140,6110,4520,neither,24.3878234 -payam,2710,300,430,neither,21.5049204 -payam,3630,3590,2680,left,23.1192547 -payam,3740,820,3660,right,5.682812 -payam,1480,2740,2400,right,12.0348692 -payam,910,7710,7630,right,7.8845036 -payam,230,1540,4340,left,7.4580693 -payam,3170,1410,490,neither,10.9175666 -payam,500,2870,260,neither,12.8639616 -payam,7930,7780,5110,right,26.1968788 
-payam,2300,1100,1090,right,8.9982824 -payam,2960,8730,4210,neither,15.1996149 -payam,4500,5370,2490,neither,20.4606212 -payam,2440,1140,6330,left,8.8519273 -payam,3930,1500,2990,right,12.3335319 -payam,4960,7920,3680,left,8.5003094 -payam,8600,6700,7470,right,8.8673901 -payam,5580,6610,1700,left,23.4709146 -payam,1320,3160,1380,right,19.3742774 -payam,8220,8660,1510,left,4.5353338 -payam,3120,6550,1760,neither,11.938804 -payam,3120,6550,1760,neither,24.8215028 -payam,3120,6550,1760,neither,32.6924222 -payam,3120,6550,1760,neither,199.0291749 -aaa,7680,780,3320,neither,13.9752481 -aaa,1170,1680,8400,right,72.1732308 -aaa,190,1570,590,neither,23.8951871 -aaa,890,190,8790,neither,15.8835947 -aaa,1110,850,1120,neither,12.2388968 -aaa,6990,70,840,neither,10.1972333 -aaa,1570,600,1040,neither,58.0329853 -payam,2610,3470,8790,neither,23.478667 -payam,2820,970,2610,neither,17.2883431 -payam,5130,3330,1970,neither,25.0759505 -payam,200,970,3200,neither,10.951438 -payam,6160,890,870,neither,13.3928642 -payam,4030,870,190,right,21.7972779 -payam,4640,3260,4480,neither,15.6459803 -payam,3750,3400,250,left,15.7365793 -payam,5820,5110,260,neither,11.9040374 -payam,4960,7920,2720,left,13.7575275 -payam,3340,5410,4670,neither,11.0850437 -payam,3220,5500,3450,neither,12.7476892 -payam,4830,3340,1750,neither,8.2270027 -payam,3280,5500,6100,neither,11.6609682 -payam,5570,3380,4830,neither,12.5343834 -payam,2170,3490,5410,left,20.2327963 -payam,1310,7950,5980,right,17.9011993 -payam,4830,8250,5060,right,12.6581358 -payam,2410,1730,1420,left,8.4628379 -payam,1610,140,1040,neither,39.4259191 -payam,2720,290,660,neither,9.004165 -payam,290,1770,1500,neither,8.7271764 -payam,2590,1750,2280,left,21.5393153 -payam,940,4110,5100,neither,14.6486691 -payam,3880,5100,8100,right,6.654937 -payam,2820,4740,3760,neither,8.9366303 -payam,2620,3800,3640,neither,32.7218192 -payam,490,3630,50,neither,15.6962632 -payam,6500,4740,3950,neither,10.4569726 -payam,3860,3960,3670,neither,5.6011954 -payam,8160,4670,3320,neither,9.9990773 -payam,5130,4670,3960,neither,10.0627506 -payam,1970,3670,6110,neither,8.9863573 -payam,5240,3800,4670,neither,35.1511656 -pp,6520,3000,3730,neither,13.2667115 -pp,3950,4670,8530,neither,10.3694821 -pp,3960,3670,1100,right,14.2283085 -pp,1080,2690,4240,left,8.0165702 -pp,4060,8590,7870,right,39.0669929 -pp,2730,860,5670,neither,21.0611187 -pp,1970,4410,2750,neither,23.8645963 -pp,2520,4410,7870,neither,7.0013341 -pp,2280,2790,2800,right,10.0719624 -pp,5730,1590,1630,neither,60.7203313 -pp,2170,2630,970,right,15.679544 -pp,7170,4640,1940,neither,12.8070448 -pp,1910,3790,3370,right,10.4345838 -pp,1940,5050,4170,neither,15.4618189 -pp,4240,6180,5690,neither,11.6433038 -pp,1360,4960,4100,right,22.1847562 -pp,2620,3610,3280,neither,16.611427 -payam,1140,1770,140,neither,32.3426861 -payam,5410,2110,3480,neither,9.2396016 -payam,6350,4540,2160,neither,15.8730432 -payam,1090,3280,5400,right,7.9992393 -payam,6870,7500,6480,left,16.2428709 -payam,4890,7850,2720,left,11.0865865 -payam,6340,6250,450,left,7.1841906 -payam,2410,1240,1360,right,3.8704583 -payam,4010,8090,2620,left,11.8036291 -payam,1940,4910,4050,right,10.9399087 -payam,8580,6390,5740,neither,19.4242683 -payam,360,6430,6520,neither,39.232415 -payam,4330,5830,6420,neither,14.8946759 -payam,7070,6430,280,neither,9.1711564 -payam,440,6390,5570,neither,16.4782124 -payam,5830,6650,3080,neither,27.4215719 -payam,6620,5790,3630,neither,7.9312582 -payam,3520,6680,5860,neither,15.903332 -payam,6680,6600,1570,neither,7.0110227 
-payam,1050,5650,5570,neither,9.8728923 -payam,4540,5740,6430,right,7.6298997 -payam,2560,1730,1030,left,64.8227024 -payam,1910,6760,7300,right,5.3704937 -payam,5730,3020,3080,right,14.8289519 -payam,4440,1750,1640,neither,17.3421493 -payam,3160,910,140,neither,7.7735092 -payam,5280,80,2260,right,10.4654106 -p,8630,3910,1190,neither,12.2734672 -p,4610,8400,6030,neither,25.0126291 -p,4250,5200,8530,right,11.2253927 -p,370,660,6480,neither,8.051555 -p,5420,150,180,right,7.0392668 -p,3360,3600,560,right,12.5866966 -p,3900,3980,3760,right,60.7114951 -p,5010,8340,6970,neither,25.5606582 -p,6210,8650,2790,neither,14.1406681 -p,640,8210,5010,neither,11.7996506 -p,3200,8170,8030,neither,88.1912972 -p,7970,8110,7150,neither,9.3730532 -p,7980,4090,8290,right,14.0371127 -p,1070,430,410,neither,9.7275574 -p,310,410,4240,left,5.8301648 -p,7030,2460,2840,right,7.3928905 -p,3830,5260,3130,neither,7.6125024 -p,3850,4450,7780,neither,15.566987 -p,3830,2120,6260,neither,15.1221466 -p,5340,4510,1340,neither,10.4645626 -p,3830,3710,8580,neither,10.3860476 -p,7090,2180,3720,neither,13.3298749 -p,3410,2120,6400,neither,6.5504196 -p,1570,1990,3880,left,16.0690818 -p,8280,7720,7760,left,19.2019649 -p,180,4850,980,right,12.3407615 -payam,4460,2880,5160,neither,31.7953241 -payam,4990,4230,1360,left,26.7975901 -payam,3690,2640,2750,neither,13.5067165 -payam,2820,2520,5540,neither,14.3831229 -payam,640,2830,2670,neither,22.702336 -payam,2640,2520,1790,right,9.7770384 -payam,2220,3330,4770,neither,17.5712949 -payam,4000,2130,4350,neither,8.9440079 -payam,2470,5370,4710,neither,25.1191583 -payam,60,4790,3780,neither,6.8597862 -payam,5290,4760,4360,neither,39.3547437 -payam,2680,5470,4730,right,9.0767636 -payam,460,1700,4490,left,9.5737932 -payam,640,4640,3240,left,9.0991829 -payam,5360,8340,7920,neither,43.4682622 -payam,8370,8320,5650,left,13.9077435 -payam,8800,6350,2520,neither,23.3640544 -pp,1270,4440,2720,neither,15.9654004 -pp,6050,5870,8630,neither,64.0813694 -pp,4960,4100,1380,left,20.2915712 -pp,2410,2030,3340,right,5.8915843 -pp,7520,6470,3160,left,4.4095175 -pp,2490,7850,5920,right,19.7446139 -pp,7870,8190,6080,left,12.2258913 -payam,6910,8130,5200,left,22.1673536 -payam,1270,1310,8550,neither,4.9356615 -payam,1200,3010,8090,neither,6.2535121 -payam,1640,1200,3010,neither,7.2112379 -payam,2160,3180,1450,neither,12.2517686 -payam,2290,2920,1450,neither,11.0051224 -payam,4620,2970,1270,neither,6.0543814 -payam,3180,1510,360,neither,7.1203205 -payam,1110,3010,1200,neither,4.9222736 -payam,500,1240,2970,neither,244.7143959 -payam,1370,2970,1540,neither,69.5212346 -payam,3010,3200,5450,neither,12.7399181 -payam,3120,1270,1240,right,7.1447759 -payam,4860,8750,5670,neither,19.6800883 -payam,190,8410,8790,right,4.0843236 -payam,4620,7980,4870,neither,11.6172416 -payam,4870,5080,1040,neither,9.651124 -payam,590,4910,4880,neither,17.2420091 -payam,340,4880,5080,neither,11.3620724 -payam,8740,4050,4020,right,71.2654567 -payam,2310,1960,6000,neither,21.7536178 -payam,6620,5740,5500,neither,6.9649316 -payam,5670,5650,5300,right,7.7041501 -payam,3040,7940,7700,neither,25.42975 -payam,7710,7780,1620,left,195.092137 -payam,7430,7320,5660,right,13.8465693 -payam,7980,2240,2230,right,13.2360946 -payam,6790,7300,3100,neither,15.0350203 -payam,2650,7200,7300,right,6.4745868 -payam,1430,1440,2920,neither,7.7909788 -payam,3060,1290,2590,left,56.5645663 -payam,6500,790,2720,neither,12.4153127 -payam,1750,1740,7740,neither,45.0387307 -payam,3860,290,1750,right,127.7812023 -payam,2900,3120,3160,left,8.5753824 
-payam,560,2300,2550,right,18.1986405 -payam,1210,6030,8530,neither,12.6016999 -payam,5130,8400,4060,right,13.3131464 -payam,1060,320,250,right,11.4700255 -payam,3250,4560,7510,left,9.6844356 -payam,2140,1010,2570,neither,19.8406119 -payam,1000,400,1450,neither,20.5022202 -payam,930,900,5540,left,9.8994706 -payam,3760,3520,1860,right,47.1336049 -payam,6140,1230,1260,left,108.9202734 -payam,590,430,310,neither,11.3733376 -payam,7880,2690,2680,neither,5.1495561 -payam,3170,2680,310,neither,8.5025821 -payam,3050,1080,410,neither,8.527219 -payam,2770,1080,7490,neither,402.8720653 -payam,3190,410,2680,neither,10.4691005 -payam,2770,410,3720,left,4.3418474 -payam,1410,2460,340,neither,12.1507813 -payam,3150,2440,2840,neither,11.3710139 -payam,2820,800,2310,neither,30.0199888 -payam,2460,710,160,neither,9.4166814 -payam,2440,30,6480,neither,7.3646978 -payam,7870,1140,340,neither,14.3368254 -payam,2930,1720,2590,neither,18.4583712 -payam,240,2450,5190,left,8.7122048 -payam,3890,2120,6000,right,17.7283225 -payam,6190,4120,6640,left,12.888971 -payam,5240,3650,5320,right,4.7470391 -payam,4330,5950,6030,left,9.3768817 -payam,1110,590,430,left,11.9393439 -payam,3000,4830,8350,neither,17.2718661 -payam,1790,2730,2750,neither,9.0715062 -payam,2820,2760,5640,neither,136.1871848 -payam,2790,2830,0,left,7.9951019 -payam,6890,3350,5370,right,10.1746638 -payam,4100,460,850,neither,15.272502 -payam,2880,190,2630,neither,11.2373856 -payam,7290,1590,460,right,328.9525274 -payam,3270,2010,590,right,42.705704 -payam,500,6020,8370,neither,10.2211368 -payam,6160,8320,2700,neither,5.8553287 -payam,2910,8510,6260,neither,20.3620241 -payam,8660,5940,6070,left,10.4425546 -payam,6670,4170,4960,right,28.4874515 -payam,1350,2030,5410,neither,8.5564246 -payam,2160,2110,2680,left,5.1389665 -payam,7520,6530,4530,neither,13.91322 -payam,7230,6670,2250,neither,8.7192474 -payam,7500,7230,8500,neither,12.1839644 -payam,6530,7500,7660,neither,13.3554725 -payam,6670,7520,7950,left,7.8109235 -payam,4440,6070,1490,neither,22.8200555 -payam,6290,4350,530,left,30.2692709 -payam,7870,8190,6080,left,18.4469568 -payam,8520,3180,2920,neither,15.4442971 -payam,1450,3180,1960,left,14.0648741 -payam,1770,8490,4180,neither,15.9577787 -payam,5060,8790,3120,neither,19.0112589 -payam,4310,8230,3100,neither,18.039143 -payam,8720,7890,5100,right,27.7411957 -payam,2620,8740,8270,neither,10.0985898 -payam,5080,4870,8800,neither,285.7399943 -payam,390,4910,4880,neither,31.1095841 -payam,4880,8270,2720,neither,6.3076338 -payam,3030,4880,8740,neither,17.0361449 -payam,4910,7980,2910,neither,8.4894469 -payam,4050,7980,1710,neither,6.3573893 -payam,4910,4880,5450,neither,4.4998203 -payam,1750,4050,8740,right,96.8739301 -payam,6650,6620,5540,neither,11.0029129 -payam,5740,5830,3310,neither,12.8770997 -payam,6390,6560,1970,neither,40.2096949 +payam,8720,1210,2930,neither,24.5149699 +payam,960,6830,7040,right,19.103924 +payam,2400,7930,7770,right,12.2262586 +payam,4850,2520,2830,neither,15.1091679 +payam,740,2820,2750,neither,18.6786855 +payam,1840,1090,7750,left,7.826561 +payam,7290,6890,6760,left,18.6590548 +payam,910,7710,7630,neither,15.9142669 +payam,110,3080,3020,right,19.3148321 +payam,4310,8430,6020,right,18.3946849 +payam,2370,6450,6640,neither,8.0961591 +payam,3760,3670,8090,left,35.6075189 +payam,7290,6890,6760,left,6.1512555 +payam,2330,1610,4290,neither,9.3917529 +payam,4960,1360,6940,neither,17.0703494 +payam,4880,2220,4050,neither,6.66984 +payam,4150,6320,5120,left,11.1385341 +payam,2380,2290,3830,neither,8.175581 
+payam,6990,4350,7850,right,24.8897172 +payam,1260,1300,1150,left,8.7496713 +payam,4040,8150,270,neither,6.5148909 +payam,8210,4200,540,neither,9.3981568 +payam,1840,1090,7750,left,3.4069544 +payam,80,1770,5440,neither,6.8139126 +payam,4740,6380,6320,left,15.6245139 +payam,350,1710,820,neither,6.9307515 +payam,7980,380,440,right,6.9549693 +payam,4710,4520,6560,left,15.8368305 +payam,4150,1030,5450,right,5.8812649 +payam,4090,8210,1650,left,9.3473628 +payam,3160,3050,3300,right,15.4507672 +payam,3700,5340,4990,neither,19.4590834 +payam,5790,3250,4560,right,5.8536474 +payam,2820,2730,410,left,8.6303164 +payam,4180,6270,6780,neither,10.6426271 +payam,3450,3010,3200,neither,9.5873883 +payam,1480,2740,2400,neither,7.4411407 +payam,5200,140,2280,neither,10.3224914 +payam,7920,5360,5370,right,12.6515 +payam,8140,6670,6500,neither,8.3602584 +payam,1880,1810,5430,right,23.2110811 +payam,3190,1380,3600,left,13.4339459 +payam,50,4340,5020,neither,21.3263438 +payam,8350,8660,720,neither,5.8660808 +payam,3380,3280,1440,neither,12.1587357 +payam,1990,1450,3180,right,11.0167687 +payam,5220,3470,2030,right,17.4336008 +payam,3030,1430,7950,neither,5.1681943 +payam,2070,2690,1080,right,7.0990855 +payam,5270,760,2620,right,7.8710579 +payam,1500,5160,5060,right,13.7366622 +payam,3720,5320,980,right,3.9487614 +payam,6360,5830,6430,right,16.9997513 +payam,8100,2890,2200,neither,11.3478833 +payam,8190,8240,5770,left,9.2111841 +payam,6660,4820,3920,right,7.2671531 +payam,1380,1480,6320,left,8.1044273 +payam,2920,6550,6590,right,4.2769958 +payam,8030,5080,8270,left,8.6034509 +payam,7140,7510,4590,neither,16.6477046 +payam,2820,6050,8100,left,24.6404202 +payam,8760,7440,5770,right,13.7314397 +payam,1500,3220,670,left,11.3452669 +payam,7760,7550,5210,neither,6.2482816 +payam,1480,3220,1500,right,12.5323615 +payam,3370,5490,770,neither,7.1390339 +payam,7810,1860,900,right,10.9310926 +payam,1480,60,1000,left,22.48273 +payam,880,370,100,right,6.3977794 +payam,6530,6470,8270,left,23.1602921 +payam,7870,5570,5710,neither,10.2806256 +payam,6440,5560,1420,left,6.4094853 +payam,3080,3090,5170,left,21.2385869 +payam,2360,3630,3810,neither,8.7701795 +payam,490,1260,1220,neither,6.672139 +payam,2290,610,2460,left,6.049926 +payam,3380,3280,1440,neither,20.8177628 +payam,2660,6610,6450,left,42.8382073 +payam,5880,2790,2520,right,9.3886082 +payam,6520,5860,5480,neither,7.0841901 +payam,3640,2700,1130,left,32.19488 +payam,2800,6870,6710,neither,11.8798787 +payam,740,2820,2750,neither,9.5489998 +payam,7590,7770,6800,neither,19.0458094 +payam,4240,2560,270,right,12.5144577 +payam,4170,8630,3110,neither,9.0806503 +payam,4870,6970,6740,neither,16.2745591 +payam,6840,6060,6290,neither,6.9093279 +payam,90,3320,4740,neither,9.6742201 +payam,7140,7510,4590,left,22.7795439 +payam,5010,2290,7770,neither,19.3744291 +payam,5960,6240,8600,right,9.0793041 +payam,1250,6260,7880,right,4.93853 +payam,4270,1320,1350,right,17.2790887 +payam,4670,1250,3220,neither,10.618701 +payam,1480,2740,2400,neither,7.0960476 +payam,4270,1320,1350,right,4.4593416 +payam,7500,6670,450,neither,6.8258252 +payam,2330,3690,3800,neither,9.1139991 +payam,6890,3890,3700,neither,4.0416935 +payam,2160,8690,5990,neither,11.627078 +payam,5230,5350,5200,left,12.7237877 +payam,5410,5190,5000,neither,14.5101507 +payam,8310,8680,3050,left,7.9895167 +payam,1600,400,980,neither,10.7849002 +payam,170,2250,1110,right,8.0488411 +payam,540,2300,1470,left,15.8741313 +payam,6520,1530,2860,neither,7.0883511 +payam,5640,5810,4930,neither,7.4146026 +payam,3220,6620,5720,right,7.7339321 
+test,30,3890,1900,right,22.9397707 +test,8530,4240,8100,neither,5.9101173 +test,7420,150,560,neither,137.2592382 +test,2130,5290,6180,neither,10.5838116 +test,1370,1510,1710,neither,6.7253406 +test,2750,1950,3380,right,17.9191102 +test,6460,2490,2850,neither,29.0007036 +test,3030,1430,7950,neither,3.9712144 +test,1390,7990,5990,right,42.9268688 +Payam,7010,7880,1480,neither,29.2939266 +Payam,3670,4380,1690,neither,13.5852305 +payam3,510,3840,2050,neither,19.60391 +payam3,3120,5240,400,neither,7.5961915 +payam3,6610,1980,5910,neither,32.5094664 +payam3,2790,6640,7410,neither,6.8419322 +payam3,2500,2680,520,neither,4.9135896 +payam3,3000,6310,6340,right,15.3105667 +payam3,3430,5560,80,neither,7.3489193 +payam3,5430,6460,570,neither,18.0880914 +payam3,5710,2830,2520,neither,11.3542066 +payam3,8740,1560,590,neither,9.5529691 +payam3,260,5430,4780,neither,8.8140994 +payam3,5380,5730,6680,neither,9.4738672 +payam3,990,190,3210,neither,6.6699878 +payam3,820,1910,4510,left,6.098936 +payam3,610,5890,7960,right,13.1935591 +payam3,4870,4910,6980,left,79.8618806 +payam3,120,1450,1240,left,23.4519851 +payam3,6940,7090,4180,left,7.9929938 +payam3,6360,5830,6430,neither,27.7130248 +payam,6980,8150,440,left,27.8841035 +payam,580,7490,7090,neither,10.9262192 +payam,2340,8460,6140,neither,26.7986099 +payam,5860,5670,4210,neither,18.7140217 +payam,1860,7700,7940,neither,14.8113515 +payam,8140,6110,4520,neither,24.3878234 +payam,2710,300,430,neither,21.5049204 +payam,3630,3590,2680,left,23.1192547 +payam,3740,820,3660,right,5.682812 +payam,1480,2740,2400,right,12.0348692 +payam,910,7710,7630,right,7.8845036 +payam,230,1540,4340,left,7.4580693 +payam,3170,1410,490,neither,10.9175666 +payam,500,2870,260,neither,12.8639616 +payam,7930,7780,5110,right,26.1968788 +payam,2300,1100,1090,right,8.9982824 +payam,2960,8730,4210,neither,15.1996149 +payam,4500,5370,2490,neither,20.4606212 +payam,2440,1140,6330,left,8.8519273 +payam,3930,1500,2990,right,12.3335319 +payam,4960,7920,3680,left,8.5003094 +payam,8600,6700,7470,right,8.8673901 +payam,5580,6610,1700,left,23.4709146 +payam,1320,3160,1380,right,19.3742774 +payam,8220,8660,1510,left,4.5353338 +payam,3120,6550,1760,neither,11.938804 +payam,3120,6550,1760,neither,24.8215028 +payam,3120,6550,1760,neither,32.6924222 +payam,3120,6550,1760,neither,199.0291749 +aaa,7680,780,3320,neither,13.9752481 +aaa,1170,1680,8400,right,72.1732308 +aaa,190,1570,590,neither,23.8951871 +aaa,890,190,8790,neither,15.8835947 +aaa,1110,850,1120,neither,12.2388968 +aaa,6990,70,840,neither,10.1972333 +aaa,1570,600,1040,neither,58.0329853 +payam,2610,3470,8790,neither,23.478667 +payam,2820,970,2610,neither,17.2883431 +payam,5130,3330,1970,neither,25.0759505 +payam,200,970,3200,neither,10.951438 +payam,6160,890,870,neither,13.3928642 +payam,4030,870,190,right,21.7972779 +payam,4640,3260,4480,neither,15.6459803 +payam,3750,3400,250,left,15.7365793 +payam,5820,5110,260,neither,11.9040374 +payam,4960,7920,2720,left,13.7575275 +payam,3340,5410,4670,neither,11.0850437 +payam,3220,5500,3450,neither,12.7476892 +payam,4830,3340,1750,neither,8.2270027 +payam,3280,5500,6100,neither,11.6609682 +payam,5570,3380,4830,neither,12.5343834 +payam,2170,3490,5410,left,20.2327963 +payam,1310,7950,5980,right,17.9011993 +payam,4830,8250,5060,right,12.6581358 +payam,2410,1730,1420,left,8.4628379 +payam,1610,140,1040,neither,39.4259191 +payam,2720,290,660,neither,9.004165 +payam,290,1770,1500,neither,8.7271764 +payam,2590,1750,2280,left,21.5393153 +payam,940,4110,5100,neither,14.6486691 
+payam,3880,5100,8100,right,6.654937 +payam,2820,4740,3760,neither,8.9366303 +payam,2620,3800,3640,neither,32.7218192 +payam,490,3630,50,neither,15.6962632 +payam,6500,4740,3950,neither,10.4569726 +payam,3860,3960,3670,neither,5.6011954 +payam,8160,4670,3320,neither,9.9990773 +payam,5130,4670,3960,neither,10.0627506 +payam,1970,3670,6110,neither,8.9863573 +payam,5240,3800,4670,neither,35.1511656 +pp,6520,3000,3730,neither,13.2667115 +pp,3950,4670,8530,neither,10.3694821 +pp,3960,3670,1100,right,14.2283085 +pp,1080,2690,4240,left,8.0165702 +pp,4060,8590,7870,right,39.0669929 +pp,2730,860,5670,neither,21.0611187 +pp,1970,4410,2750,neither,23.8645963 +pp,2520,4410,7870,neither,7.0013341 +pp,2280,2790,2800,right,10.0719624 +pp,5730,1590,1630,neither,60.7203313 +pp,2170,2630,970,right,15.679544 +pp,7170,4640,1940,neither,12.8070448 +pp,1910,3790,3370,right,10.4345838 +pp,1940,5050,4170,neither,15.4618189 +pp,4240,6180,5690,neither,11.6433038 +pp,1360,4960,4100,right,22.1847562 +pp,2620,3610,3280,neither,16.611427 +payam,1140,1770,140,neither,32.3426861 +payam,5410,2110,3480,neither,9.2396016 +payam,6350,4540,2160,neither,15.8730432 +payam,1090,3280,5400,right,7.9992393 +payam,6870,7500,6480,left,16.2428709 +payam,4890,7850,2720,left,11.0865865 +payam,6340,6250,450,left,7.1841906 +payam,2410,1240,1360,right,3.8704583 +payam,4010,8090,2620,left,11.8036291 +payam,1940,4910,4050,right,10.9399087 +payam,8580,6390,5740,neither,19.4242683 +payam,360,6430,6520,neither,39.232415 +payam,4330,5830,6420,neither,14.8946759 +payam,7070,6430,280,neither,9.1711564 +payam,440,6390,5570,neither,16.4782124 +payam,5830,6650,3080,neither,27.4215719 +payam,6620,5790,3630,neither,7.9312582 +payam,3520,6680,5860,neither,15.903332 +payam,6680,6600,1570,neither,7.0110227 +payam,1050,5650,5570,neither,9.8728923 +payam,4540,5740,6430,right,7.6298997 +payam,2560,1730,1030,left,64.8227024 +payam,1910,6760,7300,right,5.3704937 +payam,5730,3020,3080,right,14.8289519 +payam,4440,1750,1640,neither,17.3421493 +payam,3160,910,140,neither,7.7735092 +payam,5280,80,2260,right,10.4654106 +p,8630,3910,1190,neither,12.2734672 +p,4610,8400,6030,neither,25.0126291 +p,4250,5200,8530,right,11.2253927 +p,370,660,6480,neither,8.051555 +p,5420,150,180,right,7.0392668 +p,3360,3600,560,right,12.5866966 +p,3900,3980,3760,right,60.7114951 +p,5010,8340,6970,neither,25.5606582 +p,6210,8650,2790,neither,14.1406681 +p,640,8210,5010,neither,11.7996506 +p,3200,8170,8030,neither,88.1912972 +p,7970,8110,7150,neither,9.3730532 +p,7980,4090,8290,right,14.0371127 +p,1070,430,410,neither,9.7275574 +p,310,410,4240,left,5.8301648 +p,7030,2460,2840,right,7.3928905 +p,3830,5260,3130,neither,7.6125024 +p,3850,4450,7780,neither,15.566987 +p,3830,2120,6260,neither,15.1221466 +p,5340,4510,1340,neither,10.4645626 +p,3830,3710,8580,neither,10.3860476 +p,7090,2180,3720,neither,13.3298749 +p,3410,2120,6400,neither,6.5504196 +p,1570,1990,3880,left,16.0690818 +p,8280,7720,7760,left,19.2019649 +p,180,4850,980,right,12.3407615 +payam,4460,2880,5160,neither,31.7953241 +payam,4990,4230,1360,left,26.7975901 +payam,3690,2640,2750,neither,13.5067165 +payam,2820,2520,5540,neither,14.3831229 +payam,640,2830,2670,neither,22.702336 +payam,2640,2520,1790,right,9.7770384 +payam,2220,3330,4770,neither,17.5712949 +payam,4000,2130,4350,neither,8.9440079 +payam,2470,5370,4710,neither,25.1191583 +payam,60,4790,3780,neither,6.8597862 +payam,5290,4760,4360,neither,39.3547437 +payam,2680,5470,4730,right,9.0767636 +payam,460,1700,4490,left,9.5737932 +payam,640,4640,3240,left,9.0991829 
+payam,5360,8340,7920,neither,43.4682622 +payam,8370,8320,5650,left,13.9077435 +payam,8800,6350,2520,neither,23.3640544 +pp,1270,4440,2720,neither,15.9654004 +pp,6050,5870,8630,neither,64.0813694 +pp,4960,4100,1380,left,20.2915712 +pp,2410,2030,3340,right,5.8915843 +pp,7520,6470,3160,left,4.4095175 +pp,2490,7850,5920,right,19.7446139 +pp,7870,8190,6080,left,12.2258913 +payam,6910,8130,5200,left,22.1673536 +payam,1270,1310,8550,neither,4.9356615 +payam,1200,3010,8090,neither,6.2535121 +payam,1640,1200,3010,neither,7.2112379 +payam,2160,3180,1450,neither,12.2517686 +payam,2290,2920,1450,neither,11.0051224 +payam,4620,2970,1270,neither,6.0543814 +payam,3180,1510,360,neither,7.1203205 +payam,1110,3010,1200,neither,4.9222736 +payam,500,1240,2970,neither,244.7143959 +payam,1370,2970,1540,neither,69.5212346 +payam,3010,3200,5450,neither,12.7399181 +payam,3120,1270,1240,right,7.1447759 +payam,4860,8750,5670,neither,19.6800883 +payam,190,8410,8790,right,4.0843236 +payam,4620,7980,4870,neither,11.6172416 +payam,4870,5080,1040,neither,9.651124 +payam,590,4910,4880,neither,17.2420091 +payam,340,4880,5080,neither,11.3620724 +payam,8740,4050,4020,right,71.2654567 +payam,2310,1960,6000,neither,21.7536178 +payam,6620,5740,5500,neither,6.9649316 +payam,5670,5650,5300,right,7.7041501 +payam,3040,7940,7700,neither,25.42975 +payam,7710,7780,1620,left,195.092137 +payam,7430,7320,5660,right,13.8465693 +payam,7980,2240,2230,right,13.2360946 +payam,6790,7300,3100,neither,15.0350203 +payam,2650,7200,7300,right,6.4745868 +payam,1430,1440,2920,neither,7.7909788 +payam,3060,1290,2590,left,56.5645663 +payam,6500,790,2720,neither,12.4153127 +payam,1750,1740,7740,neither,45.0387307 +payam,3860,290,1750,right,127.7812023 +payam,2900,3120,3160,left,8.5753824 +payam,560,2300,2550,right,18.1986405 +payam,1210,6030,8530,neither,12.6016999 +payam,5130,8400,4060,right,13.3131464 +payam,1060,320,250,right,11.4700255 +payam,3250,4560,7510,left,9.6844356 +payam,2140,1010,2570,neither,19.8406119 +payam,1000,400,1450,neither,20.5022202 +payam,930,900,5540,left,9.8994706 +payam,3760,3520,1860,right,47.1336049 +payam,6140,1230,1260,left,108.9202734 +payam,590,430,310,neither,11.3733376 +payam,7880,2690,2680,neither,5.1495561 +payam,3170,2680,310,neither,8.5025821 +payam,3050,1080,410,neither,8.527219 +payam,2770,1080,7490,neither,402.8720653 +payam,3190,410,2680,neither,10.4691005 +payam,2770,410,3720,left,4.3418474 +payam,1410,2460,340,neither,12.1507813 +payam,3150,2440,2840,neither,11.3710139 +payam,2820,800,2310,neither,30.0199888 +payam,2460,710,160,neither,9.4166814 +payam,2440,30,6480,neither,7.3646978 +payam,7870,1140,340,neither,14.3368254 +payam,2930,1720,2590,neither,18.4583712 +payam,240,2450,5190,left,8.7122048 +payam,3890,2120,6000,right,17.7283225 +payam,6190,4120,6640,left,12.888971 +payam,5240,3650,5320,right,4.7470391 +payam,4330,5950,6030,left,9.3768817 +payam,1110,590,430,left,11.9393439 +payam,3000,4830,8350,neither,17.2718661 +payam,1790,2730,2750,neither,9.0715062 +payam,2820,2760,5640,neither,136.1871848 +payam,2790,2830,0,left,7.9951019 +payam,6890,3350,5370,right,10.1746638 +payam,4100,460,850,neither,15.272502 +payam,2880,190,2630,neither,11.2373856 +payam,7290,1590,460,right,328.9525274 +payam,3270,2010,590,right,42.705704 +payam,500,6020,8370,neither,10.2211368 +payam,6160,8320,2700,neither,5.8553287 +payam,2910,8510,6260,neither,20.3620241 +payam,8660,5940,6070,left,10.4425546 +payam,6670,4170,4960,right,28.4874515 +payam,1350,2030,5410,neither,8.5564246 +payam,2160,2110,2680,left,5.1389665 
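The rows above and below follow the six-field layout that load_gesture_data (later in this patch, in lmdb_data_loader.py) consumes: an annotator/session tag, three sample indices (left, middle, right), a left/right/neither judgement of which side better matches the middle reference sample, and a trailing float that the loader does not read. A minimal sketch of that mapping, not part of the patch itself:

def row_to_pairs(line):
    # Mirrors the parsing in load_gesture_data: fields 1-3 are sample indices,
    # field 4 is the judgement; fields 0 and 5 are ignored by the loader.
    fields = line.strip().split(",")
    left, middle, right = int(fields[1]), int(fields[2]), int(fields[3])
    label = fields[4]
    if label == "neither":
        # neither side matches the reference: two dissimilar (0) pairs
        return [[right, middle, 0], [left, middle, 0]]
    if label == "right":
        return [[right, middle, 1]]   # right sample judged similar to the reference
    if label == "left":
        return [[left, middle, 1]]
    return []

# row_to_pairs("payam,6990,4350,7850,right,24.8897172") -> [[7850, 4350, 1]]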
+payam,7520,6530,4530,neither,13.91322 +payam,7230,6670,2250,neither,8.7192474 +payam,7500,7230,8500,neither,12.1839644 +payam,6530,7500,7660,neither,13.3554725 +payam,6670,7520,7950,left,7.8109235 +payam,4440,6070,1490,neither,22.8200555 +payam,6290,4350,530,left,30.2692709 +payam,7870,8190,6080,left,18.4469568 +payam,8520,3180,2920,neither,15.4442971 +payam,1450,3180,1960,left,14.0648741 +payam,1770,8490,4180,neither,15.9577787 +payam,5060,8790,3120,neither,19.0112589 +payam,4310,8230,3100,neither,18.039143 +payam,8720,7890,5100,right,27.7411957 +payam,2620,8740,8270,neither,10.0985898 +payam,5080,4870,8800,neither,285.7399943 +payam,390,4910,4880,neither,31.1095841 +payam,4880,8270,2720,neither,6.3076338 +payam,3030,4880,8740,neither,17.0361449 +payam,4910,7980,2910,neither,8.4894469 +payam,4050,7980,1710,neither,6.3573893 +payam,4910,4880,5450,neither,4.4998203 +payam,1750,4050,8740,right,96.8739301 +payam,6650,6620,5540,neither,11.0029129 +payam,5740,5830,3310,neither,12.8770997 +payam,6390,6560,1970,neither,40.2096949 diff --git a/scripts/data_loader/lmdb_data_loader.py b/scripts/data_loader/lmdb_data_loader.py index a11fd16..956782d 100644 --- a/scripts/data_loader/lmdb_data_loader.py +++ b/scripts/data_loader/lmdb_data_loader.py @@ -1,1323 +1,1323 @@ -""" -""" - - -from __future__ import annotations -import logging -import os -import pickle -import random -from typing import Tuple - -import numpy as np -import lmdb as lmdb -import torch -from torch.nn.utils.rnn import pad_sequence -from torch.utils.data import Dataset -from torch.utils.data.dataloader import default_collate -from data_loader.data_preprocessor import DataPreprocessor -import pyarrow -from configargparse import argparse -from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering - -from model.vocab import Vocab -import utils - -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - -def word_seq_collate_fn( - data: list, -) -> ( - Tuple[ - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - dict, - torch.Tensor, - torch.Tensor, - torch.Tensor, - ] - | Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, dict] -): - """Collate function for loading word sequences in variable lengths. - - This function returns extra values depending if sentence_leve is True or False. - sentence_leve is set to always True and must be manually changed. - - Args: - data: A list of samples including the following values in each element: - word_seq: A list of Tensors of word vector representations. - words_lengths: A list of Tensors of word lengths in word_seq. - poses_seq: A list of Tensors of poses/gestures. - audio: A list of Tensors of audio data. - - Returns: - A 5-Tuple or 8-Tuple: - - 8-Tuple (if sentence_level is set to True): - word_seq: A Tensor of word vector representations. - words_lengths: A Tensor of word lengths in word_seq. - poses_seq: A Tensor of poses/gestures. - audio: A Tensor of audio data. - aux_info: A dict containing info such as name, start/end frames and times. - sentence_leve_latents: #TODO - cluster_portion: #TODO - GPT3_Embedding: #TODO - - 5-Tuple: - The first five values in the 8-Tuple. 
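As a usage note (illustrative only, not part of the patch): word_seq_collate_fn is meant to be passed as the collate_fn of a PyTorch DataLoader so that variable-length word sequences are padded per batch. train_dataset below is a placeholder, assumed to be a TrinityDataset_sentencelevel instance (defined later in this file), which yields the seven-field samples this collate expects while sentence_leve is hard-coded to True.

from torch.utils.data import DataLoader

loader = DataLoader(
    train_dataset,                      # assumed: a TrinityDataset_sentencelevel instance
    batch_size=32,
    shuffle=True,
    collate_fn=word_seq_collate_fn,     # pads word_seq and collates the remaining fields
)
for (word_seq, words_lengths, poses_seq, audio, aux_info,
     sentence_leve_latents, cluster_portion, gpt3_embedding) in loader:
    break                               # one padded batch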
- """ - # sort a list by sequence length (descending order) to use pack_padded_sequence - data.sort(key=lambda x: len(x[0]), reverse=True) - - # separate source and target sequences - # Todo: fix this using args - sentence_leve = True - if not sentence_leve: - ( - word_seq, - poses_seq, - audio, - aux_info, - ) = zip(*data) - else: - ( - word_seq, - poses_seq, - audio, - aux_info, - sentence_leve_latents, - cluster_portion, - GPT3_Embedding, - ) = zip(*data) - - # merge sequences - words_lengths = torch.LongTensor([len(x) for x in word_seq]) - word_seq = pad_sequence(word_seq, batch_first=True).long() - - poses_seq = default_collate(poses_seq) - audio = default_collate(audio) - if sentence_leve: - sentence_leve_latents = default_collate(sentence_leve_latents) - cluster_portion = default_collate(cluster_portion) - GPT3_Embedding = default_collate(GPT3_Embedding) - - # audio = default_collate(audio) - aux_info = {key: default_collate([d[key] for d in aux_info]) for key in aux_info[0]} - - if sentence_leve: - return ( - word_seq, - words_lengths, - poses_seq, - audio, - aux_info, - sentence_leve_latents, - cluster_portion, - GPT3_Embedding, - ) - else: - return word_seq, words_lengths, poses_seq, audio, aux_info - - -class TrinityDataset(Dataset): - """Contains information and associated parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - Attributes: - lmdb_dir: A string representing the filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb ('Lightning Memory-Mapped' Database) object loaded from .mdb files located at the lmdb_dir. - n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An int representing the number of clips/entries in the original dataset. - lang_model: A pre-trained (English) language vector representation contained in the 'Vocab' custom class. - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video in the original dataset. - """ - - def __init__( - self, - args: argparse.Namespace, - lmdb_dir: str, - n_poses: int, - subdivision_stride: int, - pose_resampling_fps: int, - data_mean: list[float], - data_std: list[float], - ): - """Initialize with multiple dataset parameters. - - The args argument must contain the following keys: - name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). - rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. - autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. - sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). - - Args: - args: A configargparse object containing parameters (See above). - lmdb_dir: A string representing the filepath of the directory containing the actual dataset. - n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). 
- subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). - pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video in the original dataset. - """ - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - - logging.info("Reading data '{}'...".format(lmdb_dir)) - preloaded_dir = lmdb_dir + "_cache" - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - args, - lmdb_dir, - preloaded_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - sentence_level=False, - ) - data_sampler.run() - else: - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - def __len__(self) -> int: - """Get the size of the dataset. - - Returns: - The number of samples in the original dataset. - """ - return self.n_samples - - def __getitem__( - self, idx: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]: - """Get an item at a specific index in the dataset. - - Retrieves a specific entry in the original dataset and - associated information of that entry such as words, audio, poses, etc. - - Args: - idx: The index of the item to retrieve. - - Returns: - A 4-Tuple: - word_seq_tensor: A Tensor of word vector representations. - pose_seq: A Tensor of pose/gesture data. - audio: A Tensor of audio data. - aux_info: A dict containing the string keys: - 'vid': A string name for the information. - 'start_frame_no': An integer of the start index. - 'end_frame_no': An integer of the end index. - 'start_time': A float of the start time of the clip. - 'end_time': A float of the end time of the clip. - """ - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info = sample - - def words_to_tensor(lang, words, end_time=None): - indexes = [lang.SOS_token] - for word in words: - if end_time is not None and word[1] > end_time: - break - indexes.append(lang.get_word_index(word[0])) - indexes.append(lang.EOS_token) - return torch.Tensor(indexes).long() - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq: torch.Tensor = (pose_seq - self.data_mean) / std - - # to tensors - word_seq_tensor = words_to_tensor( - self.lang_model, word_seq, aux_info["end_time"] - ) - pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() - audio = torch.from_numpy(audio).float() - - return word_seq_tensor, pose_seq, audio, aux_info - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. 
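One practical consequence of the code above (a sketch under assumed inputs, not part of the patch): __getitem__ builds the word tensor through self.lang_model, which stays None until set_lang_model is called, so the vocabulary has to be attached before the dataset is indexed or wrapped in a DataLoader. args, lmdb_dir, data_mean, data_std and lang_model are placeholders here, and the numeric arguments are illustrative values only.

dataset = TrinityDataset(args, lmdb_dir,
                         n_poses=30, subdivision_stride=10, pose_resampling_fps=20,
                         data_mean=data_mean, data_std=data_std)
dataset.set_lang_model(lang_model)      # a model.vocab.Vocab instance; required before indexing
word_seq, pose_seq, audio, aux_info = dataset[0]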
- """ - self.lang_model = lang_model - - -class TrinityDataset_DAE(Dataset): - """Contains information and parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - This class is focussed on the pose/gesture data only. - - Attributes: - lmdb_dir: A string filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An integer number of clips/entries in the original dataset. - lang_model: A 'Vocab' pre-trained word vector representation. - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video. - all_poses: A list where each element is a dict containing string keys: - 'original': A Tensor of each pose/gesture data. - 'noisy': A Tensor with the same values as 'original'. - """ - - def __init__( - self, - args: argparse.Namespace, - lmdb_dir: str, - n_poses: int, - subdivision_stride: int, - pose_resampling_fps: int, - data_mean: list[float], - data_std: list[float], - ): - """Initialize with dataset location and several parameters. - - The args argument must contain the following keys: - name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). - rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. - autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. - sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). - - Args: - args: A configargparse object containing parameters (See above). - lmdb_dir: A string representing the filepath of the directory containing the actual dataset. - n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). - pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video in the original dataset. 
- """ - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - - logging.info("Reading data '{}'...".format(lmdb_dir)) - preloaded_dir = lmdb_dir + "_cache" - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - args, - lmdb_dir, - preloaded_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - ) - data_sampler.run() - else: - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - # Initiate all poses - self.all_poses = [] - self.create_all_poses() - print("data init finished!") - - def __len__(self) -> int: - """Get the size of the dataset. - - Returns: - The number of samples in the dataset. - """ - # return 500 - # return (self.n_samples * self.n_poses) //5 - return self.n_samples - - def create_all_poses(self) -> None: - """Move all poses/gestures from database into object. - - Modifies the internal state of the object at attribute 'all_poses'. - """ - with self.lmdb_env.begin(write=False) as txn: - for i in range(self.n_samples): - key = "{:010}".format(i).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info = sample - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - for j in range(0, len(pose_seq)): - original = pose_seq[j, :] - var_coef = 1 # std - sigma = 1 - # noisy = self.add_noise(original, 0.0, std) - noisy = original # dropout layer adds noise instead - self.all_poses.append({"original": original, "noisy": noisy}) - - def get_item_Memory_Efficient(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Get the item at a specific index in the dataset. - - This method improves memory performance by returning - the pose/gesture data of the item only. - - Args: - idx: An integer index of the item to get. - - Returns: - A 2-Tuple: - noisy: A Tensor of pose/gesture data. - original: A Tensor with the same values as 'noisy'. - """ - idx_lmdb = idx // self.n_poses - - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx_lmdb).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info = sample - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - original = pose_seq[idx % self.n_poses, :] - noisy = original - - original = ( - torch.from_numpy(original).reshape((original.shape[0], -1)).float() - ) - noisy = torch.from_numpy(noisy).reshape((noisy.shape[0], -1)).float() - - return noisy, original - - def add_noise( - self, x: np.ndarray, variance_multiplier: np.ndarray, sigma: np.ndarray - ) -> np.ndarray: - """Add Gaussian noise to the input data. - - The noise added is based on the variance_multiplier and sigma array values. - - Args: - x: A numeric numpy array of data. - variance_multiplier: A numeric numpy array of coefficients to multiple variance of the noise on - sigma: A numeric numpy array of the variance of the dataset - - Returns: - A numeric numpy array of the data with noise added. 
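For reference (not part of the patch), the index arithmetic used by get_item_Memory_Efficient above: the dataset is treated as a flat sequence of frames, and a flat frame index is split into an lmdb clip key and a frame offset inside that clip. The numbers are hypothetical.

n_poses = 30                 # frames per clip, as in the docstrings above
idx = 95                     # hypothetical flat frame index
idx_lmdb = idx // n_poses    # 3 -> the 4th clip stored in the lmdb cache
frame = idx % n_poses        # 5 -> the 6th frame of that clip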
- """ - eps = 1e-15 - noise = np.random.normal( - 0.0, np.multiply(sigma, variance_multiplier) + eps, x.shape - ) - x = x + noise - return x - - def add_noise2(self, x: np.ndarray, prob: float) -> np.ndarray: - """Add Gaussian noise to the input data. - - The noise is added by zeroing the specific elements. - - Args: - x: A numeric numpy array of data. - prob: A float percentage of adding noise to a single element. - - Returns: - A numeric numpy array of the data with noise added. - """ - for i in range(len(x)): - rnd = random.random() - if rnd < prob: - x[i] = 0 - return x - - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Get the item at a specific index in the dataset. - - This method returns the data exactly as stored. The 'noisy' data does - not have noise added but assumes that noise has previously been added. - - Args: - idx: An integer index of the item to get. - - Returns: - A 2-Tuple: - noisy: A Tensor of pose/gesture data (with noise). - original: A Tensor of pose/gesture data. - """ - # Keep and document for only large datasets - # Todo: I think this way is more efficient than the prev. approach - # I need to double check and make sure everything works as before. - # return self.get_item_Memory_Efficient(idx) - # original, noisy = self.get_item_Memory_Efficient(idx) - # return noisy, original - - original = self.all_poses[idx]["original"] - noisy = self.all_poses[idx]["noisy"] - - original = torch.from_numpy(original).reshape((original.shape[0], -1)).float() - noisy = torch.from_numpy(noisy).reshape((noisy.shape[0], -1)).float() - - return noisy, original - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. - """ - self.lang_model = lang_model - - -class TrinityDataset_DAEed_Autoencoder(Dataset): - """Contains information and parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - This class is #TODO. - - Attributes: - lmdb_dir: A string filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An integer number of clips/entries in the original dataset. - lang_model: A 'Vocab' pre-trained word vector representation or None. - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video. - pairwise_enabled: #TODO - use_derivative: Boolean to stack gradients to data during training. - encoded_labeled_poses: #TODO - rep_learning_dim: An integer dimension of the model (unused). - rep_learning_checkpoint: A string filepath to saved DAE model checkpoints. - rep_model: A DAE neural net model loaded from the above checkpoint. - Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. 
- """ - - def __init__( - self, - args: argparse.Namespace, - lmdb_dir: str, - n_poses: int, - subdivision_stride: int, - pose_resampling_fps: int, - data_mean: list[float], - data_std: list[float], - ): - """Initialize with dataset location and several parameters. - - The args argument must contain the following keys: - name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). - rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. - autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. - sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). - rep_learning_checkpoint: A string filepath to saved model checkpoints. - use_derivative: A boolean whether to use derivatives. - - Args: - args: A configargparse object containing parameters (See above). - lmdb_dir: A string representing the filepath of the directory containing the actual dataset. - n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). - pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video in the original dataset. - """ - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model: Vocab | None = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - - # We will change it to true once we call the creat_similarity_dataset - self.pairwise_enabled: bool = False - self.use_derivative: bool = args.use_derivative == "True" - - logging.info("Reading data '{}'...".format(lmdb_dir)) - preloaded_dir = lmdb_dir + "_cache" - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - args, - lmdb_dir, - preloaded_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - ) - data_sampler.run() - else: - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - # Todo: we need to initiate pre-trained representation learning model - checkpoint_path: str = args.rep_learning_checkpoint - self.rep_learning_dim: int = args.rep_learning_dim - ( - rep_learning_args, - rep_model, - rep_loss_fn, - rep_lang_model, - rep_out_dim, - ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") - self.rep_model: torch.nn.Module = rep_model.to("cpu") - self.rep_model.train(False) - - def __len__(self) -> int: - """Get the number of samples in the dataset. - - Returns: - The integer size of samples in the dataset. - """ - return self.n_samples - - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: - """Get the item at a specific index in the dataset. - - Args: - idx: An integer index of the item to get. - - Returns: - A 2-Tuple: - encoded_poses: A Tensor of pose/gesture data. - encoded_poses: The same tensor as above. 
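A minimal sketch (not part of the patch) of the use_derivative branch in __getitem__, which continues below: frame-wise differences are prepended with a zero row and stacked next to the DAE latents, doubling the feature dimension. The tensor sizes are assumed for illustration.

import torch

encoded_poses = torch.randn(30, 40)                  # (n_frames, dae_latent_dim), assumed sizes
diff = encoded_poses[1:] - encoded_poses[:-1]        # same values as the list comprehension below
diff = torch.cat([torch.zeros(1, encoded_poses.shape[1]), diff], dim=0)
encoded_poses = torch.hstack((encoded_poses, diff))  # shape becomes (30, 80)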
- """ - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx).encode("ascii") - sample = txn.get(key) - - sample: Tuple[ - np.ndarray, np.ndarray, np.ndarray, dict - ] = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info = sample - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - # Todo: Here we should apply rep_learning - target = torch.from_numpy(pose_seq) - # target = torch.unsqueeze(target, 2) - target = target.float() - # target = target.to(device) - with torch.no_grad(): - if self.rep_model.encoder == None: # for ablation study - encoded_poses = target - else: - encoded_poses: torch.Tensor = self.rep_model.encoder(target) - - # encoded_poses = torch.squeeze(encoded_poses, 2) - # encoded_poses = encoded_poses.to('cpu') - # reconstructed = encoded_poses.detach().numpy() - - # to tensors - # word_seq_tensor = words_to_tensor(self.lang_model, word_seq, aux_info['end_time']) - # pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() - encoded_poses = encoded_poses.reshape((encoded_poses.shape[0], -1)).float() - # audio = torch.from_numpy(audio).float() - - if self.use_derivative: - diff = [ - (encoded_poses[n, :] - encoded_poses[n - 1, :]) - for n in range(1, encoded_poses.shape[0]) - ] - diff.insert(0, torch.zeros_like(encoded_poses[0, :])) - encoded_poses = torch.hstack((encoded_poses, torch.stack(diff))) - - # return word_seq_tensor, pose_seq, audio, aux_info - return encoded_poses, encoded_poses - - def create_similarity_dataset(self, pickle_file: str, labelstxt_file: str) -> None: - """TODO""" - # Todo: 1. Thos function gets the pickle file that I made in the clustering.py(or flowgmm) process as well - # Todo: as the labels text file that I annotated in the Unity application. - # Todo: 2. Then I will creat those pairs of similarity and dissimilarity - # Todo: 3. Finally, we store the pairs into the class. - # Todo: We will use pairwise label and an extra loss in backpropagation process later. - - # 1. 
call preprocess load - ( - self.data_rnn, - self.labels, - self.pairwise_labels, - self.data_original, - ) = self.load_gesture_data(pickle_file, labelstxt_file) - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - self.data_original = (self.data_original - self.data_mean) / std - - target = torch.from_numpy(self.data_original) - - target = target.float() - # target = target.to(device) - with torch.no_grad(): - self.encoded_labeled_poses = self.rep_model.encoder(target) - if self.use_derivative: - diff = [ - ( - self.encoded_labeled_poses[n, :] - - self.encoded_labeled_poses[n - 1, :] - ) - for n in range(1, self.encoded_labeled_poses.shape[0]) - ] - diff.insert(0, torch.zeros_like(self.encoded_labeled_poses[0, :])) - self.encoded_labeled_poses = torch.cat( - (self.encoded_labeled_poses, torch.stack(diff)), dim=2 - ) - self.pairwise_enabled = True - pass - - def get_labeled_( - self, count: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """TODO""" - stack_pairs1 = torch.zeros( - count, - self.encoded_labeled_poses.shape[1], - self.encoded_labeled_poses.shape[2], - ) - stack_pairs2 = torch.zeros( - count, - self.encoded_labeled_poses.shape[1], - self.encoded_labeled_poses.shape[2], - ) - stack_labels = torch.zeros(count) - rnds = random.sample(range(1, len(self.pairwise_labels)), 3) - k = 0 - for rnd in rnds: - current_pair = self.pairwise_labels[rnd] - s1_ = self.encoded_labeled_poses[current_pair[0]] - s2_ = self.encoded_labeled_poses[current_pair[1]] - ss_label = current_pair[2] - stack_pairs1[k, :, :] = s1_ - stack_pairs2[k, :, :] = s2_ - stack_labels[k] = ss_label - k = k + 1 - - return stack_pairs1, stack_pairs2, stack_labels - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. - """ - self.lang_model = lang_model - - def load_gesture_data( - self, pre_processed_pickle_adress: str, labelstxt_file: str - ) -> Tuple[np.ndarray, np.ndarray, list, list]: - """TODO""" - loaded = pickle.load(open(pre_processed_pickle_adress, "rb")) - liaded_len = len(loaded) - loaded = np.hstack(loaded) - print("len loaded", len(loaded)) - print("Loaded successfully") - - data_latent_rnn = [] - data_latent_linear = np.zeros( - [ - len(loaded), - loaded[0]["latent_linear"].shape[0], - loaded[0]["latent_linear"].shape[1], - ] - ) - data_latent_linear_listwise = [] - data_original = [] - count = len(loaded) - # count = 4000 - for i in range(count): - # 1 - current_latent_linear = loaded[i]["latent_linear"] - current_latent_rnn = loaded[i]["latent_rnn"] - current_original = loaded[i]["original"] - - if len(current_original) != len(loaded[0]["original"]): - continue - - # 2 - # current_latent_linear = np.hstack(current_latent_linear) - current_latent_rnn = np.hstack(current_latent_rnn) - # current_original = np.hstack(current_original) - - # 3 - data_latent_linear[i] = current_latent_linear - data_latent_linear_listwise.append(current_latent_linear) - data_latent_rnn.append(current_latent_rnn) - data_original.append(current_original) - - # Should be constructed here since it is not obvious how many we will have at the end. 
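Sketch of the intended calling sequence for the pairwise-similarity extension above (not part of the patch; dataset and the pickle path are placeholders): create_similarity_dataset attaches the annotated pairs, after which get_labeled_ draws encoded pose pairs plus 0/1 labels for an auxiliary similarity loss.

dataset.create_similarity_dataset(
    "clustering_results.pk",    # hypothetical pickle produced by the clustering step
    "gesture_labels.txt",       # the annotation file added earlier in this patch
)
pairs_a, pairs_b, labels = dataset.get_labeled_(3)   # note: the sampler currently draws exactly 3 random pairs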
- data_latent_linear = np.zeros( - [ - len(data_latent_linear_listwise), - loaded[0]["latent_linear"].shape[0], - loaded[0]["latent_linear"].shape[1], - ] - ) - for i in range(len(data_latent_linear_listwise)): - data_latent_linear[i] = data_latent_linear_listwise[i] - - data_latent_rnn_ndarray = np.array(data_latent_rnn) - - first_order_labels = np.ones(len(data_latent_rnn_ndarray)) * -1 - - labels_list = [] - label_file = open(labelstxt_file, "r") - - for line in label_file: - str = line.split(",") - lbl = str[4] - left = int(str[1]) - middle = int(str[2]) - right = int(str[3]) - chance = random.random() - # if chance > 0.1: - # continue - if lbl == "neither": - # continue - labels_list.append([right, middle, 0]) - labels_list.append([left, middle, 0]) - first_order_labels[right] = 1 - first_order_labels[left] = 1 - first_order_labels[middle] = 1 - if lbl == "right": - labels_list.append([right, middle, 1]) - first_order_labels[right] = 1 - # first_order_labels[left] = 1 - first_order_labels[middle] = 1 - if lbl == "left": - labels_list.append([left, middle, 1]) - # first_order_labels[right] = 1 - first_order_labels[left] = 1 - first_order_labels[middle] = 1 - - # Todo: read from file - - return ( - data_latent_rnn_ndarray[:, 0:200], - first_order_labels, - (labels_list), - data_original, - ) - - -class TrinityDataset_with_cluster(Dataset): - """Contains information and parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - This class is #TODO - - Attributes: - lmdb_dir: A string filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An integer number of clips/entries in the original dataset. - lang_model: A 'Vocab' pre-trained word vector representation or None. - data_mean: A mean calculated from each video in the original dataset. - data_std: A standard deviation calculcated from each video. - pairwise_enabled: #TODO - use_derivative: Boolean to stack gradients onto data during training. - encoded_labeled_poses: #TODO - sentence_level: #TODO - RNNautoencoder_model: #TODO - rep_learning_dim: An integer dimension of the model (unused). - rep_model: A DAE neural net model loaded from the above checkpoint. - Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. 
- """ - - def __init__( - self, - args, - lmdb_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - data_mean, - data_std, - ): - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - self.use_derivative = args.use_derivative == "True" - self.kmeanmodel: DBSCAN | KMeans | AgglomerativeClustering = pickle.load( - open("../output/clustering_results/kmeans_model.pk", "rb") - ) - - if args.sentence_level == "True": - self.sentence_level = True - else: - self.sentence_level = False - - logging.info("Reading data '{}'...".format(lmdb_dir)) - if self.sentence_level: - preloaded_dir = lmdb_dir + "_sentence_level" + "_cache" - else: - preloaded_dir = lmdb_dir + "_cache" - - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - lmdb_dir, - preloaded_dir, - n_poses, - subdivision_stride, - pose_resampling_fps, - ) - data_sampler.run() - else: - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - # Representation model - checkpoint_path: str = args.rep_learning_checkpoint - self.rep_learning_dim: int = args.rep_learning_dim - ( - rep_learning_args, - rep_model, - rep_loss_fn, - rep_lang_model, - rep_out_dim, - ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") - self.rep_model: torch.nn.Module = rep_model.to("cpu") - self.rep_model.train(False) - # RNN autoencoder - checkpoint_path: str = args.autoencoder_checkpoint - - # RNNAutoencoder_args, RNNAutoencoder_model, RNNAutoencoder_fn,\ - # RNNAutoencoder_model, RNNAutoencoder_dim = utils.train_utils.load_checkpoint_and_model( - # checkpoint_path, device) - - ( - args, - rnn, - loss_fn, - lang_model, - out_dim, - ) = utils.train_utils.load_checkpoint_and_model( - checkpoint_path, device, "autoencoder" - ) - self.RNNAutoencoder_model: torch.nn.Module = rnn.to("cpu") - self.RNNAutoencoder_model.train(False) - - def __len__(self) -> int: - """Get the size of the dataset. - - Returns: - The number of samples in the dataset. 
- """ - return self.n_samples - - def __getitem__( - self, idx: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict, torch.Tensor]: - """TODO""" - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - word_seq, pose_seq, audio, aux_info, portion = sample - - def words_to_tensor( - lang: Vocab | None, words: list[list], end_time: float | None = None - ): - indexes = [lang.SOS_token] - for word in words: - if end_time is not None and word[1] > end_time: - break - indexes.append(lang.get_word_index(word[0])) - indexes.append(lang.EOS_token) - return torch.Tensor(indexes).long() - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - # Todo: Here we should apply rep_learning - target = torch.from_numpy(pose_seq) - target = target.float() - with torch.no_grad(): - encoded_poses = self.rep_model.encoder(target) - - encoded_poses = encoded_poses.reshape((encoded_poses.shape[0], -1)).float() - - if self.use_derivative: - diff = [ - (encoded_poses[n, :] - encoded_poses[n - 1, :]) - for n in range(1, encoded_poses.shape[0]) - ] - diff.insert(0, torch.zeros_like(encoded_poses[0, :])) - encoded_poses = torch.hstack((encoded_poses, torch.stack(diff))) - with torch.no_grad(): - out_pose, latent, mue, logvar = self.RNNAutoencoder_model( - encoded_poses.unsqueeze(0), encoded_poses.unsqueeze(0) - ) - latent = latent[: self.RNNAutoencoder_model.decoder.n_layers] - latent = latent.squeeze(1) - latent = latent.reshape((1, -1)).float() - # latent = latent.squeeze(0) - cluster_id = self.kmeanmodel.predict(latent.cpu().detach().numpy()) - cluster_id = torch.Tensor(cluster_id).long() - # to tensors - word_seq_tensor = words_to_tensor( - self.lang_model, word_seq, aux_info["end_time"] - ) - pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() - - # audio = torch.from_numpy(audio).float() - - # return cluster_id - return word_seq_tensor, encoded_poses, audio, aux_info, cluster_id - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. - """ - self.lang_model = lang_model - - -class TrinityDataset_sentencelevel(Dataset): - """Contains information and parameters of a (Trinity) dataset. - - This is a PyTorch Dataset subclass containing information of a Trinity dataset. - https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ - - This class is #TODO. - - Attributes: - lmdb_dir: A string filepath of the directory containing the actual dataset. - lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. - n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). - subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). - skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). - n_samples: An integer number of clips/entries in the original dataset. - lang_model: A 'Vocab' pre-trained word vector representation or None. - data_mean: A mean calculated from each video in the original dataset. 
- data_std: A standard deviation calculcated from each video. - args: #TODO - kmeanmodel: #TODO - sentence_level: #TODO - RNNAutoencoder_model: #TODO - use_derivative: #TODO - rep_learning_dim: An integer dimension of the model (unused). - rep_model: A DAE neural net model loaded from the above checkpoint. - Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. - """ - - def __init__( - self, - args: argparse.Namespace, - lmdb_dir: str, - n_poses: int, - subdivision_stride: int, - pose_resampling_fps: int, - data_mean: list[float], - data_std: list[float], - ): - """ """ - self.lmdb_dir = lmdb_dir - self.n_poses = n_poses - self.subdivision_stride = subdivision_stride - self.subdivision_stride_sentence = args.subdivision_stride_sentence - self.args = args - self.skeleton_resampling_fps = pose_resampling_fps - self.lang_model = None - self.data_mean = np.array(data_mean).squeeze() - self.data_std = np.array(data_std).squeeze() - self.use_derivative = args.use_derivative == "True" - - # Todo: fix adress -- Fixed - test = os.path.dirname(args.autoencoder_checkpoint) - self.kmeanmodel = pickle.load( - open( - os.path.dirname(args.autoencoder_checkpoint) - + "/clusters/kmeans_model.pk", - "rb", - ) - ) - # self.clustering_transforms = pickle.load(open(os.path.dirname(args.autoencoder_checkpoint) - # + '/clusters/transforms.pkl', 'rb')) - - if args.sentence_level == "True": - self.sentence_level = True - else: - self.sentence_level = False - - logging.info("Reading data '{}'...".format(lmdb_dir)) - - if self.sentence_level: - # preloaded_dir = lmdb_dir + '_sentence_level' + '_cache' - preloaded_dir = ( - args.model_save_path - + "lmdb/" - + os.path.basename((lmdb_dir)) - + "_sentence_level" - + "_cache" - ) - else: - # preloaded_dir = lmdb_dir + '_cache' - preloaded_dir = ( - args.model_save_path + "lmdb/" + os.path.basename((lmdb_dir)) + "_cache" - ) - - if not os.path.exists(args.model_save_path + "lmdb"): - os.mkdir(args.model_save_path + "lmdb") - if not os.path.exists(preloaded_dir): - data_sampler = DataPreprocessor( - args, - lmdb_dir, - preloaded_dir, - n_poses, - self.subdivision_stride_sentence, - pose_resampling_fps, - sentence_level=self.sentence_level, - ) - data_sampler.run() - else: - # Todo: Remove later - - # data_sampler = DataPreprocessor(args, lmdb_dir, preloaded_dir, n_poses, - # subdivision_stride, pose_resampling_fps, - # sentence_level=self.sentence_level) - # data_sampler.run() - logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) - - # init lmdb - self.lmdb_env: lmdb.Environment = lmdb.open( - preloaded_dir, readonly=True, lock=False - ) - with self.lmdb_env.begin() as txn: - self.n_samples = txn.stat()["entries"] - - # Representation model - checkpoint_path: str = args.rep_learning_checkpoint - self.rep_learning_dim: int = args.rep_learning_dim - ( - rep_learning_args, - rep_model, - rep_loss_fn, - rep_lang_model, - rep_out_dim, - ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") - self.rep_model = rep_model.to("cpu") - self.rep_model.train(False) - # RNN autoencoder - checkpoint_path: str = args.autoencoder_checkpoint - - # RNNAutoencoder_args, RNNAutoencoder_model, RNNAutoencoder_fn,\ - # RNNAutoencoder_model, RNNAutoencoder_dim = utils.train_utils.load_checkpoint_and_model( - # checkpoint_path, device) - - ( - args_rnn, - rnn, - loss_fn, - lang_model, - out_dim, - ) = utils.train_utils.load_checkpoint_and_model( - checkpoint_path, device, "autoencoder_vq" - ) - self.RNNAutoencoder_model: torch.nn.Module = 
rnn.to("cpu") - self.RNNAutoencoder_model.train(False) - - def __len__(self) -> int: - """Get the size of the dataset. - - Returns: - The number of samples in the dataset. - """ - return self.n_samples - - def __getitem__( - self, idx: int - ) -> Tuple[ - torch.Tensor, - torch.Tensor, - torch.Tensor, - dict, - torch.Tensor, - torch.Tensor, - torch.Tensor, - ]: - """TODO""" - with self.lmdb_env.begin(write=False) as txn: - key = "{:010}".format(idx).encode("ascii") - sample = txn.get(key) - - sample = pyarrow.deserialize(sample) - ( - word_seq, - pose_seq, - audio_raws, - audio_mels, - aux_info, - sentence_leve_latents, - GP3_Embedding, - ) = sample - - def words_to_tensor( - lang: Vocab | None, words: list[list], end_time: float | None = None - ) -> torch.Tensor: - # indexes = [lang.SOS_token] - indexes = [] - if len(words) <= 0: - print() - for word in words: - if end_time is not None and word[1] > end_time: - break - indexes.append(lang.get_word_index(word[0])) - # indexes.append(lang.EOS_token) - return torch.Tensor(indexes).long() - - def poseIndex_to_tensor(sop, eop, poses_indicies): - indexes = [sop] - for pose_index in poses_indicies: - indexes.append(pose_index) - indexes.append(eop) - return torch.Tensor(indexes).long() - - # normalize - std = np.clip(self.data_std, a_min=0.01, a_max=None) - pose_seq = (pose_seq - self.data_mean) / std - - # to tensors - word_seq_tensor = words_to_tensor( - self.lang_model, word_seq, aux_info["end_time"] - ) - pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() - - sentence_leve_latents = torch.from_numpy(sentence_leve_latents) - sentence_leve_latents = sentence_leve_latents.reshape( - [sentence_leve_latents.shape[0], -1] - ).float() - - # Todo: we may need do some preprocessing. (e.g. padding, resampling, etc.) - # Todo: Move list-->ndarray to the preprocessing. - audio_raw_for_now = False - if audio_raw_for_now: - audio = audio_raws - else: - audio = audio_mels - audio = np.array(audio) - audio = torch.from_numpy(audio).float() - # audio = torch.reshape(audio, (sentence_leve_latents.shape[0], -1)) - # audio = torch.reshape(audio, (4, -1)) - - # return cluster_id - cluster_ids = np.zeros(sentence_leve_latents.shape[0]) - if self.RNNAutoencoder_model.vq == True: - ( - loss_vq, - quantized, - perplexity_vq, - encodings, - ) = self.RNNAutoencoder_model.vq_layer(sentence_leve_latents) - cluster_ids = torch.argmax(encodings, dim=1) - - # q = poseIndex_to_tensor(sop=512, eop=513, - # poses_indicies=cluster_ids.cpu().detach().numpy()) - # cluster_ids = q - # print(cluster_ids) - else: - cluster_ids = self.kmeanmodel.predict( - # self.clustering_transforms['scalar'].transform - (sentence_leve_latents.cpu().detach().numpy()) - ) - cluster_ids = torch.from_numpy(cluster_ids).long() - # cluster_id = torch.Tensor(cluster_id).long() - # print(cluster_ids) - - # clusters_portion = self.clustering_transforms['portion'][cluster_ids] - # clusters_portion = torch.from_numpy(clusters_portion) - - # print((cluster_ids)) - try: - GP3_Embedding = torch.from_numpy(GP3_Embedding).float() - except: - pass - - return ( - word_seq_tensor, - pose_seq, - audio, - aux_info, - sentence_leve_latents, - cluster_ids, - GP3_Embedding, - ) - - def set_lang_model(self, lang_model: Vocab) -> None: - """Set the word vector representation to be used with the dataset. - - Modifies the internal state of the object at the attribute 'lang_model'. - - Args: - lang_model: A pre-trained word vector representation contained in the 'Vocab' class. 
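A condensed sketch (not part of the patch) of how __getitem__ above discretises the sentence-level latents into cluster ids: either from the VQ codebook assignments or, when the autoencoder has no VQ layer, from the saved k-means model. rnn_model, kmeans_model and sentence_leve_latents stand in for the attributes loaded in __init__.

import torch

if rnn_model.vq:
    _, _, _, encodings = rnn_model.vq_layer(sentence_leve_latents)
    cluster_ids = torch.argmax(encodings, dim=1)          # one codebook index per latent
else:
    cluster_ids = torch.from_numpy(
        kmeans_model.predict(sentence_leve_latents.cpu().detach().numpy())
    ).long()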
- """ - self.lang_model = lang_model +""" +""" + + +from __future__ import annotations +import logging +import os +import pickle +import random +from typing import Tuple + +import numpy as np +import lmdb as lmdb +import torch +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import Dataset +from torch.utils.data.dataloader import default_collate +from data_loader.data_preprocessor import DataPreprocessor +import pyarrow +from configargparse import argparse +from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering + +from model.vocab import Vocab +import utils + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +def word_seq_collate_fn( + data: list, +) -> ( + Tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict, + torch.Tensor, + torch.Tensor, + torch.Tensor, + ] + | Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, dict] +): + """Collate function for loading word sequences in variable lengths. + + This function returns extra values depending if sentence_leve is True or False. + sentence_leve is set to always True and must be manually changed. + + Args: + data: A list of samples including the following values in each element: + word_seq: A list of Tensors of word vector representations. + words_lengths: A list of Tensors of word lengths in word_seq. + poses_seq: A list of Tensors of poses/gestures. + audio: A list of Tensors of audio data. + + Returns: + A 5-Tuple or 8-Tuple: + + 8-Tuple (if sentence_level is set to True): + word_seq: A Tensor of word vector representations. + words_lengths: A Tensor of word lengths in word_seq. + poses_seq: A Tensor of poses/gestures. + audio: A Tensor of audio data. + aux_info: A dict containing info such as name, start/end frames and times. + sentence_leve_latents: #TODO + cluster_portion: #TODO + GPT3_Embedding: #TODO + + 5-Tuple: + The first five values in the 8-Tuple. + """ + # sort a list by sequence length (descending order) to use pack_padded_sequence + data.sort(key=lambda x: len(x[0]), reverse=True) + + # separate source and target sequences + # Todo: fix this using args + sentence_leve = True + if not sentence_leve: + ( + word_seq, + poses_seq, + audio, + aux_info, + ) = zip(*data) + else: + ( + word_seq, + poses_seq, + audio, + aux_info, + sentence_leve_latents, + cluster_portion, + GPT3_Embedding, + ) = zip(*data) + + # merge sequences + words_lengths = torch.LongTensor([len(x) for x in word_seq]) + word_seq = pad_sequence(word_seq, batch_first=True).long() + + poses_seq = default_collate(poses_seq) + audio = default_collate(audio) + if sentence_leve: + sentence_leve_latents = default_collate(sentence_leve_latents) + cluster_portion = default_collate(cluster_portion) + GPT3_Embedding = default_collate(GPT3_Embedding) + + # audio = default_collate(audio) + aux_info = {key: default_collate([d[key] for d in aux_info]) for key in aux_info[0]} + + if sentence_leve: + return ( + word_seq, + words_lengths, + poses_seq, + audio, + aux_info, + sentence_leve_latents, + cluster_portion, + GPT3_Embedding, + ) + else: + return word_seq, words_lengths, poses_seq, audio, aux_info + + +class TrinityDataset(Dataset): + """Contains information and associated parameters of a (Trinity) dataset. + + This is a PyTorch Dataset subclass containing information of a Trinity dataset. 
+ https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ + + Attributes: + lmdb_dir: A string representing the filepath of the directory containing the actual dataset. + lmdb_env: A Lmdb ('Lightning Memory-Mapped' Database) object loaded from .mdb files located at the lmdb_dir. + n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + n_samples: An int representing the number of clips/entries in the original dataset. + lang_model: A pre-trained (English) language vector representation contained in the 'Vocab' custom class. + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video in the original dataset. + """ + + def __init__( + self, + args: argparse.Namespace, + lmdb_dir: str, + n_poses: int, + subdivision_stride: int, + pose_resampling_fps: int, + data_mean: list[float], + data_std: list[float], + ): + """Initialize with multiple dataset parameters. + + The args argument must contain the following keys: + name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). + rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. + autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. + sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). + + Args: + args: A configargparse object containing parameters (See above). + lmdb_dir: A string representing the filepath of the directory containing the actual dataset. + n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). + pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video in the original dataset. + """ + self.lmdb_dir = lmdb_dir + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.skeleton_resampling_fps = pose_resampling_fps + self.lang_model = None + self.data_mean = np.array(data_mean).squeeze() + self.data_std = np.array(data_std).squeeze() + + logging.info("Reading data '{}'...".format(lmdb_dir)) + preloaded_dir = lmdb_dir + "_cache" + if not os.path.exists(preloaded_dir): + data_sampler = DataPreprocessor( + args, + lmdb_dir, + preloaded_dir, + n_poses, + subdivision_stride, + pose_resampling_fps, + sentence_level=False, + ) + data_sampler.run() + else: + logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) + + # init lmdb + self.lmdb_env: lmdb.Environment = lmdb.open( + preloaded_dir, readonly=True, lock=False + ) + with self.lmdb_env.begin() as txn: + self.n_samples = txn.stat()["entries"] + + def __len__(self) -> int: + """Get the size of the dataset. + + Returns: + The number of samples in the original dataset. 
+ """ + return self.n_samples + + def __getitem__( + self, idx: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]: + """Get an item at a specific index in the dataset. + + Retrieves a specific entry in the original dataset and + associated information of that entry such as words, audio, poses, etc. + + Args: + idx: The index of the item to retrieve. + + Returns: + A 4-Tuple: + word_seq_tensor: A Tensor of word vector representations. + pose_seq: A Tensor of pose/gesture data. + audio: A Tensor of audio data. + aux_info: A dict containing the string keys: + 'vid': A string name for the information. + 'start_frame_no': An integer of the start index. + 'end_frame_no': An integer of the end index. + 'start_time': A float of the start time of the clip. + 'end_time': A float of the end time of the clip. + """ + with self.lmdb_env.begin(write=False) as txn: + key = "{:010}".format(idx).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + word_seq, pose_seq, audio, aux_info = sample + + def words_to_tensor(lang, words, end_time=None): + indexes = [lang.SOS_token] + for word in words: + if end_time is not None and word[1] > end_time: + break + indexes.append(lang.get_word_index(word[0])) + indexes.append(lang.EOS_token) + return torch.Tensor(indexes).long() + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq: torch.Tensor = (pose_seq - self.data_mean) / std + + # to tensors + word_seq_tensor = words_to_tensor( + self.lang_model, word_seq, aux_info["end_time"] + ) + pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() + audio = torch.from_numpy(audio).float() + + return word_seq_tensor, pose_seq, audio, aux_info + + def set_lang_model(self, lang_model: Vocab) -> None: + """Set the word vector representation to be used with the dataset. + + Modifies the internal state of the object at the attribute 'lang_model'. + + Args: + lang_model: A pre-trained word vector representation contained in the 'Vocab' class. + """ + self.lang_model = lang_model + + +class TrinityDataset_DAE(Dataset): + """Contains information and parameters of a (Trinity) dataset. + + This is a PyTorch Dataset subclass containing information of a Trinity dataset. + https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ + + This class is focussed on the pose/gesture data only. + + Attributes: + lmdb_dir: A string filepath of the directory containing the actual dataset. + lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. + n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + n_samples: An integer number of clips/entries in the original dataset. + lang_model: A 'Vocab' pre-trained word vector representation. + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video. + all_poses: A list where each element is a dict containing string keys: + 'original': A Tensor of each pose/gesture data. + 'noisy': A Tensor with the same values as 'original'. 
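+                (Noise is not added here; the DAE's dropout layer injects noise during training instead.)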
+ """ + + def __init__( + self, + args: argparse.Namespace, + lmdb_dir: str, + n_poses: int, + subdivision_stride: int, + pose_resampling_fps: int, + data_mean: list[float], + data_std: list[float], + ): + """Initialize with dataset location and several parameters. + + The args argument must contain the following keys: + name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). + rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. + autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. + sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). + + Args: + args: A configargparse object containing parameters (See above). + lmdb_dir: A string representing the filepath of the directory containing the actual dataset. + n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). + pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video in the original dataset. + """ + self.lmdb_dir = lmdb_dir + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.skeleton_resampling_fps = pose_resampling_fps + self.lang_model = None + self.data_mean = np.array(data_mean).squeeze() + self.data_std = np.array(data_std).squeeze() + + logging.info("Reading data '{}'...".format(lmdb_dir)) + preloaded_dir = lmdb_dir + "_cache" + if not os.path.exists(preloaded_dir): + data_sampler = DataPreprocessor( + args, + lmdb_dir, + preloaded_dir, + n_poses, + subdivision_stride, + pose_resampling_fps, + ) + data_sampler.run() + else: + logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) + + # init lmdb + self.lmdb_env: lmdb.Environment = lmdb.open( + preloaded_dir, readonly=True, lock=False + ) + with self.lmdb_env.begin() as txn: + self.n_samples = txn.stat()["entries"] + + # Initiate all poses + self.all_poses = [] + self.create_all_poses() + print("data init finished!") + + def __len__(self) -> int: + """Get the size of the dataset. + + Returns: + The number of samples in the dataset. + """ + # return 500 + # return (self.n_samples * self.n_poses) //5 + return self.n_samples + + def create_all_poses(self) -> None: + """Move all poses/gestures from database into object. + + Modifies the internal state of the object at attribute 'all_poses'. + """ + with self.lmdb_env.begin(write=False) as txn: + for i in range(self.n_samples): + key = "{:010}".format(i).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + word_seq, pose_seq, audio, aux_info = sample + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq = (pose_seq - self.data_mean) / std + + for j in range(0, len(pose_seq)): + original = pose_seq[j, :] + var_coef = 1 # std + sigma = 1 + # noisy = self.add_noise(original, 0.0, std) + noisy = original # dropout layer adds noise instead + self.all_poses.append({"original": original, "noisy": noisy}) + + def get_item_Memory_Efficient(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Get the item at a specific index in the dataset. 
+ + This method improves memory performance by returning + the pose/gesture data of the item only. + + Args: + idx: An integer index of the item to get. + + Returns: + A 2-Tuple: + noisy: A Tensor of pose/gesture data. + original: A Tensor with the same values as 'noisy'. + """ + idx_lmdb = idx // self.n_poses + + with self.lmdb_env.begin(write=False) as txn: + key = "{:010}".format(idx_lmdb).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + word_seq, pose_seq, audio, aux_info = sample + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq = (pose_seq - self.data_mean) / std + + original = pose_seq[idx % self.n_poses, :] + noisy = original + + original = ( + torch.from_numpy(original).reshape((original.shape[0], -1)).float() + ) + noisy = torch.from_numpy(noisy).reshape((noisy.shape[0], -1)).float() + + return noisy, original + + def add_noise( + self, x: np.ndarray, variance_multiplier: np.ndarray, sigma: np.ndarray + ) -> np.ndarray: + """Add Gaussian noise to the input data. + + The noise added is based on the variance_multiplier and sigma array values. + + Args: + x: A numeric numpy array of data. + variance_multiplier: A numeric numpy array of coefficients to multiple variance of the noise on + sigma: A numeric numpy array of the variance of the dataset + + Returns: + A numeric numpy array of the data with noise added. + """ + eps = 1e-15 + noise = np.random.normal( + 0.0, np.multiply(sigma, variance_multiplier) + eps, x.shape + ) + x = x + noise + return x + + def add_noise2(self, x: np.ndarray, prob: float) -> np.ndarray: + """Add Gaussian noise to the input data. + + The noise is added by zeroing the specific elements. + + Args: + x: A numeric numpy array of data. + prob: A float percentage of adding noise to a single element. + + Returns: + A numeric numpy array of the data with noise added. + """ + for i in range(len(x)): + rnd = random.random() + if rnd < prob: + x[i] = 0 + return x + + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Get the item at a specific index in the dataset. + + This method returns the data exactly as stored. The 'noisy' data does + not have noise added but assumes that noise has previously been added. + + Args: + idx: An integer index of the item to get. + + Returns: + A 2-Tuple: + noisy: A Tensor of pose/gesture data (with noise). + original: A Tensor of pose/gesture data. + """ + # Keep and document for only large datasets + # Todo: I think this way is more efficient than the prev. approach + # I need to double check and make sure everything works as before. + # return self.get_item_Memory_Efficient(idx) + # original, noisy = self.get_item_Memory_Efficient(idx) + # return noisy, original + + original = self.all_poses[idx]["original"] + noisy = self.all_poses[idx]["noisy"] + + original = torch.from_numpy(original).reshape((original.shape[0], -1)).float() + noisy = torch.from_numpy(noisy).reshape((noisy.shape[0], -1)).float() + + return noisy, original + + def set_lang_model(self, lang_model: Vocab) -> None: + """Set the word vector representation to be used with the dataset. + + Modifies the internal state of the object at the attribute 'lang_model'. + + Args: + lang_model: A pre-trained word vector representation contained in the 'Vocab' class. + """ + self.lang_model = lang_model + + +class TrinityDataset_DAEed_Autoencoder(Dataset): + """Contains information and parameters of a (Trinity) dataset. 
+ + This is a PyTorch Dataset subclass containing information of a Trinity dataset. + https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ + + This class is #TODO. + + Attributes: + lmdb_dir: A string filepath of the directory containing the actual dataset. + lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. + n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + n_samples: An integer number of clips/entries in the original dataset. + lang_model: A 'Vocab' pre-trained word vector representation or None. + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video. + pairwise_enabled: #TODO + use_derivative: Boolean to stack gradients to data during training. + encoded_labeled_poses: #TODO + rep_learning_dim: An integer dimension of the model (unused). + rep_learning_checkpoint: A string filepath to saved DAE model checkpoints. + rep_model: A DAE neural net model loaded from the above checkpoint. + Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. + """ + + def __init__( + self, + args: argparse.Namespace, + lmdb_dir: str, + n_poses: int, + subdivision_stride: int, + pose_resampling_fps: int, + data_mean: list[float], + data_std: list[float], + ): + """Initialize with dataset location and several parameters. + + The args argument must contain the following keys: + name: A string name of the model (ex. 'DAE' or 'autoencoder_vq'). + rep_learning_checkpoint: If name is not 'DAE', a string filepath to a saved 'DAE' checkpoint model. + autoencoder_checkpoint: If sentence level is True, a string filepath to a saved VQVAE checkpoint model. + sentence_frame_length: An integer number of frames in each clip (for a sentence instead of gesture). + rep_learning_checkpoint: A string filepath to saved model checkpoints. + use_derivative: A boolean whether to use derivatives. + + Args: + args: A configargparse object containing parameters (See above). + lmdb_dir: A string representing the filepath of the directory containing the actual dataset. + n_poses: An int representing the number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An int representing the number of frames between the start of one clip and the start of the next clip (clips can overlap). + pose_resampling_fps: An int representing the frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + data_mean: A mean calculated from each video in the original dataset. + data_std: A standard deviation calculcated from each video in the original dataset. 
+ """ + self.lmdb_dir = lmdb_dir + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.skeleton_resampling_fps = pose_resampling_fps + self.lang_model: Vocab | None = None + self.data_mean = np.array(data_mean).squeeze() + self.data_std = np.array(data_std).squeeze() + + # We will change it to true once we call the creat_similarity_dataset + self.pairwise_enabled: bool = False + self.use_derivative: bool = args.use_derivative == "True" + + logging.info("Reading data '{}'...".format(lmdb_dir)) + preloaded_dir = lmdb_dir + "_cache" + if not os.path.exists(preloaded_dir): + data_sampler = DataPreprocessor( + args, + lmdb_dir, + preloaded_dir, + n_poses, + subdivision_stride, + pose_resampling_fps, + ) + data_sampler.run() + else: + logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) + + # init lmdb + self.lmdb_env: lmdb.Environment = lmdb.open( + preloaded_dir, readonly=True, lock=False + ) + with self.lmdb_env.begin() as txn: + self.n_samples = txn.stat()["entries"] + + # Todo: we need to initiate pre-trained representation learning model + checkpoint_path: str = args.rep_learning_checkpoint + self.rep_learning_dim: int = args.rep_learning_dim + ( + rep_learning_args, + rep_model, + rep_loss_fn, + rep_lang_model, + rep_out_dim, + ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") + self.rep_model: torch.nn.Module = rep_model.to("cpu") + self.rep_model.train(False) + + def __len__(self) -> int: + """Get the number of samples in the dataset. + + Returns: + The integer size of samples in the dataset. + """ + return self.n_samples + + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Get the item at a specific index in the dataset. + + Args: + idx: An integer index of the item to get. + + Returns: + A 2-Tuple: + encoded_poses: A Tensor of pose/gesture data. + encoded_poses: The same tensor as above. 
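+
+        Both returned elements are identical because the autoencoder is trained
+        to reconstruct its own (DAE-encoded) input.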
+        """
+        with self.lmdb_env.begin(write=False) as txn:
+            key = "{:010}".format(idx).encode("ascii")
+            sample = txn.get(key)
+
+            sample: Tuple[
+                np.ndarray, np.ndarray, np.ndarray, dict
+            ] = pyarrow.deserialize(sample)
+            word_seq, pose_seq, audio, aux_info = sample
+
+        # normalize
+        std = np.clip(self.data_std, a_min=0.01, a_max=None)
+        pose_seq = (pose_seq - self.data_mean) / std
+
+        # Todo: Here we should apply rep_learning
+        target = torch.from_numpy(pose_seq)
+        # target = torch.unsqueeze(target, 2)
+        target = target.float()
+        # target = target.to(device)
+        with torch.no_grad():
+            if self.rep_model.encoder is None:  # for ablation study
+                encoded_poses = target
+            else:
+                encoded_poses: torch.Tensor = self.rep_model.encoder(target)
+
+        # encoded_poses = torch.squeeze(encoded_poses, 2)
+        # encoded_poses = encoded_poses.to('cpu')
+        # reconstructed = encoded_poses.detach().numpy()
+
+        # to tensors
+        # word_seq_tensor = words_to_tensor(self.lang_model, word_seq, aux_info['end_time'])
+        # pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float()
+        encoded_poses = encoded_poses.reshape((encoded_poses.shape[0], -1)).float()
+        # audio = torch.from_numpy(audio).float()
+
+        if self.use_derivative:
+            diff = [
+                (encoded_poses[n, :] - encoded_poses[n - 1, :])
+                for n in range(1, encoded_poses.shape[0])
+            ]
+            diff.insert(0, torch.zeros_like(encoded_poses[0, :]))
+            encoded_poses = torch.hstack((encoded_poses, torch.stack(diff)))
+
+        # return word_seq_tensor, pose_seq, audio, aux_info
+        return encoded_poses, encoded_poses
+
+    def create_similarity_dataset(self, pickle_file: str, labelstxt_file: str) -> None:
+        """Create similarity/dissimilarity gesture pairs for a pairwise loss.
+
+        Loads the pickle file produced by the clustering step together with the
+        labels text file annotated in the Unity application, builds pairs of
+        similar and dissimilar gestures, and stores them on the object so they
+        can be used as an extra pairwise loss during backpropagation.
+
+        Modifies the internal state of the object ('encoded_labeled_poses',
+        'pairwise_enabled').
+
+        Args:
+            pickle_file: A string filepath to the pre-processed clustering pickle file.
+            labelstxt_file: A string filepath to the annotated labels text file.
+        """
+        # Todo: 1. This function gets the pickle file that I made in the clustering.py (or flowgmm) process as well
+        # Todo: as the labels text file that I annotated in the Unity application.
+        # Todo: 2. Then I will create those pairs of similarity and dissimilarity
+        # Todo: 3. Finally, we store the pairs into the class.
+        # Todo: We will use pairwise labels and an extra loss in the backpropagation process later.
+
+        # 1. Load the preprocessed gesture data and pairwise labels.
+        (
+            self.data_rnn,
+            self.labels,
+            self.pairwise_labels,
+            self.data_original,
+        ) = self.load_gesture_data(pickle_file, labelstxt_file)
+
+        # normalize
+        std = np.clip(self.data_std, a_min=0.01, a_max=None)
+        self.data_original = (self.data_original - self.data_mean) / std
+
+        target = torch.from_numpy(self.data_original)
+
+        target = target.float()
+        # target = target.to(device)
+        with torch.no_grad():
+            self.encoded_labeled_poses = self.rep_model.encoder(target)
+            if self.use_derivative:
+                diff = [
+                    (
+                        self.encoded_labeled_poses[n, :]
+                        - self.encoded_labeled_poses[n - 1, :]
+                    )
+                    for n in range(1, self.encoded_labeled_poses.shape[0])
+                ]
+                diff.insert(0, torch.zeros_like(self.encoded_labeled_poses[0, :]))
+                self.encoded_labeled_poses = torch.cat(
+                    (self.encoded_labeled_poses, torch.stack(diff)), dim=2
+                )
+        self.pairwise_enabled = True
+
+    def get_labeled_(
+        self, count: int
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Randomly sample labeled gesture pairs for the pairwise similarity loss.
+
+        Args:
+            count: An integer number of pairs to sample.
+
+        Returns:
+            A 3-Tuple:
+                stack_pairs1: A Tensor of the first gesture of each sampled pair.
+                stack_pairs2: A Tensor of the second gesture of each sampled pair.
+                stack_labels: A Tensor of the pairwise similarity labels.
+        """
+        stack_pairs1 = torch.zeros(
+            count,
+            self.encoded_labeled_poses.shape[1],
+            self.encoded_labeled_poses.shape[2],
+        )
+        stack_pairs2 = torch.zeros(
+            count,
+            self.encoded_labeled_poses.shape[1],
+            self.encoded_labeled_poses.shape[2],
+        )
+        stack_labels = torch.zeros(count)
+        # sample 'count' distinct pairwise labels
+        rnds = random.sample(range(1, len(self.pairwise_labels)), count)
+        k = 0
+        for rnd in rnds:
+            current_pair = self.pairwise_labels[rnd]
+            s1_ = self.encoded_labeled_poses[current_pair[0]]
+            s2_ = self.encoded_labeled_poses[current_pair[1]]
+            ss_label = current_pair[2]
+            stack_pairs1[k, :, :] = s1_
+            stack_pairs2[k, :, :] = s2_
+            stack_labels[k] = ss_label
+            k = k + 1
+
+        return stack_pairs1, stack_pairs2, stack_labels
+
+    def set_lang_model(self, lang_model: Vocab) -> None:
+        """Set the word vector representation to be used with the dataset.
+
+        Modifies the internal state of the object at the attribute 'lang_model'.
+
+        Args:
+            lang_model: A pre-trained word vector representation contained in the 'Vocab' class.
+        """
+        self.lang_model = lang_model
+
+    def load_gesture_data(
+        self, pre_processed_pickle_address: str, labelstxt_file: str
+    ) -> Tuple[np.ndarray, np.ndarray, list, list]:
+        """Load clustered gesture data and pairwise similarity annotations.
+
+        Args:
+            pre_processed_pickle_address: A string filepath to the pickle file produced by the clustering step.
+            labelstxt_file: A string filepath to the annotated labels text file.
+
+        Returns:
+            A 4-Tuple:
+                data_latent_rnn_ndarray: A numpy array of RNN latent representations (first 200 dimensions).
+                first_order_labels: A numpy array flagging which samples appear in a labeled pair.
+                labels_list: A list of [index_a, index_b, label] pairwise similarity annotations.
+                data_original: A list of the original pose sequences.
+        """
+        loaded = pickle.load(open(pre_processed_pickle_address, "rb"))
+        loaded_len = len(loaded)
+        loaded = np.hstack(loaded)
+        print("len loaded", len(loaded))
+        print("Loaded successfully")
+
+        data_latent_rnn = []
+        data_latent_linear = np.zeros(
+            [
+                len(loaded),
+                loaded[0]["latent_linear"].shape[0],
+                loaded[0]["latent_linear"].shape[1],
+            ]
+        )
+        data_latent_linear_listwise = []
+        data_original = []
+        count = len(loaded)
+        # count = 4000
+        for i in range(count):
+            # 1
+            current_latent_linear = loaded[i]["latent_linear"]
+            current_latent_rnn = loaded[i]["latent_rnn"]
+            current_original = loaded[i]["original"]
+
+            if len(current_original) != len(loaded[0]["original"]):
+                continue
+
+            # 2
+            # current_latent_linear = np.hstack(current_latent_linear)
+            current_latent_rnn = np.hstack(current_latent_rnn)
+            # current_original = np.hstack(current_original)
+
+            # 3
+            data_latent_linear[i] = current_latent_linear
+            data_latent_linear_listwise.append(current_latent_linear)
+            data_latent_rnn.append(current_latent_rnn)
+            data_original.append(current_original)
+
+        # Should be constructed here since it is not obvious how many we will have at the end.
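+        # Clips whose 'original' length differs from the first clip were skipped
+        # above, so the preallocated array may contain unused rows; rebuild it
+        # from the filtered listwise copy instead.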
+        data_latent_linear = np.zeros(
+            [
+                len(data_latent_linear_listwise),
+                loaded[0]["latent_linear"].shape[0],
+                loaded[0]["latent_linear"].shape[1],
+            ]
+        )
+        for i in range(len(data_latent_linear_listwise)):
+            data_latent_linear[i] = data_latent_linear_listwise[i]
+
+        data_latent_rnn_ndarray = np.array(data_latent_rnn)
+
+        first_order_labels = np.ones(len(data_latent_rnn_ndarray)) * -1
+
+        labels_list = []
+        with open(labelstxt_file, "r") as label_file:
+            for line in label_file:
+                parts = line.split(",")
+                lbl = parts[4]
+                left = int(parts[1])
+                middle = int(parts[2])
+                right = int(parts[3])
+                chance = random.random()
+                # if chance > 0.1:
+                #     continue
+                if lbl == "neither":
+                    # continue
+                    labels_list.append([right, middle, 0])
+                    labels_list.append([left, middle, 0])
+                    first_order_labels[right] = 1
+                    first_order_labels[left] = 1
+                    first_order_labels[middle] = 1
+                if lbl == "right":
+                    labels_list.append([right, middle, 1])
+                    first_order_labels[right] = 1
+                    # first_order_labels[left] = 1
+                    first_order_labels[middle] = 1
+                if lbl == "left":
+                    labels_list.append([left, middle, 1])
+                    # first_order_labels[right] = 1
+                    first_order_labels[left] = 1
+                    first_order_labels[middle] = 1
+
+        # Todo: read from file
+
+        return (
+            data_latent_rnn_ndarray[:, 0:200],
+            first_order_labels,
+            labels_list,
+            data_original,
+        )
+
+
+class TrinityDataset_with_cluster(Dataset):
+    """Contains information and parameters of a (Trinity) dataset.
+
+    This is a PyTorch Dataset subclass containing information of a Trinity dataset.
+    https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/
+
+    This class is #TODO
+
+    Attributes:
+        lmdb_dir: A string filepath of the directory containing the actual dataset.
+        lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir.
+        n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)).
+        subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap).
+        skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps).
+        n_samples: An integer number of clips/entries in the original dataset.
+        lang_model: A 'Vocab' pre-trained word vector representation or None.
+        data_mean: A mean calculated from each video in the original dataset.
+        data_std: A standard deviation calculated from each video.
+        pairwise_enabled: #TODO
+        use_derivative: Boolean flag to stack frame-to-frame differences (derivatives) onto the data during training.
+        encoded_labeled_poses: #TODO
+        sentence_level: #TODO
+        RNNAutoencoder_model: #TODO
+        rep_learning_dim: An integer dimension of the model (unused).
+        rep_model: A DAE neural net model loaded from the above checkpoint.
+            Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py.
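+        kmeanmodel: A clustering model (e.g. KMeans) loaded from
+            '../output/clustering_results/kmeans_model.pk', used to assign cluster ids to gesture latents.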
+    """
+
+    def __init__(
+        self,
+        args,
+        lmdb_dir,
+        n_poses,
+        subdivision_stride,
+        pose_resampling_fps,
+        data_mean,
+        data_std,
+    ):
+        """Initialize with dataset location and several parameters.
+
+        See TrinityDataset_DAEed_Autoencoder.__init__ for the meaning of the
+        parameters; args must additionally provide 'sentence_level',
+        'use_derivative', 'rep_learning_checkpoint', 'rep_learning_dim' and
+        'autoencoder_checkpoint'.
+        """
+        self.lmdb_dir = lmdb_dir
+        self.n_poses = n_poses
+        self.subdivision_stride = subdivision_stride
+        self.skeleton_resampling_fps = pose_resampling_fps
+        self.lang_model = None
+        self.data_mean = np.array(data_mean).squeeze()
+        self.data_std = np.array(data_std).squeeze()
+        self.use_derivative = args.use_derivative == "True"
+        self.kmeanmodel: DBSCAN | KMeans | AgglomerativeClustering = pickle.load(
+            open("../output/clustering_results/kmeans_model.pk", "rb")
+        )
+
+        if args.sentence_level == "True":
+            self.sentence_level = True
+        else:
+            self.sentence_level = False
+
+        logging.info("Reading data '{}'...".format(lmdb_dir))
+        if self.sentence_level:
+            preloaded_dir = lmdb_dir + "_sentence_level" + "_cache"
+        else:
+            preloaded_dir = lmdb_dir + "_cache"
+
+        if not os.path.exists(preloaded_dir):
+            data_sampler = DataPreprocessor(
+                args,
+                lmdb_dir,
+                preloaded_dir,
+                n_poses,
+                subdivision_stride,
+                pose_resampling_fps,
+            )
+            data_sampler.run()
+        else:
+            logging.info("Found pre-loaded samples from {}".format(preloaded_dir))
+
+        # init lmdb
+        self.lmdb_env: lmdb.Environment = lmdb.open(
+            preloaded_dir, readonly=True, lock=False
+        )
+        with self.lmdb_env.begin() as txn:
+            self.n_samples = txn.stat()["entries"]
+
+        # Representation model
+        checkpoint_path: str = args.rep_learning_checkpoint
+        self.rep_learning_dim: int = args.rep_learning_dim
+        (
+            rep_learning_args,
+            rep_model,
+            rep_loss_fn,
+            rep_lang_model,
+            rep_out_dim,
+        ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE")
+        self.rep_model: torch.nn.Module = rep_model.to("cpu")
+        self.rep_model.train(False)
+        # RNN autoencoder
+        checkpoint_path: str = args.autoencoder_checkpoint
+
+        # RNNAutoencoder_args, RNNAutoencoder_model, RNNAutoencoder_fn,\
+        #     RNNAutoencoder_model, RNNAutoencoder_dim = utils.train_utils.load_checkpoint_and_model(
+        #     checkpoint_path, device)
+
+        (
+            args_rnn,
+            rnn,
+            loss_fn,
+            lang_model,
+            out_dim,
+        ) = utils.train_utils.load_checkpoint_and_model(
+            checkpoint_path, device, "autoencoder"
+        )
+        self.RNNAutoencoder_model: torch.nn.Module = rnn.to("cpu")
+        self.RNNAutoencoder_model.train(False)
+
+    def __len__(self) -> int:
+        """Get the size of the dataset.
+
+        Returns:
+            The number of samples in the dataset.
+ """ + return self.n_samples + + def __getitem__( + self, idx: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict, torch.Tensor]: + """TODO""" + with self.lmdb_env.begin(write=False) as txn: + key = "{:010}".format(idx).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + word_seq, pose_seq, audio, aux_info, portion = sample + + def words_to_tensor( + lang: Vocab | None, words: list[list], end_time: float | None = None + ): + indexes = [lang.SOS_token] + for word in words: + if end_time is not None and word[1] > end_time: + break + indexes.append(lang.get_word_index(word[0])) + indexes.append(lang.EOS_token) + return torch.Tensor(indexes).long() + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq = (pose_seq - self.data_mean) / std + + # Todo: Here we should apply rep_learning + target = torch.from_numpy(pose_seq) + target = target.float() + with torch.no_grad(): + encoded_poses = self.rep_model.encoder(target) + + encoded_poses = encoded_poses.reshape((encoded_poses.shape[0], -1)).float() + + if self.use_derivative: + diff = [ + (encoded_poses[n, :] - encoded_poses[n - 1, :]) + for n in range(1, encoded_poses.shape[0]) + ] + diff.insert(0, torch.zeros_like(encoded_poses[0, :])) + encoded_poses = torch.hstack((encoded_poses, torch.stack(diff))) + with torch.no_grad(): + out_pose, latent, mue, logvar = self.RNNAutoencoder_model( + encoded_poses.unsqueeze(0), encoded_poses.unsqueeze(0) + ) + latent = latent[: self.RNNAutoencoder_model.decoder.n_layers] + latent = latent.squeeze(1) + latent = latent.reshape((1, -1)).float() + # latent = latent.squeeze(0) + cluster_id = self.kmeanmodel.predict(latent.cpu().detach().numpy()) + cluster_id = torch.Tensor(cluster_id).long() + # to tensors + word_seq_tensor = words_to_tensor( + self.lang_model, word_seq, aux_info["end_time"] + ) + pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() + + # audio = torch.from_numpy(audio).float() + + # return cluster_id + return word_seq_tensor, encoded_poses, audio, aux_info, cluster_id + + def set_lang_model(self, lang_model: Vocab) -> None: + """Set the word vector representation to be used with the dataset. + + Modifies the internal state of the object at the attribute 'lang_model'. + + Args: + lang_model: A pre-trained word vector representation contained in the 'Vocab' class. + """ + self.lang_model = lang_model + + +class TrinityDataset_sentencelevel(Dataset): + """Contains information and parameters of a (Trinity) dataset. + + This is a PyTorch Dataset subclass containing information of a Trinity dataset. + https://trinityspeechgesture.scss.tcd.ie/data/Trinity%20Speech-Gesture%20I/GENEA_Challenge_2020_data_release/ + + This class is #TODO. + + Attributes: + lmdb_dir: A string filepath of the directory containing the actual dataset. + lmdb_env: A Lmdb object loaded from .mdb files in the lmdb_dir. + n_poses: An integer number of frames in each clip in the dataset (normally 30 (in 30 fps)). + subdivision_stride: An integer number of frames between the start of one clip and the start of the next clip (clips can overlap). + skeleton_resampling_fps: An integer frames per second of clip to use for training (usually downsampled to 20 fps, clips are normally 30 fps). + n_samples: An integer number of clips/entries in the original dataset. + lang_model: A 'Vocab' pre-trained word vector representation or None. + data_mean: A mean calculated from each video in the original dataset. 
+ data_std: A standard deviation calculcated from each video. + args: #TODO + kmeanmodel: #TODO + sentence_level: #TODO + RNNAutoencoder_model: #TODO + use_derivative: #TODO + rep_learning_dim: An integer dimension of the model (unused). + rep_model: A DAE neural net model loaded from the above checkpoint. + Models: VQ_Frame, VAE_Network, DAE_Network in DAE_model.py. + """ + + def __init__( + self, + args: argparse.Namespace, + lmdb_dir: str, + n_poses: int, + subdivision_stride: int, + pose_resampling_fps: int, + data_mean: list[float], + data_std: list[float], + ): + """ """ + self.lmdb_dir = lmdb_dir + self.n_poses = n_poses + self.subdivision_stride = subdivision_stride + self.subdivision_stride_sentence = args.subdivision_stride_sentence + self.args = args + self.skeleton_resampling_fps = pose_resampling_fps + self.lang_model = None + self.data_mean = np.array(data_mean).squeeze() + self.data_std = np.array(data_std).squeeze() + self.use_derivative = args.use_derivative == "True" + + # Todo: fix adress -- Fixed + test = os.path.dirname(args.autoencoder_checkpoint) + self.kmeanmodel = pickle.load( + open( + os.path.dirname(args.autoencoder_checkpoint) + + "/clusters/kmeans_model.pk", + "rb", + ) + ) + # self.clustering_transforms = pickle.load(open(os.path.dirname(args.autoencoder_checkpoint) + # + '/clusters/transforms.pkl', 'rb')) + + if args.sentence_level == "True": + self.sentence_level = True + else: + self.sentence_level = False + + logging.info("Reading data '{}'...".format(lmdb_dir)) + + if self.sentence_level: + # preloaded_dir = lmdb_dir + '_sentence_level' + '_cache' + preloaded_dir = ( + args.model_save_path + + "lmdb/" + + os.path.basename((lmdb_dir)) + + "_sentence_level" + + "_cache" + ) + else: + # preloaded_dir = lmdb_dir + '_cache' + preloaded_dir = ( + args.model_save_path + "lmdb/" + os.path.basename((lmdb_dir)) + "_cache" + ) + + if not os.path.exists(args.model_save_path + "lmdb"): + os.mkdir(args.model_save_path + "lmdb") + if not os.path.exists(preloaded_dir): + data_sampler = DataPreprocessor( + args, + lmdb_dir, + preloaded_dir, + n_poses, + self.subdivision_stride_sentence, + pose_resampling_fps, + sentence_level=self.sentence_level, + ) + data_sampler.run() + else: + # Todo: Remove later + + # data_sampler = DataPreprocessor(args, lmdb_dir, preloaded_dir, n_poses, + # subdivision_stride, pose_resampling_fps, + # sentence_level=self.sentence_level) + # data_sampler.run() + logging.info("Found pre-loaded samples from {}".format(preloaded_dir)) + + # init lmdb + self.lmdb_env: lmdb.Environment = lmdb.open( + preloaded_dir, readonly=True, lock=False + ) + with self.lmdb_env.begin() as txn: + self.n_samples = txn.stat()["entries"] + + # Representation model + checkpoint_path: str = args.rep_learning_checkpoint + self.rep_learning_dim: int = args.rep_learning_dim + ( + rep_learning_args, + rep_model, + rep_loss_fn, + rep_lang_model, + rep_out_dim, + ) = utils.train_utils.load_checkpoint_and_model(checkpoint_path, device, "DAE") + self.rep_model = rep_model.to("cpu") + self.rep_model.train(False) + # RNN autoencoder + checkpoint_path: str = args.autoencoder_checkpoint + + # RNNAutoencoder_args, RNNAutoencoder_model, RNNAutoencoder_fn,\ + # RNNAutoencoder_model, RNNAutoencoder_dim = utils.train_utils.load_checkpoint_and_model( + # checkpoint_path, device) + + ( + args_rnn, + rnn, + loss_fn, + lang_model, + out_dim, + ) = utils.train_utils.load_checkpoint_and_model( + checkpoint_path, device, "autoencoder_vq" + ) + self.RNNAutoencoder_model: torch.nn.Module = 
rnn.to("cpu") + self.RNNAutoencoder_model.train(False) + + def __len__(self) -> int: + """Get the size of the dataset. + + Returns: + The number of samples in the dataset. + """ + return self.n_samples + + def __getitem__( + self, idx: int + ) -> Tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict, + torch.Tensor, + torch.Tensor, + torch.Tensor, + ]: + """TODO""" + with self.lmdb_env.begin(write=False) as txn: + key = "{:010}".format(idx).encode("ascii") + sample = txn.get(key) + + sample = pyarrow.deserialize(sample) + ( + word_seq, + pose_seq, + audio_raws, + audio_mels, + aux_info, + sentence_leve_latents, + GP3_Embedding, + ) = sample + + def words_to_tensor( + lang: Vocab | None, words: list[list], end_time: float | None = None + ) -> torch.Tensor: + # indexes = [lang.SOS_token] + indexes = [] + if len(words) <= 0: + print() + for word in words: + if end_time is not None and word[1] > end_time: + break + indexes.append(lang.get_word_index(word[0])) + # indexes.append(lang.EOS_token) + return torch.Tensor(indexes).long() + + def poseIndex_to_tensor(sop, eop, poses_indicies): + indexes = [sop] + for pose_index in poses_indicies: + indexes.append(pose_index) + indexes.append(eop) + return torch.Tensor(indexes).long() + + # normalize + std = np.clip(self.data_std, a_min=0.01, a_max=None) + pose_seq = (pose_seq - self.data_mean) / std + + # to tensors + word_seq_tensor = words_to_tensor( + self.lang_model, word_seq, aux_info["end_time"] + ) + pose_seq = torch.from_numpy(pose_seq).reshape((pose_seq.shape[0], -1)).float() + + sentence_leve_latents = torch.from_numpy(sentence_leve_latents) + sentence_leve_latents = sentence_leve_latents.reshape( + [sentence_leve_latents.shape[0], -1] + ).float() + + # Todo: we may need do some preprocessing. (e.g. padding, resampling, etc.) + # Todo: Move list-->ndarray to the preprocessing. + audio_raw_for_now = False + if audio_raw_for_now: + audio = audio_raws + else: + audio = audio_mels + audio = np.array(audio) + audio = torch.from_numpy(audio).float() + # audio = torch.reshape(audio, (sentence_leve_latents.shape[0], -1)) + # audio = torch.reshape(audio, (4, -1)) + + # return cluster_id + cluster_ids = np.zeros(sentence_leve_latents.shape[0]) + if self.RNNAutoencoder_model.vq == True: + ( + loss_vq, + quantized, + perplexity_vq, + encodings, + ) = self.RNNAutoencoder_model.vq_layer(sentence_leve_latents) + cluster_ids = torch.argmax(encodings, dim=1) + + # q = poseIndex_to_tensor(sop=512, eop=513, + # poses_indicies=cluster_ids.cpu().detach().numpy()) + # cluster_ids = q + # print(cluster_ids) + else: + cluster_ids = self.kmeanmodel.predict( + # self.clustering_transforms['scalar'].transform + (sentence_leve_latents.cpu().detach().numpy()) + ) + cluster_ids = torch.from_numpy(cluster_ids).long() + # cluster_id = torch.Tensor(cluster_id).long() + # print(cluster_ids) + + # clusters_portion = self.clustering_transforms['portion'][cluster_ids] + # clusters_portion = torch.from_numpy(clusters_portion) + + # print((cluster_ids)) + try: + GP3_Embedding = torch.from_numpy(GP3_Embedding).float() + except: + pass + + return ( + word_seq_tensor, + pose_seq, + audio, + aux_info, + sentence_leve_latents, + cluster_ids, + GP3_Embedding, + ) + + def set_lang_model(self, lang_model: Vocab) -> None: + """Set the word vector representation to be used with the dataset. + + Modifies the internal state of the object at the attribute 'lang_model'. + + Args: + lang_model: A pre-trained word vector representation contained in the 'Vocab' class. 
+ """ + self.lang_model = lang_model
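+
+
+# ---------------------------------------------------------------------------
+# Minimal usage sketch (illustrative only).  The lmdb path, normalization
+# statistics, checkpoint paths and the fields on `example_args` below are
+# placeholder assumptions, not values shipped with this repository; a real
+# run needs the preprocessed lmdb data, a trained Vocab and the referenced
+# checkpoints.  Note: `word_seq_collate_fn` currently hard-codes
+# `sentence_leve = True`, which matches the 7-item samples of
+# `TrinityDataset_sentencelevel`; for the 4-item samples of `TrinityDataset`
+# the flag must be switched to False.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    from torch.utils.data import DataLoader
+
+    example_args = argparse.Namespace(
+        name="DAE",                                               # placeholder
+        rep_learning_checkpoint="<path/to/DAE_checkpoint.bin>",   # placeholder
+        autoencoder_checkpoint="<path/to/VQVAE_checkpoint.bin>",  # placeholder
+        sentence_frame_length=120,                                # placeholder
+    )
+
+    train_dataset = TrinityDataset(
+        example_args,
+        lmdb_dir="<path/to/lmdb_train>",
+        n_poses=30,
+        subdivision_stride=10,
+        pose_resampling_fps=20,
+        data_mean=[0.0],
+        data_std=[1.0],
+    )
+
+    trained_vocab: Vocab = ...  # placeholder: the Vocab built during preprocessing
+    train_dataset.set_lang_model(trained_vocab)
+
+    loader = DataLoader(
+        train_dataset,
+        batch_size=32,
+        shuffle=True,
+        collate_fn=word_seq_collate_fn,  # with sentence_leve set to False
+    )
+    for word_seq, words_lengths, poses, audio, aux_info in loader:
+        print(word_seq.shape, poses.shape, audio.shape)
+        break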