hearbenchmark · turian · Sep 6, 2021 · Sep 5, 2021 · Sep 5, 2021 · Sep 5, 2021
diff --git a/README.md b/README.md
@@ -87,7 +87,10 @@ python3 -m heareval.predictions.runner hearbaseline --model ./naive_baseline.pt
     [--embeddings-dir embeddings]
     [--task task]
     [--gpus INT]
+    [--in-memory False]
 ```
+`--in-memory False` will memory-map (memmap) the embeddings from disk, which
+uses less RAM than loading them all into memory, but is also slower.
 
 2) Evaluate the generated predictions for the test set for one or
 all modules and for one or all tasks:

diff --git a/docker/Dockerfile-cuda11.2 b/docker/Dockerfile-cuda11.2
@@ -242,6 +242,17 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
 ##pip3 install cython
 ##pip3 install -e ".[dev]"
 
+RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
+    PIP_INSTALL="python -m pip --no-cache-dir install --upgrade" && \
+    GIT_CLONE="git clone --depth 10" && \
+    $PIP_INSTALL cython ipython
+RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
+    PIP_INSTALL="python -m pip --no-cache-dir install" && \
+    GIT_CLONE="git clone --depth 10" && \
+    $PIP_INSTALL \
+        numpy==1.19.2 \
+        pynvml
+
 
 # gsutil
 # https://cloud.google.com/storage/docs/gsutil_install#deb
@@ -266,19 +277,8 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
     GIT_CLONE="git clone --depth 10" && \
     $PIP_INSTALL hearbaseline hearvalidator
 
+RUN echo 20210905
 
-RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
-    PIP_INSTALL="python -m pip --no-cache-dir install --upgrade" && \
-    GIT_CLONE="git clone --depth 10" && \
-    $PIP_INSTALL cython ipython
-RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
-    PIP_INSTALL="python -m pip --no-cache-dir install" && \
-    GIT_CLONE="git clone --depth 10" && \
-    $PIP_INSTALL \
-        numpy==1.19.2 \
-        pynvml
-#        hearbaseline \
-#        hearvalidator
 RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
     PIP_INSTALL="python -m pip --no-cache-dir install" && \
     GIT_CLONE="git clone --depth 10" && \

diff --git a/heareval/predictions/runner.py b/heareval/predictions/runner.py
@@ -57,6 +57,12 @@
     "See https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html#select-gpu-devices",  # noqa
     type=str,
 )
+@click.option(
+    "--in-memory",
+    default=True,
+    help="Load embeddings in memory, or memmap them from disk. (Default: True)",
+    type=click.BOOL,
+)
 @click.option(
     "--deterministic",
     default=True,
@@ -77,6 +83,7 @@ def runner(
     grid_points: int = 1,
     model_options: str = "{}",
     gpus: Any = None if not torch.cuda.is_available() else "[0]",
+    in_memory: bool = True,
     deterministic: bool = True,
     grid: str = "default",
 ) -> None:
@@ -133,6 +140,7 @@ def runner(
             timestamp_embedding_size=timestamp_embedding_size,
             grid_points=grid_points,
             gpus=gpus,
+            in_memory=in_memory,
             deterministic=deterministic,
             grid=grid,
         )

diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py
@@ -421,7 +421,7 @@ def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]):
 
 class SplitMemmapDataset(Dataset):
     """
-    Embeddings are memmap'ed.
+    Embeddings are memmap'ed from disk, unless in_memory=True (then held in a tensor).
 
     WARNING: Don't shuffle this or access will be SLOW.
     """
@@ -433,6 +433,8 @@ def __init__(
         nlabels: int,
         split_name: str,
         embedding_type: str,
+        in_memory: bool,
+        metadata: bool,
     ):
         self.embedding_path = embedding_path
         self.label_to_idx = label_to_idx
@@ -445,19 +447,23 @@ def __init__(
                 open(embedding_path.joinpath(f"{split_name}.embedding-dimensions.json"))
             )
         )
-        self.embedding_memmap = np.memmap(
+        self.embeddings = np.memmap(
             filename=embedding_path.joinpath(f"{split_name}.embeddings.npy"),
             dtype=np.float32,
             mode="r",
             shape=self.dim,
         )
+        if in_memory:
+            self.embeddings = torch.stack(
+                [torch.tensor(e) for e in tqdm(self.embeddings)]
+            )
         self.labels = pickle.load(
             open(embedding_path.joinpath(f"{split_name}.target-labels.pkl"), "rb")
         )
-        # Only used for event-based prediction
+        # Only used for event-based prediction, for validation and test scoring.
         # For timestamp (event) embedding tasks,
         # the metadata for each instance is {filename: , timestamp: }.
-        if self.embedding_type == "event":
+        if self.embedding_type == "event" and metadata:
             filename_timestamps_json = embedding_path.joinpath(
                 f"{split_name}.filename-timestamps.json"
             )
@@ -468,26 +474,28 @@ def __init__(
         else:
             self.metadata = [{}] * self.dim[0]
         assert len(self.labels) == self.dim[0]
-        assert len(self.labels) == len(self.embedding_memmap)
+        assert len(self.labels) == len(self.embeddings)
         assert len(self.labels) == len(self.metadata)
-        assert self.embedding_memmap[0].shape[0] == self.dim[1]
+        assert self.embeddings[0].shape[0] == self.dim[1]
 
-    def __len__(self) -> int:
-        return self.dim[0]
-
-    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
         """
         For all labels, return a multi or one-hot vector.
         This allows us to have tensors that are all the same shape.
         Later we reduce this with an argmax to get the vocabulary indices.
-            We also include the filename and timestamp, which we need
-        for evaluation of timestamp (event) tasks.
-        We also return the metadata as a Dict.
         """
-        x = self.embedding_memmap[idx]
-        labels = [self.label_to_idx[str(label)] for label in self.labels[idx]]
-        y = label_to_binary_vector(labels, self.nlabels)
-        return x, y, self.metadata[idx]
+        ys = []
+        for idx in tqdm(range(len(self.labels))):
+            labels = [self.label_to_idx[str(label)] for label in self.labels[idx]]
+            y = torch.tensor(label_to_binary_vector(labels, self.nlabels))
+            ys.append(y)
+        self.y = torch.stack(ys)
+        assert self.y.shape == (len(self.labels), self.nlabels)
+
+    def __len__(self) -> int:
+        return self.dim[0]
+
+    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
+        return self.embeddings[idx], self.y[idx], self.metadata[idx]
 
 
 def create_events_from_prediction(
@@ -654,6 +662,8 @@ def dataloader_from_split_name(
     label_to_idx: Dict[str, int],
     nlabels: int,
     embedding_type: str,
+    in_memory: bool,
+    metadata: bool = True,
     batch_size: int = 64,
 ) -> DataLoader:
     dataset = SplitMemmapDataset(
@@ -662,6 +672,8 @@ def dataloader_from_split_name(
         nlabels=nlabels,
         split_name=split_name,
         embedding_type=embedding_type,
+        in_memory=in_memory,
+        metadata=metadata,
     )
 
     print(
@@ -672,10 +684,12 @@ def dataloader_from_split_name(
     return DataLoader(
         dataset,
         batch_size=batch_size,
-        # We don't shuffle because it's slow.
+        # We don't shuffle because it's slow
+        # (except when in_memory = True).
         # Also we want predicted labels in the same order as
-        # target labels.
+        # target labels, for validation and test.
         shuffle=False,
+        pin_memory=True,
     )
 
 
@@ -689,6 +703,7 @@ def task_predictions_train(
     scores: List[ScoreFunction],
     conf: Dict,
     gpus: Any,
+    in_memory: bool,
     deterministic: bool,
 ) -> Tuple[
     str, int, Dict[str, Any], Tuple[Tuple[str, Any], ...], pl.Trainer, float, str
@@ -758,6 +773,8 @@ def task_predictions_train(
         nlabels,
         metadata["embedding_type"],
         batch_size=conf["batch_size"],
+        in_memory=in_memory,
+        metadata=False,
     )
     valid_dataloader = dataloader_from_split_name(
         "valid",
@@ -766,6 +783,7 @@ def task_predictions_train(
         nlabels,
         metadata["embedding_type"],
         batch_size=conf["batch_size"],
+        in_memory=in_memory,
     )
     trainer.fit(predictor, train_dataloader, valid_dataloader)
     if checkpoint_callback.best_model_score is not None:
@@ -819,6 +837,7 @@ def task_predictions(
     timestamp_embedding_size: int,
     grid_points: int,
     gpus: Optional[int],
+    in_memory: bool,
     deterministic: bool,
     grid: str,
 ):
@@ -894,6 +913,7 @@ def print_scores(mode, scores_and_trainers):
             scores=scores,
             conf=conf,
             gpus=gpus,
+            in_memory=in_memory,
             deterministic=deterministic,
         )
         scores_and_trainers.append(
@@ -932,6 +952,7 @@ def print_scores(mode, scores_and_trainers):
         nlabels,
         metadata["embedding_type"],
         batch_size=conf["batch_size"],
+        in_memory=in_memory,
     )
     # This hack is necessary because we use the best validation epoch to
     # choose the event postprocessing