Data loader optimizations #293

Merged (4 commits) on Sep 6, 2021
README.md (3 additions, 0 deletions)
@@ -87,7 +87,10 @@ python3 -m heareval.predictions.runner hearbaseline --model ./naive_baseline.pt
[--embeddings-dir embeddings]
[--task task]
[--gpus INT]
[--in-memory False]
```
`--in-memory False` will memmap the embeddings from disk, which uses less
main memory (RAM) but is slower.

2) Evaluate the generated predictions for the test set for one or
all modules and for one or all tasks:
docker/Dockerfile-cuda11.2 (12 additions, 12 deletions)
@@ -242,6 +242,17 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
##pip3 install cython
##pip3 install -e ".[dev]"

RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
PIP_INSTALL="python -m pip --no-cache-dir install --upgrade" && \
GIT_CLONE="git clone --depth 10" && \
$PIP_INSTALL cython ipython
RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
PIP_INSTALL="python -m pip --no-cache-dir install" && \
GIT_CLONE="git clone --depth 10" && \
$PIP_INSTALL \
numpy==1.19.2 \
pynvml


# gsutil
# https://cloud.google.com/storage/docs/gsutil_install#deb
@@ -266,19 +277,8 @@ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
GIT_CLONE="git clone --depth 10" && \
$PIP_INSTALL hearbaseline hearvalidator

RUN echo 20210905

RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
PIP_INSTALL="python -m pip --no-cache-dir install --upgrade" && \
GIT_CLONE="git clone --depth 10" && \
$PIP_INSTALL cython ipython
RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
PIP_INSTALL="python -m pip --no-cache-dir install" && \
GIT_CLONE="git clone --depth 10" && \
$PIP_INSTALL \
numpy==1.19.2 \
pynvml
# hearbaseline \
# hearvalidator
RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
PIP_INSTALL="python -m pip --no-cache-dir install" && \
GIT_CLONE="git clone --depth 10" && \
heareval/predictions/runner.py (8 additions, 0 deletions)
@@ -57,6 +57,12 @@
"See https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html#select-gpu-devices", # noqa
type=str,
)
@click.option(
"--in-memory",
default=True,
help="Load embeddings in memory, or memmap them from disk. (Default: True)",
type=click.BOOL,
)
@click.option(
"--deterministic",
default=True,
@@ -77,6 +83,7 @@ def runner(
grid_points: int = 1,
model_options: str = "{}",
gpus: Any = None if not torch.cuda.is_available() else "[0]",
in_memory: bool = True,
deterministic: bool = True,
grid: str = "default",
) -> None:
@@ -133,6 +140,7 @@ def runner(
timestamp_embedding_size=timestamp_embedding_size,
grid_points=grid_points,
gpus=gpus,
in_memory=in_memory,
deterministic=deterministic,
grid=grid,
)
heareval/predictions/task_predictions.py (40 additions, 19 deletions)
@@ -421,7 +421,7 @@ def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]):

class SplitMemmapDataset(Dataset):
"""
Embeddings are memmap'ed.
Embeddings are memmap'ed, unless in_memory=True.

WARNING: Don't shuffle this or access will be SLOW.
"""
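As a rough, self-contained sketch of the two loading modes the docstring and the new `--in-memory` flag describe (the file path and shape below are placeholders, not values from the repository):

```python
import numpy as np
import torch

# Placeholder embedding file and shape, for illustration only.
path = "embeddings/hearbaseline/task/train.embeddings.npy"
dim = (100_000, 4096)

# in_memory=False: keep a read-only memory map. Rows are paged in from disk on
# access, so resident memory stays small, but random access (e.g. shuffling) is slow.
embeddings = np.memmap(filename=path, dtype=np.float32, mode="r", shape=dim)

# in_memory=True (the default): materialize every row as a tensor up front,
# trading RAM for fast, shuffle-friendly access.
embeddings = torch.stack([torch.tensor(row) for row in embeddings])
```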
@@ -433,6 +433,8 @@ def __init__(
nlabels: int,
split_name: str,
embedding_type: str,
in_memory: bool,
metadata: bool,
):
self.embedding_path = embedding_path
self.label_to_idx = label_to_idx
@@ -445,19 +447,23 @@ def __init__(
open(embedding_path.joinpath(f"{split_name}.embedding-dimensions.json"))
)
)
self.embedding_memmap = np.memmap(
self.embeddings = np.memmap(
filename=embedding_path.joinpath(f"{split_name}.embeddings.npy"),
dtype=np.float32,
mode="r",
shape=self.dim,
)
if in_memory:
self.embeddings = torch.stack(
[torch.tensor(e) for e in tqdm(self.embeddings)]
)
self.labels = pickle.load(
open(embedding_path.joinpath(f"{split_name}.target-labels.pkl"), "rb")
)
# Only used for event-based prediction
# Only used for event-based prediction, for validation and test scoring.
# For timestamp (event) embedding tasks, the metadata for each instance
# is {filename: , timestamp: }.
if self.embedding_type == "event":
if self.embedding_type == "event" and metadata:
Contributor:

This can be for a later issue, but I'm finding myself scratching my head a bit trying to remember how the metadata works here for event embeddings as well as the labels. Would be good to include in the docstring a bit of info on why we need metadata for event and how that is structured / should be used.

Contributor Author:

Yeah it's a bit of a headscratcher.

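A minimal sketch of the per-instance metadata the comment above describes; the JSON layout (flat filename/timestamp pairs) is an assumption for illustration, not confirmed by this PR:

```python
# Hypothetical contents of <split>.filename-timestamps.json, one entry per
# embedding row, aligned with self.embeddings and self.labels.
filename_timestamps = [["clip0.wav", 50.0], ["clip0.wav", 100.0], ["clip1.wav", 50.0]]

# Per-row metadata dicts as used for event (timestamp) scoring.
metadata = [{"filename": f, "timestamp": t} for f, t in filename_timestamps]
# metadata[1] == {"filename": "clip0.wav", "timestamp": 100.0}
```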
filename_timestamps_json = embedding_path.joinpath(
f"{split_name}.filename-timestamps.json"
)
@@ -468,26 +474,28 @@ def __init__(
else:
self.metadata = [{}] * self.dim[0]
assert len(self.labels) == self.dim[0]
assert len(self.labels) == len(self.embedding_memmap)
assert len(self.labels) == len(self.embeddings)
assert len(self.labels) == len(self.metadata)
assert self.embedding_memmap[0].shape[0] == self.dim[1]
assert self.embeddings[0].shape[0] == self.dim[1]

def __len__(self) -> int:
return self.dim[0]

def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
"""
For all labels, return a multi or one-hot vector.
This allows us to have tensors that are all the same shape.
Later we reduce this with an argmax to get the vocabulary indices.
We also include the filename and timestamp, which we need
for evaluation of timestamp (event) tasks.
We also return the metadata as a Dict.
"""
x = self.embedding_memmap[idx]
labels = [self.label_to_idx[str(label)] for label in self.labels[idx]]
y = label_to_binary_vector(labels, self.nlabels)
return x, y, self.metadata[idx]
ys = []
for idx in tqdm(range(len(self.labels))):
labels = [self.label_to_idx[str(label)] for label in self.labels[idx]]
y = torch.tensor(label_to_binary_vector(labels, self.nlabels))
ys.append(y)
self.y = torch.stack(ys)
assert self.y.shape == (len(self.labels), self.nlabels)

def __len__(self) -> int:
return self.dim[0]

def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
return self.embeddings[idx], self.y[idx], self.metadata[idx]
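To make the multi-hot label handling above concrete, here is a standalone sketch in the spirit of `label_to_binary_vector`; this helper is written from scratch for illustration and is not the repository's implementation:

```python
import torch

def binarize(label_indices, nlabels):
    # Multi-hot target: 1.0 at every vocabulary index active for this example.
    y = torch.zeros(nlabels)
    y[label_indices] = 1.0
    return y

# Two active labels out of a vocabulary of 5.
print(binarize([1, 3], 5))  # tensor([0., 1., 0., 1., 0.])
```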


def create_events_from_prediction(
@@ -654,6 +662,8 @@ def dataloader_from_split_name(
label_to_idx: Dict[str, int],
nlabels: int,
embedding_type: str,
in_memory: bool,
metadata: bool = True,
batch_size: int = 64,
) -> DataLoader:
dataset = SplitMemmapDataset(
Expand All @@ -662,6 +672,8 @@ def dataloader_from_split_name(
nlabels=nlabels,
split_name=split_name,
embedding_type=embedding_type,
in_memory=in_memory,
metadata=metadata,
)

print(
Expand All @@ -672,10 +684,12 @@ def dataloader_from_split_name(
return DataLoader(
dataset,
batch_size=batch_size,
# We don't shuffle because it's slow.
# We don't shuffle because it's slow
# (except when in_memory = True).
# Also we want predicted labels in the same order as
# target labels.
# target labels, for validation and test.
shuffle=False,
pin_memory=True,
Contributor:

Is there any consideration with the metadata? With this I think the embeddings and labels will be transferred to CUDA, but the metadata won't (I think b/c they aren't tensors). I think it will be fine, just curious if there are any gotchas there.
)

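On the reviewer's question above about `pin_memory` and the metadata, a minimal sketch (toy dataset, assuming a CUDA-capable machine) of how the default collate treats the third returned element: tensors are batched and pinned, string fields become plain Python lists, and numeric fields are collated into CPU tensors; only tensor fields can later be moved to the GPU.

```python
import torch
from torch.utils.data import DataLoader, Dataset

class ToyEmbeddingDataset(Dataset):
    def __len__(self):
        return 4

    def __getitem__(self, idx):
        x = torch.randn(8)   # stand-in embedding
        y = torch.zeros(3)   # stand-in binary label vector
        meta = {"filename": f"clip{idx}.wav", "timestamp": 50.0 * idx}
        return x, y, meta

loader = DataLoader(ToyEmbeddingDataset(), batch_size=2, pin_memory=True)
x, y, meta = next(iter(loader))
print(x.is_pinned(), y.is_pinned())  # True True (tensors are pinned on a CUDA machine)
print(meta["filename"])              # ['clip0.wav', 'clip1.wav'] -- a plain CPU list
print(meta["timestamp"])             # tensor([0., 50.], dtype=torch.float64) -- a CPU tensor
```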

@@ -689,6 +703,7 @@ def task_predictions_train(
scores: List[ScoreFunction],
conf: Dict,
gpus: Any,
in_memory: bool,
deterministic: bool,
) -> Tuple[
str, int, Dict[str, Any], Tuple[Tuple[str, Any], ...], pl.Trainer, float, str
@@ -758,6 +773,8 @@
nlabels,
metadata["embedding_type"],
batch_size=conf["batch_size"],
in_memory=in_memory,
metadata=False,
)
valid_dataloader = dataloader_from_split_name(
"valid",
Expand All @@ -766,6 +783,7 @@ def task_predictions_train(
nlabels,
metadata["embedding_type"],
batch_size=conf["batch_size"],
in_memory=in_memory,
)
trainer.fit(predictor, train_dataloader, valid_dataloader)
if checkpoint_callback.best_model_score is not None:
@@ -819,6 +837,7 @@ def task_predictions(
timestamp_embedding_size: int,
grid_points: int,
gpus: Optional[int],
in_memory: bool,
deterministic: bool,
grid: str,
):
@@ -894,6 +913,7 @@ def print_scores(mode, scores_and_trainers):
scores=scores,
conf=conf,
gpus=gpus,
in_memory=in_memory,
deterministic=deterministic,
)
scores_and_trainers.append(
@@ -932,6 +952,7 @@
nlabels,
metadata["embedding_type"],
batch_size=conf["batch_size"],
in_memory=in_memory,
)
# This hack is necessary because we use the best validation epoch to
# choose the event postprocessing