Modify default model config behaviour (#8)
francoishernandez authored Jun 7, 2024
1 parent 60fbbe4 commit 1b6ac42
Showing 12 changed files with 89 additions and 49 deletions.
11 changes: 5 additions & 6 deletions .github/workflows/push.yml
@@ -52,6 +52,7 @@ jobs:
-config eole/tests/data/data.yaml \
-save_data /tmp/eole.train.check \
-n_sample 30 \
-model '{"architecture": "rnn"}' \
-training '{"num_workers": 0, "bucket_size": 1024}' \
-src_vocab /tmp/eole.vocab.src \
-tgt_vocab /tmp/eole.vocab.tgt \
@@ -65,7 +66,7 @@ jobs:
-tgt_vocab /tmp/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10}' \
-report_every 5\
-tensorboard \
@@ -79,7 +80,7 @@ jobs:
-tgt_vocab /tmp/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}, "decoder":{"decoder_type": "rnn"}}' \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10, "valid_steps": 5}' \
-report_every 5 \
-tensorboard \
@@ -95,7 +96,7 @@ jobs:
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-report_every 5 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}, "decoder": {"decoder_type": "rnn", "coverage_attn": True, "lambda_coverage": 0.1}}' \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5}, "decoder": {"coverage_attn": True, "lambda_coverage": 0.1}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10}'
- name: Test Transformer training with align
run: |
@@ -337,7 +338,7 @@ jobs:
-tgt_vocab /tmp/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10, "model_path": "/tmp/eole.model", "save_checkpoint_steps": 10}' \
-report_every 5
sed -i '1s/^/new_tok\t100000000\n/' /tmp/eole.vocab.src
@@ -347,7 +348,6 @@
-tgt_vocab /tmp/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 20, "train_from": "/tmp/eole.model/step_10", "save_checkpoint_steps": 10, "update_vocab": True, "reset_optim": "states"}' \
-report_every 5
- name: Test checkpoint vocabulary update with LM
@@ -366,7 +366,6 @@
-config eole/tests/data/lm_data.yaml \
-src_vocab /tmp/eole.vocab.src \
-tgt_vocab /tmp/eole.vocab.src \
-model '{"layers": 2, "hidden_size": 16, "transformer_ff": 64, "embeddings": {"word_vec_size": 16}, "encoder": None, "decoder": {"decoder_type": "transformer_lm", "heads": 4}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 20, "train_from": "/tmp/lm.eole.model/step_10", "save_checkpoint_steps": 10, "update_vocab": True, "reset_optim": "states"}' \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
3 changes: 2 additions & 1 deletion eole/config/__init__.py
@@ -50,7 +50,8 @@ def recursive_update_dict(_dict, new_dict, defaults):
if isinstance(v, dict):
_dict[k] = recursive_update_dict(_dict.get(k, {}), v, defaults.get(k, {}))
else:
previous_v = _dict.get(k, defaults.get(k, None))
default = defaults.get(k, None) if isinstance(defaults, dict) else None
previous_v = _dict.get(k, default)
if v != previous_v:
logger.info(f"Option: {k}, value: {v}, overriding model: {previous_v}")
_dict[k] = v
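For context on the hunk above: defaults can now be None when no default model configuration exists, so calling .get() on it unguarded would fail. Below is a minimal, self-contained sketch of the merge logic, not the repository code; the enclosing loop, the return statement, and the use of print instead of logger.info are assumptions made for illustration.

# Illustrative sketch only; the real recursive_update_dict lives in
# eole/config/__init__.py and logs overrides via logger.info.
def recursive_update_dict(_dict, new_dict, defaults):
    for k, v in new_dict.items():
        if isinstance(v, dict):
            # nested sections recurse with their own sub-defaults
            _dict[k] = recursive_update_dict(_dict.get(k, {}), v, defaults.get(k, {}))
        else:
            # defaults may be None (no default model config), hence the guard
            default = defaults.get(k, None) if isinstance(defaults, dict) else None
            previous_v = _dict.get(k, default)
            if v != previous_v:
                print(f"Option: {k}, value: {v}, overriding model: {previous_v}")
            _dict[k] = v
    return _dict

# With the guard, passing defaults=None no longer crashes on flat values:
print(recursive_update_dict({}, {"hidden_size": 10}, None))  # {'hidden_size': 10}
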
6 changes: 2 additions & 4 deletions eole/config/common.py
@@ -1,7 +1,6 @@
# import torch
from typing import List, Literal
from pydantic import Field, computed_field, model_validator
from functools import cached_property
from pydantic import Field, model_validator
from eole.config.config import Config

# from eole.utils.logging import logger
@@ -33,8 +32,7 @@ class DistributedConfig(Config):
default=60, description="Timeout for one GPU to wait for the others."
)

@computed_field
@cached_property
@property
def parallel_gpu(self) -> int: # converted to a `property` by `computed_field`
return self.world_size if self.parallel_mode == "tensor_parallel" else 1

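A practical consequence of the change above: a plain property is recomputed on each access, is not cached, and is not serialized the way a computed_field would be. A standalone stub (hypothetical class, not the real DistributedConfig) showing the same pattern:

# Hypothetical stub for illustration; attribute names mirror DistributedConfig.
class DistributedStub:
    def __init__(self, world_size, parallel_mode):
        self.world_size = world_size
        self.parallel_mode = parallel_mode

    @property
    def parallel_gpu(self) -> int:
        # evaluated on access, never cached, never dumped with the config
        return self.world_size if self.parallel_mode == "tensor_parallel" else 1

print(DistributedStub(4, "tensor_parallel").parallel_gpu)  # 4
print(DistributedStub(4, "data_parallel").parallel_gpu)    # 1
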
7 changes: 5 additions & 2 deletions eole/config/data.py
@@ -198,7 +198,7 @@ def _get_all_transform(self):
all_transforms.update(_transforms)
if (
hasattr(self, "model")
and getattr(self.model.decoder, "lambda_align", 0.0) > 0.0
and getattr(getattr(self.model, "decoder", None), "lambda_align", 0.0) > 0.0
):
if not all_transforms.isdisjoint({"sentencepiece", "bpe", "onmt_tokenize"}):
raise ValueError(
@@ -296,7 +296,10 @@ def _validate_data(self):
if corpus.path_align is None:
if (
hasattr(self, "model")
and getattr(self.model.decoder, "lambda_align", 0.0) > 0.0
and getattr(
getattr(self.model, "decoder", None), "lambda_align", 0.0
)
> 0.0
):
raise ValueError(
f"Corpus {cname} alignment file path are "
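The nested getattr in both hunks above matters because the model section is now optional: self.model can be None, and chaining attribute access on None would raise. A tiny standalone illustration (the bare None stands in for an unset model config):

model = None  # stand-in for an unset model config

# model.decoder would raise AttributeError on None, whereas the nested
# getattr falls back step by step and ends up at the 0.0 default:
lambda_align = getattr(getattr(model, "decoder", None), "lambda_align", 0.0)
print(lambda_align)  # 0.0
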
7 changes: 4 additions & 3 deletions eole/config/models.py
@@ -308,7 +308,7 @@ class BaseModelConfig(Config):
CnnEncoderConfig,
MeanEncoderConfig,
] | None = Field(
default_factory=RnnEncoderConfig,
default=None,
discriminator="encoder_type",
description="Major parameters of an encoder.",
) # we shall use discriminators here
@@ -318,7 +318,7 @@
RnnDecoderConfig,
CnnDecoderConfig,
] | None = Field(
default_factory=RnnDecoderConfig,
default=None,
discriminator="decoder_type",
description="Major parameters of a decoder.",
) # we shall use discriminators here
@@ -417,7 +417,8 @@ def update_model_opts(self):

if self.encoder is not None:
self.encoder.src_word_vec_size = self.embeddings.src_word_vec_size
self.decoder.tgt_word_vec_size = self.embeddings.tgt_word_vec_size
if self.decoder is not None:
self.decoder.tgt_word_vec_size = self.embeddings.tgt_word_vec_size

# causing some weird recursion issue in unit test, to investigate
# if self.encoder is not None:
29 changes: 18 additions & 11 deletions eole/config/run.py
@@ -30,15 +30,20 @@ class TrainConfig(
description="Print data loading and statistics for all process "
"(default only logs the first process shard).",
) # not sure this still works
model: ModelConfig # TypeAdapter handling discrimination directly
model: ModelConfig | None = None # TypeAdapter handling discrimination directly
training: TrainingConfig | None = Field(default_factory=TrainingConfig)

def get_model_path(self):
return self.training.get_model_path()

@classmethod
def get_defaults(cls):
return cls(src_vocab="dummy", tgt_vocab="dummy", data={}).model_dump()
def get_defaults(cls, architecture):
return cls(
src_vocab="dummy",
tgt_vocab="dummy",
data={},
model={"architecture": architecture},
).model_dump()

@field_validator("model", "training", mode="before")
@classmethod
@@ -53,20 +58,22 @@ def str_to_dict(cls, v) -> Dict:
@model_validator(mode="before")
@classmethod
def default_architecture(cls, data: Any) -> Any:
# somewhat dirty patch to make test_models pass, might do better later
# this explicit call to the field_validator should not be necessary
# only enforce default "custom" if some model settings are passed
if "model" in data.keys():
data["model"] = cls.str_to_dict(data["model"])
if isinstance(data.get("model", {}), dict):
if data.get("model", {}).get("architecture", None) is None:
if "model" not in data.keys():
data["model"] = {"architecture": "custom"}
else:
if isinstance(data.get("model", {}), dict):
if data.get("model", {}).get("architecture", None) is None:
data["model"]["architecture"] = "custom"
return data

@model_validator(mode="after")
def _validate_train_config(self):
if self.model is None and self.training.train_from is None:
raise ValueError(
"No model architecture is configured. "
"You should either finetune from an existing model, "
"or specify a model configuration."
)
return self


@@ -85,7 +92,7 @@ class PredictConfig(
src_subword_vocab: str | None = (
None # patch for CT2 inference engine (to improve later)
)
model: ModelConfig
model: ModelConfig | None = None

@model_validator(mode="after")
def _validate_predict_config(self):
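Net effect of the run.py changes: the model section becomes optional for both TrainConfig and PredictConfig, but training without one is only accepted when resuming from a checkpoint. A rough usage sketch, assuming (as get_defaults does) that dummy vocab paths and an empty data dict pass the remaining validation:

from eole.config.run import TrainConfig

# Explicit architecture, matching the updated CI commands above.
cfg = TrainConfig(
    src_vocab="dummy",
    tgt_vocab="dummy",
    data={},
    model={"architecture": "rnn"},
)

# No model section and no train_from: the new after-validator rejects it.
try:
    TrainConfig(src_vocab="dummy", tgt_vocab="dummy", data={})
except ValueError as err:  # pydantic surfaces this as a ValidationError
    print(err)
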
4 changes: 2 additions & 2 deletions eole/decoders/ensemble.py
@@ -10,7 +10,7 @@
import copy
from eole.encoders.encoder import EncoderBase
from eole.decoders.decoder import DecoderBase
from eole.models import EncoderDecoderModel, get_model_class
from eole.models import EncoderDecoderModel, BaseModel


class EnsembleDecoderOutput(object):
@@ -191,7 +191,7 @@ def load_test_model(config, device_id=0):
for i, model_path in enumerate(config.model_path):
config2.model_path = [config.model_path[i]]
print(config2.model)
vocabs, model, model_config = get_model_class(config2.model).load_test_model(
vocabs, model, model_config = BaseModel.load_test_model(
config2, device_id, model_path=model_path
)
if shared_vocabs is None:
4 changes: 2 additions & 2 deletions eole/predict/__init__.py
@@ -9,7 +9,7 @@
from eole.predict.greedy_search import GreedySearch, GreedySearchLM
from eole.predict.penalties import PenaltyBuilder
from eole.decoders.ensemble import load_test_model as ensemble_load_test_model
from eole.models import get_model_class
from eole.models import BaseModel
import codecs


@@ -44,7 +44,7 @@ def build_predictor(config, device_id=0, report_score=True, logger=None, out_fil
load_test_model = (
ensemble_load_test_model
if len(config.model_path) > 1
else get_model_class(config.model).load_test_model
else BaseModel.load_test_model
)

vocabs, model, model_config = load_test_model(config, device_id)
10 changes: 5 additions & 5 deletions eole/tests/pull_request_check.sh
@@ -89,6 +89,7 @@ rm -f -r $TMP_OUT_DIR/sample
echo -n "[+] Testing NMT vocab? /transforms prepare..."
${PYTHON} eole/bin/main.py train \
-config ${DATA_DIR}/data.yaml \
-model '{"architecture": "rnn"}' \
-save_data $TMP_OUT_DIR/eole.train.check \
-n_sample 30 \
-overwrite \
@@ -109,7 +110,7 @@ ${PYTHON} eole/bin/main.py train \
-tgt_vocab $TMP_OUT_DIR/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10}' \
-report_every 5 \
-tensorboard \
@@ -126,7 +127,7 @@ ${PYTHON} eole/bin/main.py train \
-tgt_vocab $TMP_OUT_DIR/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}, "decoder":{"decoder_type": "rnn"}}' \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10, "valid_steps": 5}' \
-report_every 2 \
-tensorboard \
@@ -159,7 +160,7 @@ ${PYTHON} eole/bin/main.py train \
-tgt_vocab $TMP_OUT_DIR/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}, "decoder": {"decoder_type": "rnn", "coverage_attn": True, "lambda_coverage": 0.1}}' \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5}, "decoder": {"coverage_attn": True, "lambda_coverage": 0.1}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10}' \
-report_every 5 \
>> ${LOG_FILE} 2>&1
@@ -271,7 +272,7 @@ ${PYTHON} eole/bin/main.py train \
-src_vocab $TMP_OUT_DIR/eole.vocab.src \
-tgt_vocab $TMP_OUT_DIR/eole.vocab.tgt \
-src_vocab_size 1000 -tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10, "model_path": "'"$TMP_OUT_DIR"'/eole.model", "save_checkpoint_steps": 10}' \
-report_every 5 \
>> ${LOG_FILE} 2>&1
@@ -281,7 +282,6 @@ ${PYTHON} eole/bin/main.py train \
-src_vocab $TMP_OUT_DIR/eole.vocab.src \
-tgt_vocab $TMP_OUT_DIR/eole.vocab.tgt \
-src_vocab_size 1000 -tgt_vocab_size 1000 \
-model '{"hidden_size": 10, "embeddings": {"word_vec_size": 5}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 20, "train_from": "'"$TMP_OUT_DIR"'/eole.model/step_10", "save_checkpoint_steps": 10, "update_vocab": True, "reset_optim": "states"}' \
-report_every 5 \
>> ${LOG_FILE} 2>&1
27 changes: 18 additions & 9 deletions eole/tests/test_models.py
@@ -11,6 +11,7 @@
from eole.utils.misc import sequence_mask
from eole.config.run import TrainConfig
from eole.config.data import Dataset
from eole.config.models import CustomModelConfig

# In theory we should probably call ModelConfig here,
# but we can't because model building relies on some params
@@ -22,6 +23,7 @@
}, # actual file path (tested in validate)
src_vocab="dummy",
share_vocab=True,
model=CustomModelConfig(),
) # now required by validation


@@ -113,7 +115,7 @@ def encoder_forward(self, opt, source_l=3, bsize=1):

# Initialize vectors to compare size with
test_hid = torch.zeros(
self.opt.model.encoder.layers, bsize, opt.model.encoder.hidden_size
opt.model.encoder.layers, bsize, opt.model.encoder.hidden_size
)
test_out = torch.zeros(bsize, source_l, opt.model.decoder.hidden_size)

@@ -192,20 +194,28 @@ def test_method(self):
"""
# opt.brnn = False # deprecated and not used here

test_embeddings = [[], [("model", {"architecture": "transformer"})]]
test_embeddings = [
[("model", {"architecture": "rnn"})],
[("model", {"architecture": "transformer"})],
]

for p in test_embeddings:
_add_test(p, "embeddings_forward")

tests_encoder = [
[],
[
# ("encoder_type", "mean"),
("model", {"architecture": "custom", "encoder": {"encoder_type": "mean"}})
(
"model",
{
"architecture": "custom",
"encoder": {"encoder_type": "mean"},
"decoder": {"decoder_type": "rnn"},
},
)
],
# [('encoder_type', 'transformer'),
# ('word_vec_size', 16), ('hidden_size', 16)],
[],
]

for p in tests_encoder:
Expand All @@ -223,8 +233,8 @@ def test_method(self):
},
)
],
[("model", {"layers": 10})],
[("model", {"input_feed": 0})],
[("model", {"architecture": "rnn", "layers": 10})],
[("model", {"architecture": "rnn", "input_feed": 0})],
[
(
"model",
@@ -274,7 +284,7 @@ def test_method(self):
# [("encoder_type", "brnn"), ("brnn_merge", "sum")],
# not sure about this one, brnn_merge does not seem to exist in the codebase
[
("model", {"architecture": "custom", "encoder": {"encoder_type": "brnn"}})
("model", {"architecture": "rnn", "encoder": {"encoder_type": "brnn"}})
# ("encoder_type", "brnn"),
],
[
@@ -317,7 +327,6 @@ def test_method(self):
# ("encoder_type", "rnn"),
# ("global_attention", "mlp"),
],
[],
]

