From f58569351caa2f10dde6bf96516346aef1964a7c Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Fri, 5 Apr 2024 13:34:19 -0700 Subject: [PATCH 1/4] Fixing the dependency situation (#613) --- .github/workflows/main.yml | 3 ++- CHANGELOG.md | 4 ++++ pyproject.toml | 8 ++++---- tango/__main__.py | 2 +- tango/integrations/beaker/executor.py | 10 +++++++++- tango/integrations/flax/data.py | 2 +- tango/integrations/flax/optim.py | 3 ++- tango/integrations/flax/util.py | 4 ++-- tango/integrations/transformers/__init__.py | 1 + tests/integrations/beaker/executor_test.py | 3 +++ tests/integrations/flax/train_test.py | 7 ++++++- 11 files changed, 35 insertions(+), 12 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3a589ca60..b00d8238a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -211,7 +211,7 @@ jobs: if: steps.virtualenv-cache.outputs.cache-hit != 'true' && (contains(matrix.task.extras, 'flax') || contains(matrix.task.extras, 'all')) run: | . .venv/bin/activate - pip install flax==0.6.1 jax==0.4.1 jaxlib==0.4.1 tensorflow-cpu==2.9.1 optax==0.1.3 + pip install flax jax jaxlib "tensorflow-cpu>=2.9.1" optax - name: Install editable (no cache hit) if: steps.virtualenv-cache.outputs.cache-hit != 'true' @@ -282,6 +282,7 @@ jobs: spec: | version: v2 description: GPU Tests + budget: ai2/oe-training tasks: - name: tests image: diff --git a/CHANGELOG.md b/CHANGELOG.md index 86fa40edb..5b842499e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Fixed + +- Fixed a bunch of dependencies + ## [v1.3.2](https://github.com/allenai/tango/releases/tag/v1.3.2) - 2023-10-27 ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 31a1807ab..6db17ec82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,10 +89,10 @@ fairscale = [ ] flax = [ "datasets>=1.12,<3.0", - "jax>=0.4.1,<=0.4.13", - "jaxlib>=0.4.1,<=0.4.13", - "flax>=0.6.1,<=0.7.0", - "optax>=0.1.2", + "jax", + "jaxlib", + "flax", + "optax", "tensorflow-cpu>=2.9.1" ] wandb = [ diff --git a/tango/__main__.py b/tango/__main__.py index c13d6e45e..49ec9cac5 100644 --- a/tango/__main__.py +++ b/tango/__main__.py @@ -115,7 +115,7 @@ class SettingsObject(NamedTuple): called_by_executor: bool -@click.group(**_CLICK_GROUP_DEFAULTS) +@click.group(name=None, **_CLICK_GROUP_DEFAULTS) @click.version_option(version=VERSION) @click.option( "--settings", diff --git a/tango/integrations/beaker/executor.py b/tango/integrations/beaker/executor.py index c7254433c..b0d95f5dc 100644 --- a/tango/integrations/beaker/executor.py +++ b/tango/integrations/beaker/executor.py @@ -355,6 +355,7 @@ def __init__( priority: Optional[Union[str, Priority]] = None, allow_dirty: bool = False, scheduler: Optional[BeakerScheduler] = None, + budget: Optional[str] = None, **kwargs, ): # Pre-validate arguments. @@ -365,6 +366,11 @@ def __init__( "Either 'beaker_image' or 'docker_image' must be specified for BeakerExecutor, but not both." ) + if budget is None: + raise ConfigurationError("You must specify a budget to use the beaker executor.") + else: + self._budget = budget + from tango.workspaces import LocalWorkspace, MemoryWorkspace if isinstance(workspace, MemoryWorkspace): @@ -1029,7 +1035,9 @@ def _build_experiment_spec( return ( experiment_name, ExperimentSpec( - tasks=[task_spec], description=f'Tango step "{step_name}" ({step.unique_id})' + tasks=[task_spec], + description=f'Tango step "{step_name}" ({step.unique_id})', + budget=self._budget, ), [step_graph_dataset], ) diff --git a/tango/integrations/flax/data.py b/tango/integrations/flax/data.py index b434d720a..483789670 100644 --- a/tango/integrations/flax/data.py +++ b/tango/integrations/flax/data.py @@ -40,7 +40,7 @@ def __init__( self.logger = logging.getLogger(FlaxDataLoader.__name__) - def __call__(self, rng: jax.random.PRNGKeyArray, do_distributed: bool): + def __call__(self, rng: jax._src.random.KeyArrayLike, do_distributed: bool): steps_per_epoch = self.dataset_size // self.batch_size if self.shuffle: diff --git a/tango/integrations/flax/optim.py b/tango/integrations/flax/optim.py index 920991560..ea773790a 100644 --- a/tango/integrations/flax/optim.py +++ b/tango/integrations/flax/optim.py @@ -28,6 +28,7 @@ class Optimizer(Registrable): :options: +ELLIPSIS optax::adabelief + optax::adadelta optax::adafactor optax::adagrad optax::adam @@ -100,7 +101,7 @@ def factory_func(): Optimizer.register("optax::" + name)(factory_func) # Register all learning rate schedulers. -for name, cls in optax._src.schedule.__dict__.items(): +for name, cls in optax.schedules.__dict__.items(): if isfunction(cls) and not name.startswith("_") and cls.__annotations__: factory_func = scheduler_factory(cls) LRScheduler.register("optax::" + name)(factory_func) diff --git a/tango/integrations/flax/util.py b/tango/integrations/flax/util.py index 387974e93..311440ce2 100644 --- a/tango/integrations/flax/util.py +++ b/tango/integrations/flax/util.py @@ -3,7 +3,7 @@ import jax -def get_PRNGkey(seed: int = 42) -> Union[Any, jax.random.PRNGKeyArray]: +def get_PRNGkey(seed: int = 42) -> Union[Any, jax._src.random.KeyArray]: """ Utility function to create a pseudo-random number generator key given a seed. @@ -11,7 +11,7 @@ def get_PRNGkey(seed: int = 42) -> Union[Any, jax.random.PRNGKeyArray]: return jax.random.PRNGKey(seed) -def get_multiple_keys(key, multiple: int = 1) -> Union[Any, jax.random.PRNGKeyArray]: +def get_multiple_keys(key, multiple: int = 1) -> Union[Any, jax._src.random.KeyArray]: """ Utility function to split a PRNG key into multiple new keys. Used in distributed training. diff --git a/tango/integrations/transformers/__init__.py b/tango/integrations/transformers/__init__.py index 386e48ecb..950de3680 100644 --- a/tango/integrations/transformers/__init__.py +++ b/tango/integrations/transformers/__init__.py @@ -70,6 +70,7 @@ transformers::Adafactor transformers::AdamW + transformers::LayerWiseDummyOptimizer - :class:`~tango.integrations.torch.LRScheduler`: All learning rate scheduler function from transformers are registered according to their type name (e.g. "transformers::linear"). diff --git a/tests/integrations/beaker/executor_test.py b/tests/integrations/beaker/executor_test.py index 6fe71e033..b8f20a908 100644 --- a/tests/integrations/beaker/executor_test.py +++ b/tests/integrations/beaker/executor_test.py @@ -19,6 +19,7 @@ def test_from_params(beaker_workspace_name: str): beaker_image="ai2/conda", github_token="FAKE_TOKEN", datasets=[{"source": {"beaker": "some-dataset"}, "mount_path": "/input"}], + budget="ai2/allennlp", ), workspace=BeakerWorkspace(workspace=beaker_workspace_name), clusters=["fake-cluster"], @@ -38,6 +39,7 @@ def test_init_with_mem_workspace(beaker_workspace_name: str): beaker_image="ai2/conda", github_token="FAKE_TOKEN", clusters=["fake-cluster"], + budget="ai2/allennlp", ) @@ -50,6 +52,7 @@ def settings(beaker_workspace_name: str) -> TangoGlobalSettings: "beaker_workspace": beaker_workspace_name, "install_cmd": "pip install .[beaker]", "clusters": ["ai2/allennlp-cirrascale", "ai2/general-cirrascale"], + "budget": "ai2/allennlp", }, ) diff --git a/tests/integrations/flax/train_test.py b/tests/integrations/flax/train_test.py index 0eb6bcebf..16b1b830f 100644 --- a/tests/integrations/flax/train_test.py +++ b/tests/integrations/flax/train_test.py @@ -20,5 +20,10 @@ def test_trainer(self): ], ) assert ( - result_dir / "train" / "work" / "checkpoint_state_latest" / "checkpoint_0" + result_dir + / "train" + / "work" + / "checkpoint_state_latest" + / "checkpoint_0" + / "checkpoint" ).is_file() From 94b4df6ebaacae7ed1bc2454d481e51d04fee5ca Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:49:32 -0700 Subject: [PATCH 2/4] Update more-itertools requirement from <10.0,>=8.0 to >=8.0,<11.0 (#594) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Dirk Groeneveld --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6db17ec82..7e45ec466 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "click-help-colors>=0.9.1,<0.10", "rich>=12.3,<14.0", "tqdm>=4.62,<5.0", - "more-itertools>=8.0,<10.0", + "more-itertools>=8.0,<11.0", "sqlitedict", "glob2>=0.7", "petname>=2.6,<3.0", From 437aa160aff6ac47c85142038ca517c780ea6efb Mon Sep 17 00:00:00 2001 From: Jun Harashima Date: Wed, 29 May 2024 09:49:14 +0900 Subject: [PATCH 3/4] fix first_steps.md (#566) --- docs/source/first_steps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/first_steps.md b/docs/source/first_steps.md index 16989205c..cb7d34200 100644 --- a/docs/source/first_steps.md +++ b/docs/source/first_steps.md @@ -247,7 +247,7 @@ Computing...: 100%|##########| 100/100 [00:05<00:00, 18.99it/s] ✓ The output for "add_numbers" is in workspace/runs/live-tarpon/add_numbers ``` -The last line in the output tells us where we can find the result of our "add_numbers" step. `live-parpon` is +The last line in the output tells us where we can find the result of our "add_numbers" step. `live-tarpon` is the name of the run. Run names are randomly generated and may be different on your machine. `add_numbers` is the name of the step in your config. The whole path is a symlink to a directory, which contains (among other things) a file `data.json`: From 6aaa8ff0f20387c51dcdf4ab0718787eb55ea794 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 30 May 2024 10:33:29 -0700 Subject: [PATCH 4/4] Update wandb requirement from <0.14.3,>=0.12 to >=0.12,<0.15.9 (#598) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Dirk Groeneveld --- .github/workflows/main.yml | 3 +-- CHANGELOG.md | 1 + pyproject.toml | 2 +- tango/integrations/transformers/__init__.py | 1 + tango/integrations/wandb/step_cache.py | 8 +++----- tango/integrations/wandb/util.py | 13 ++++++++++++- 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b00d8238a..c2131a5a1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,7 +20,6 @@ env: WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }} BEAKER_WORKSPACE: ai2/tango-testing - BEAKER_DEFAULT_CLUSTER: ai2/allennlp-cirrascale BEAKER_IMAGE: petew/tango-testing GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -288,7 +287,7 @@ jobs: image: beaker: ${{ env.BEAKER_IMAGE }} context: - cluster: ${{ env.BEAKER_DEFAULT_CLUSTER }} + preemptible: true resources: gpuCount: 2 envVars: diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b842499e..aa674066c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fixed a bunch of dependencies +- Upgraded to new version of wandb ## [v1.3.2](https://github.com/allenai/tango/releases/tag/v1.3.2) - 2023-10-27 diff --git a/pyproject.toml b/pyproject.toml index 7e45ec466..3b29a2ad2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,7 @@ flax = [ "tensorflow-cpu>=2.9.1" ] wandb = [ - "wandb>=0.12,<0.14.3", + "wandb>=0.16", "retry" ] beaker = [ diff --git a/tango/integrations/transformers/__init__.py b/tango/integrations/transformers/__init__.py index 950de3680..817c97f7a 100644 --- a/tango/integrations/transformers/__init__.py +++ b/tango/integrations/transformers/__init__.py @@ -93,6 +93,7 @@ transformers::constant transformers::constant_with_warmup transformers::cosine + transformers::cosine_with_min_lr transformers::cosine_with_restarts transformers::inverse_sqrt transformers::linear diff --git a/tango/integrations/wandb/step_cache.py b/tango/integrations/wandb/step_cache.py index 35851b2c1..56ea8b84c 100644 --- a/tango/integrations/wandb/step_cache.py +++ b/tango/integrations/wandb/step_cache.py @@ -72,7 +72,7 @@ def _step_artifact_name(self, step: Union[Step, StepInfo]) -> str: def _step_result_remote( # type: ignore self, step: Union[Step, StepInfo] - ) -> Optional[wandb.apis.public.Artifact]: + ) -> Optional[wandb.Artifact]: artifact_kind = (step.metadata or {}).get("artifact_kind", ArtifactKind.STEP_RESULT.value) try: return self.wandb_client.artifact( @@ -88,9 +88,7 @@ def _step_result_remote( # type: ignore def create_step_result_artifact(self, step: Step, objects_dir: Optional[PathOrStr] = None): self._upload_step_remote(step, objects_dir) - def get_step_result_artifact( - self, step: Union[Step, StepInfo] - ) -> Optional[wandb.apis.public.Artifact]: + def get_step_result_artifact(self, step: Union[Step, StepInfo]) -> Optional[wandb.Artifact]: artifact_kind = (step.metadata or {}).get("artifact_kind", ArtifactKind.STEP_RESULT.value) try: return self.wandb_client.artifact( @@ -144,7 +142,7 @@ def use_step_result_artifact(self, step: Union[Step, StepInfo]) -> None: def _download_step_remote(self, step_result, target_dir: PathOrStr): try: - step_result.download(root=target_dir, recursive=True) + step_result.download(root=target_dir) except (WandbError, ValueError): raise RemoteNotFoundError() diff --git a/tango/integrations/wandb/util.py b/tango/integrations/wandb/util.py index 29d5ae644..7f5b3d211 100644 --- a/tango/integrations/wandb/util.py +++ b/tango/integrations/wandb/util.py @@ -1,4 +1,5 @@ import os +import re import warnings from enum import Enum @@ -13,7 +14,17 @@ def is_missing_artifact_error(err: WandbError): Check if a specific W&B error is caused by a 404 on the artifact we're looking for. """ # This is brittle, but at least we have a test for it. - return "does not contain artifact" in err.message + + # This is a workaround for a bug in the wandb API + if err.message == "'NoneType' object has no attribute 'get'": + return True + + if re.search(r"^artifact '.*' not found in '.*'$", err.message): + return True + + return ("does not contain artifact" in err.message) or ( + "Unable to fetch artifact with name" in err.message + ) def check_environment():