From f29c95dc4f796cc6f9b93b6df19aa6bb7f833341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Rowlands=20=28=EB=B3=80=EA=B8=B0=ED=98=B8=29?= Date: Fri, 29 Sep 2023 21:22:37 +0900 Subject: [PATCH 1/2] ref: document `dvc artifacts get` (#4809) * command-ref: document `dvc artifacts get` * api-ref: document `dvc.api.artifacts_show()` * review updates * add CLI examples * Update content/docs/api-reference/artifacts_show.md * Apply suggestions from code review * Restyled by prettier (#4890) Co-authored-by: Restyled.io * Apply suggestions from code review * Restyled by prettier (#4891) Co-authored-by: Restyled.io --------- Co-authored-by: Dave Berenbaum Co-authored-by: restyled-io[bot] <32688539+restyled-io[bot]@users.noreply.github.com> Co-authored-by: Restyled.io --- content/docs/api-reference/artifacts_show.md | 106 ++++++++++++++ .../docs/command-reference/artifacts/get.md | 134 ++++++++++++++++++ .../docs/command-reference/artifacts/index.md | 27 ++++ content/docs/sidebar.json | 15 ++ 4 files changed, 282 insertions(+) create mode 100644 content/docs/api-reference/artifacts_show.md create mode 100644 content/docs/command-reference/artifacts/get.md create mode 100644 content/docs/command-reference/artifacts/index.md diff --git a/content/docs/api-reference/artifacts_show.md b/content/docs/api-reference/artifacts_show.md new file mode 100644 index 0000000000..08c81b465c --- /dev/null +++ b/content/docs/api-reference/artifacts_show.md @@ -0,0 +1,106 @@ +# dvc.api.artifacts_show() + +Get the path and Git revision for an artifact tracked in a +DVC repository. 
+ +```py +def artifacts_show( + name: str, + version: Optional[str] = None, + stage: Optional[str] = None, + repo: Optional[str] = None, +) -> Dict[str, str]: +``` + +## Usage: + +```py +import dvc.api + +artifact = dvc.api.artifacts_show( + 'text-classification', + repo='https://github.com/iterative/example-get-started.git', +) +``` + +## Description + +Returns a path and Git revision for a named artifact which can then be used in +other Python API calls. + +The returned dictionary will be of the form: + +```py +{ + 'path': 'model.pkl', + 'rev': 'c7c6ae0', +} +``` + +where `path` contains the relative path to the artifact in the DVC repository, +and `rev` contains the Git revision for the specified artifact version or stage. + +When neither `version` nor `stage` is provided, the Git revision for the latest +version of the artifact will be returned. + +## Parameters + +- `name` (required) - name of the artifact. By default DVC will search for + artifacts declared in a `dvc.yaml` file located at the root of the DVC + repository. Artifacts declared in other `dvc.yaml` files should be addressed + in the form `path/to/dvc.yaml:artifact_name` or `path/to:artifact_name` (where + `dvc.yaml` is omitted). + +- `version` - version of the artifact (mutually exclusive with `stage`). + +- `stage` - stage of the artifact (mutually exclusive with `version`). + +- `repo` - the location of the DVC project. It can be a URL or a file system + path. Both HTTP and SSH protocols are supported for online Git repos (e.g. + `[user@]server:project.git`). _Default_: The current project (found by walking + up from the current working directory tree). 
+ +## Example: Read the contents of an artifact + +```py +import pickle +import dvc.api + +artifact = dvc.api.artifacts_show( + 'text-classification', + version='v1.0.0', + repo='https://github.com/iterative/example-get-started.git', +) +data = dvc.api.read( + artifact['path'], + rev=artifact['rev'], + repo='https://github.com/iterative/example-get-started.git', + mode='rb', +) +model = pickle.loads(data) +``` + +This example uses the returned path and Git revision in conjunction with +`dvc.api.read()` to read the file content for the artifact. + +## Example: Download an artifact + +```py +import os +import dvc.api + +artifact = dvc.api.artifacts_show( + 'text-classification', + stage='prod', + repo='https://github.com/iterative/example-get-started.git', +) +fs = dvc.api.DVCFileSystem( + 'https://github.com/iterative/example-get-started.git', + rev=artifact['rev'], +) +fs.get_file(artifact['path'], os.path.basename(artifact['path'])) +``` + +This example uses the returned path and Git revision in conjunction with +`dvc.api.DVCFileSystem` to download the artifact to the current working +directory. diff --git a/content/docs/command-reference/artifacts/get.md b/content/docs/command-reference/artifacts/get.md new file mode 100644 index 0000000000..049c5c3d63 --- /dev/null +++ b/content/docs/command-reference/artifacts/get.md @@ -0,0 +1,134 @@ +# artifacts get + +Download an artifact tracked in a DVC project into the current +working directory. + +## Synopsis + +```usage +usage: dvc artifacts get [-h] [-q | -v] + [--rev [<version>]] [--stage [<stage>]] + [-o [<path>]] [-j <number>] [-f] + [--config CONFIG] + [--remote REMOTE] [--remote-config [REMOTE_CONFIG ...]] + url name + +positional arguments: + url Location of DVC repository to download from + name Name of artifact in the repository +``` + +## Description + +Provides a way to download artifacts tracked in a DVC project. Unlike `dvc get`, +`dvc artifacts get` supports downloading an artifact by name, rather than by +path. 
Likewise, `dvc artifacts get` supports downloading a registered artifact +version or stage, instead of requiring a specified Git revision. + +`dvc artifacts get` also supports downloading artifacts both from the +model registry and from DVC remotes. + + + +Downloading an artifact from the model registry only requires a +valid Studio +[access token](/doc/studio/user-guide/account-management#studio-access-token). +It does not require the client to have DVC remote credentials. + + + +The `url` argument specifies the address of the DVC or Git repository containing +the artifact. Both HTTP and SSH protocols are supported (e.g. +`[user@]server:project.git`). `url` can also be a local file system path +(including the current project e.g. `.`). + +The `name` argument specifies the name of the artifact to download. By default +DVC will search for artifacts declared in a `dvc.yaml` file located at the root +of the DVC repository. Artifacts declared in other `dvc.yaml` files should be +addressed in the form `path/to/dvc.yaml:artifact_name` or +`path/to:artifact_name` (where `dvc.yaml` is omitted). + + + +`dvc artifacts get` will first try to download artifacts via the model +registry. If you do not have a valid Studio token, or the artifact is not +tracked in the model registry, DVC will fall back to downloading the artifact +from the project's default DVC remote. + + + +## Options + +- `--rev <version>` - Version of the artifact to download. The latest version of + the artifact is used by default when neither `rev` nor `stage` is specified. + +- `--stage <stage>` - Stage of the artifact to download. The latest version of + the artifact is used by default when neither `rev` nor `stage` is specified. + +- `-o <path>`, `--out <path>` - specify a `path` to the desired location in the + workspace to place the downloaded file or directory (instead of using the + current working directory). Directories specified in the path will be created + by this command. 
+ +- `-j <number>`, `--jobs <number>` - parallelism level for DVC to download data + from the remote. The default value is `4 * cpu_count()`. Using more jobs may + speed up the operation. Note that the default value can be set in the source + repo using the `jobs` config option of `dvc remote modify`. + +- `-f`, `--force` - when using `--out` to specify a local target file or + directory, the operation will fail if those paths already exist. This flag + will force the operation, causing local files/dirs to be overwritten by the + command. + +- `--config <path>` - path to a [config file](/doc/command-reference/config) + that will be merged with the config in the target repository. + +- `--remote <name>` - name of the `dvc remote` to set as a default in the target + repository. Only applicable when downloading artifacts from a DVC remote. + +- `--remote-config [<name>=<value> ...]` - `dvc remote` config options to merge + with a remote's config (default or one specified by `--remote`) in the target + repository. Only applicable when downloading artifacts from a DVC remote. + +- `-h`, `--help` - prints the usage/help message, and exit. + +- `-q`, `--quiet` - do not write anything to standard output. Exit with 0 if no + problems arise, otherwise 1. + +- `-v`, `--verbose` - displays detailed tracing information. + +## Example: Download an artifact from a DVC remote + +```cli +$ dvc artifacts get https://github.com/iterative/example-get-started.git text-classification --rev=v1.0.0 +Downloaded 1 file(s) to 'model.pkl' +``` + +In this example, we download version `v1.0.0` of the artifact. Since we have no +Studio credentials set in our environment, `dvc artifacts get` will download the +artifact from the default DVC remote defined in the repository. 
+ +## Example: Download an artifact using a Studio token + +```cli +$ DVC_STUDIO_TOKEN=mytoken dvc artifacts get https://github.com/iterative/example-get-started.git text-classification --stage=prod +Downloaded 1 file(s) to 'model.pkl' +``` + +In this example, we download stage `prod` of the artifact. Since we have set our +Studio access token in the `DVC_STUDIO_TOKEN` environment variable, +`dvc artifacts get` will download the artifact via the model +registry rather than from a DVC remote. + +## Example: Download an artifact defined in a specific `dvc.yaml` file + +```cli +$ dvc artifacts get https://github.com/iterative/lstm_seq2seq.git results/dvc.yaml:best +Downloaded 1 file(s) to 'epoch=0-step=16.ckpt' +``` + +In this example, we download the latest version of the `best` artifact. In this +case, the artifact is defined in `results/dvc.yaml` so we must include the path +to the `dvc.yaml` file when addressing the artifact. Since we do not specify +`--rev` or `--stage`, `dvc artifacts get` will download the latest version of +the artifact by default. diff --git a/content/docs/command-reference/artifacts/index.md b/content/docs/command-reference/artifacts/index.md new file mode 100644 index 0000000000..be9286dcf6 --- /dev/null +++ b/content/docs/command-reference/artifacts/index.md @@ -0,0 +1,27 @@ +# artifacts + +Commands for working with DVC artifacts and the model +registry. + +## Synopsis + +```usage +usage: dvc artifacts [-h] [-q | -v] {get} ... + +positional arguments: + COMMAND + get Download an artifact from a DVC project. +``` + +## Description + +`dvc artifacts` subcommands provide a command line client for working with +model registry artifacts. + +## Options + +- `-h`, `--help` - prints the usage/help message, and exit. + +- `-q`, `--quiet` - do not write anything to standard output. + +- `-v`, `--verbose` - displays detailed tracing information. 
diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 2a7626e132..d633a2dbe6 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -244,6 +244,17 @@ "label": "add", "slug": "add" }, + { + "label": "artifacts", + "slug": "artifacts", + "source": "artifacts/index.md", + "children": [ + { + "label": "artifacts get", + "slug": "get" + } + ] + }, { "label": "cache", "slug": "cache", @@ -571,6 +582,10 @@ "slug": "dvcfilesystem", "label": "DVCFileSystem" }, + { + "slug": "artifacts_show", + "label": "artifacts_show()" + }, { "slug": "exp_show", "label": "exp_show()" From 504c2855344c343a5dc91db389976a744beacc57 Mon Sep 17 00:00:00 2001 From: Dave Berenbaum Date: Fri, 29 Sep 2023 15:16:10 -0400 Subject: [PATCH 2/2] Dvclive pipeline transition (#4888) * dvclive: explain transition to pipelines * mention other friction points in transitioning to pipelines * Update content/docs/dvclive/how-it-works.md Co-authored-by: David de la Iglesia Castro * minor edit --------- Co-authored-by: David de la Iglesia Castro --- content/basic-concepts/pipeline.md | 8 +- content/docs/dvclive/how-it-works.md | 88 ++++++++++++++----- content/docs/dvclive/index.md | 19 ++-- content/docs/dvclive/live/index.md | 3 + content/docs/dvclive/live/log_artifact.md | 3 + content/docs/dvclive/live/log_param.md | 6 ++ content/docs/dvclive/live/log_params.md | 6 ++ .../start/experiments/experiment-pipelines.md | 4 +- 8 files changed, 103 insertions(+), 34 deletions(-) diff --git a/content/basic-concepts/pipeline.md b/content/basic-concepts/pipeline.md index 61354719a3..9a38d78cc3 100644 --- a/content/basic-concepts/pipeline.md +++ b/content/basic-concepts/pipeline.md @@ -1,6 +1,12 @@ --- name: Pipeline -match: [pipeline, pipelines, 'data pipeline', 'data pipelines', 'dvc pipelines'] +match: + - pipeline + - pipelines + - 'data pipeline' + - 'data pipelines' + - 'dvc pipelines' + - 'dvc pipeline' tooltip: >- DVC pipelines describe data processing workflows in a standard 
declarative YAML format ([`dvc.yaml`](/doc/user-guide/project-structure/dvcyaml-files)). diff --git a/content/docs/dvclive/how-it-works.md b/content/docs/dvclive/how-it-works.md index 8172cc1512..315a00e626 100644 --- a/content/docs/dvclive/how-it-works.md +++ b/content/docs/dvclive/how-it-works.md @@ -109,11 +109,58 @@ with Git, in which case you can use ## Setup to Run with DVC -You can create or modify the `dvc.yaml` file at the base of your repository (or -elsewhere) to define a [pipeline](#setup-to-run-with-dvc) to run experiments -with DVC or -[customize plots](/doc/user-guide/experiment-management/visualizing-plots#defining-plots). -A pipeline stage for model training might look like: +Running experiments with DVC provides a structured and reproducible +pipeline for end-to-end model training. To run experiments with +DVC, define a pipeline using `dvc stage add` or by editing `dvc.yaml`. A +pipeline stage for model training might look like: + + + + +```cli +$ dvc stage add --name train \ + --deps data_dir --deps train.py \ + --outs model.pt --outs dvclive \ + python train.py +``` + + + + +```yaml +stages: + train: + cmd: python train.py + deps: + - train.py + - data_dir + outs: + - model.pt + - dvclive +``` + + + + +Adding the DVCLive [directory] to the [outputs] will add it to the DVC [cache] +(if you previously tracked the directory in Git, you must first stop tracking it +there). If you want to keep it in Git, you can disable the cache. 
You can also +choose to cache only some paths, like keeping lightweight metrics in Git but +adding more heavyweight plots data to the cache: + + + + +```cli +$ dvc stage add --name train \ + --deps data_dir --deps train.py \ + --outs model.pt --outs-no-cache dvclive/metrics.json \ + --outs dvclive/plots \ + python train.py +``` + + + ```yaml stages: @@ -121,10 +168,22 @@ stages: cmd: python train.py deps: - train.py + - data_dir outs: - model.pt + - dvclive/metrics.json: + cache: false + - dvclive/plots ``` + + + +Now you can run an experiment using `dvc exp run`. Instead of DVCLive handling +caching and saving experiments, DVC will do this at the end of each run. See +examples of how to [add DVCLive to a pipeline] or [add a pipeline to DVCLive +code], including how to parametrize your code to iterate on experiments. + You may have previously tracked [outputs] with `Live.log_artifact()` that @@ -135,24 +194,11 @@ pipeline. You can optionally drop `Live.log_artifact()` from your code. -Optionally add any subpaths of the DVCLive [directory] to the [outputs]. DVC -will [cache] them by default, and you can use those paths as [dependencies] -downstream in your pipeline. 
For example, to cache all DVCLive plots: - -```diff - stages: - train: - cmd: python train.py - deps: - - train.py - outs: - - model.pt -+ - dvclive/plots -``` - [directory]: /doc/dvclive/how-it-works#directory-structure [cache]: /doc/start/data-management/data-versioning [outputs]: /doc/user-guide/pipelines/defining-pipelines#outputs [dependencies]: /doc/user-guide/pipelines/defining-pipelines#simple-dependencies -[pipelines]: /doc/start/experiments/experiment-pipelines +[pipeline]: /doc/start/experiments/experiment-pipelines [generates]: /doc/dvclive/live/make_dvcyaml +[add DVCLive to a pipeline]: /doc/start/data-management/metrics-parameters-plots +[add a pipeline to DVCLive code]: /doc/start/experiments/experiment-pipelines diff --git a/content/docs/dvclive/index.md b/content/docs/dvclive/index.md index 8852abff85..4882260c00 100644 --- a/content/docs/dvclive/index.md +++ b/content/docs/dvclive/index.md @@ -153,23 +153,20 @@ with Live() as live: ## Outputs After you run your training code, all the logged data will be stored in the -`dvclive` directory. Check the [DVCLive outputs](/doc/dvclive/how-it-works) page -for more details. +`dvclive` [directory] and [tracked] as a DVC experiment for +analysis and comparison. ## Run with DVC Experimenting in Python interactively (like in notebooks) is great for exploration, but eventually you may need a more structured way to run -reproducible experiments. By configuring DVC [pipelines], you can [run -experiments] with `dvc exp run`. This will track the inputs and outputs of code, -and enable more advanced workflows like multi-step pipelines and queueing -multiple experiments or even an entire grid search. See examples of how to [add -DVCLive to a pipeline] or [add a pipeline to DVCLive code], or get more -information about how to [setup a pipeline] to work with DVCLive. +reproducible experiments. By configuring DVC pipelines, you can +[run experiments] with `dvc exp run`. 
Pipelines help you organize your ML +workflow beyond a single notebook or script so you can modularize and +parametrize your code. See how to [setup a pipeline] to work with DVCLive. [release notes]: https://github.com/iterative/dvclive/releases/tag/3.0.0 +[directory]: /doc/dvclive/how-it-works +[tracked]: /doc/start/experiments/experiment-tracking [run experiments]: /doc/user-guide/experiment-management/running-experiments -[pipelines]: /doc/user-guide/pipelines -[add DVCLive to a pipeline]: /doc/start/data-management/metrics-parameters-plots -[add a pipeline to DVCLive code]: /doc/start/experiments/experiment-pipelines [setup a pipeline]: /doc/dvclive/how-it-works#setup-to-run-with-dvc diff --git a/content/docs/dvclive/live/index.md b/content/docs/dvclive/live/index.md index 2ebcceea0e..c88ce42d58 100644 --- a/content/docs/dvclive/live/index.md +++ b/content/docs/dvclive/live/index.md @@ -93,6 +93,9 @@ You can use `Live()` as a context manager. When exiting the context manager, - `cache_images` - If `True`, DVCLive will cache any images logged with `Live.log_image()` as part of `Live.end()`. Defaults to `False`. + If running a DVC pipeline, `cache_images` will be ignored, and + you should instead cache images as pipeline outputs. + - `exp_message` - If not `None`, and `save_dvc_exp` is `True`, the provided string will be passed to [`dvc exp save --message`](/doc/command-reference/exp/save#--message). diff --git a/content/docs/dvclive/live/log_artifact.md b/content/docs/dvclive/live/log_artifact.md index e29f673961..3ce6887249 100644 --- a/content/docs/dvclive/live/log_artifact.md +++ b/content/docs/dvclive/live/log_artifact.md @@ -77,6 +77,9 @@ it in the model registry. Git. Defaults to `True`, but set to `False` if you want to annotate metadata about the artifact without storing a copy in the DVC cache. + If running a DVC pipeline, `cache` will be ignored, and you + should instead cache artifacts as pipeline outputs. 
+ ## Exceptions - `dvclive.error.InvalidDataTypeError` - thrown if the provided `path` does not diff --git a/content/docs/dvclive/live/log_param.md b/content/docs/dvclive/live/log_param.md index 578831d8fb..69bafc11c1 100644 --- a/content/docs/dvclive/live/log_param.md +++ b/content/docs/dvclive/live/log_param.md @@ -34,6 +34,9 @@ The logged params can be visualized with `dvc params`: $ dvc params diff dvclive/params.yaml ``` +If you use DVC pipelines, [parameter dependencies] are tracked +automatically, and you can skip logging them with DVCLive. + ## Parameters @@ -57,3 +60,6 @@ $ dvc params diff dvclive/params.yaml Dict[str, "ParamLike"] ] ``` + +[parameter dependencies]: + /doc/user-guide/pipelines/defining-pipelines#parameter-dependencies diff --git a/content/docs/dvclive/live/log_params.md b/content/docs/dvclive/live/log_params.md index f395d039db..63b999a3c6 100644 --- a/content/docs/dvclive/live/log_params.md +++ b/content/docs/dvclive/live/log_params.md @@ -47,6 +47,9 @@ The logged params can be visualized with `dvc params`: dvc params diff dvclive/params.yaml ``` +If you use DVC pipelines, [parameter dependencies] are tracked +automatically, and you can skip logging them with DVCLive. 
+ ## Parameters @@ -68,3 +71,6 @@ dvc params diff dvclive/params.yaml Dict[str, "ParamLike"] ] ``` + +[parameter dependencies]: + /doc/user-guide/pipelines/defining-pipelines#parameter-dependencies diff --git a/content/docs/start/experiments/experiment-pipelines.md b/content/docs/start/experiments/experiment-pipelines.md index c9c8768df2..26345a2a0e 100644 --- a/content/docs/start/experiments/experiment-pipelines.md +++ b/content/docs/start/experiments/experiment-pipelines.md @@ -113,7 +113,7 @@ $ dvc stage add -n train \ $ dvc stage add -n evaluate \ -p base,evaluate \ -d src/evaluate.py -d models/model.pkl -d data/test_data \ - python src/evaluate.py + -o results python src/evaluate.py ``` The `dvc.yaml` file is updated automatically and should include all the stages @@ -155,6 +155,8 @@ stages: params: - base - evaluate + outs: + - results ```