diff --git a/.github/workflows/testpydra.yml b/.github/workflows/testpydra.yml
index 1684191129..e46533f516 100644
--- a/.github/workflows/testpydra.yml
+++ b/.github/workflows/testpydra.yml
@@ -93,6 +93,25 @@ jobs:
     - name: Update pip
       run: python -m pip install --upgrade pip
 
+    - name: Set-up Git-annex
+      run: |
+        if [ "${{ runner.os }}" == "Linux" ]; then
+          wget -O- http://neuro.debian.net/lists/jammy.us-ca.libre | sudo tee /etc/apt/sources.list.d/neurodebian.sources.list
+          sudo apt-key adv --recv-keys --keyserver hkps://keyserver.ubuntu.com 0xA5D32F012649A5A9
+          sudo apt-get update
+          sudo apt-get install git-annex-standalone
+        elif [ "${{ runner.os }}" == "macOS" ]; then
+          brew install git-annex
+        elif [ "${{ runner.os }}" == "Windows" ]; then
+          pip install datalad-installer
+          datalad-installer git-annex -m datalad/packages
+        fi
+
+    - name: Set-up git credentials
+      run: |
+        git config --global user.email "github-actions[bot]@users.noreply.github.com"
+        git config --global user.name "github-actions[bot]"
+
     - name: Determine installation target
       run: |
         if [[ "$INSTALL" = "sdist" ]]; then
diff --git a/pydra/engine/tests/utils.py b/pydra/engine/tests/utils.py
index b2f3b6652d..f32343de04 100644
--- a/pydra/engine/tests/utils.py
+++ b/pydra/engine/tests/utils.py
@@ -29,6 +29,18 @@
     reason="sge not available",
 )
 
+def _git_annex_version_ok():
+    """True if git-annex is at least 8.20200309, comparing the full version.
+
+    NB: slicing the raw output with ``[:6]`` truncated "8.20200309" to
+    "8.2020", which compares *less than* 8.20200309 and wrongly skipped
+    the tests for every supported 8.x release."""
+    out = sp.check_output(["git-annex", "version", "--raw"], universal_newlines=True)
+    return float(out.split("-")[0]) >= 8.20200309
+
+need_gitannex = pytest.mark.skipif(
+    # `or` short-circuits, so the version probe only runs when git-annex exists
+    not shutil.which("git-annex") or not _git_annex_version_ok(),
+    reason="git-annex is not installed or version is less than 8.20200309",
+)
+
 
 def result_no_submitter(shell_task, plugin=None):
     """helper function to return result when running without submitter"""
diff --git a/pydra/tasks/datalad.py b/pydra/tasks/datalad.py
new file mode 100644
index 0000000000..1e34672756
--- /dev/null
+++ b/pydra/tasks/datalad.py
@@ -0,0 +1,184 @@
+"""A Pydra task that fetches files through DataLad (modeled on nipype's :obj:`~nipype.interfaces.utility.base.IdentityInterface` with a grafted Datalad getter)."""
+import os
+import logging
+import typing as ty
+from pathlib import Path
+from ..engine.specs import (
+    File,
+    Directory,
+    SpecInfo,
+    BaseSpec,
+)
+from ..engine.core import TaskBase
+from ..engine.helpers import output_from_inputfields
+from ..utils.messenger import AuditFlag
+
+logger = logging.getLogger("pydra.tasks.datalad")
+
+input_fields = [
+    (
+        "in_file",
+        str,
+        {
+            "help_string": "Path to the data to be downloaded through datalad",
+            "mandatory": True,
+        },
+    ),
+    (
+        "dataset_path",
+        Directory,
+        {
+            "help_string": "Path to the dataset that will be used to get data",
+            "mandatory": True,
+        },
+    ),
+    (
+        "dataset_url",
+        str,
+        {
+            "help_string": "URL to the dataset that will be used to get data",
+        },
+    ),
+]
+
+
+output_fields = [
+    (
+        "out_file",
+        File,
+        {
+            "help_string": "file downloaded through datalad",
+            "requires": ["in_file"],
+            "output_file_template": "{in_file}",
+        },
+    )
+]
+
+# define a TaskBase class
+class DataladInterface(TaskBase):
+    """Fetch ``in_file`` from a DataLad dataset at ``dataset_path``.
+
+    If the dataset is not yet present, it is installed from ``dataset_url``
+    first; the retrieved file's absolute path is exposed as ``out_file``."""
+
+    def __init__(
+        self,
+        name: str,
+        audit_flags: AuditFlag = AuditFlag.NONE,
+        cache_dir=None,
+        cache_locations=None,
+        input_spec: ty.Optional[ty.Union[SpecInfo, BaseSpec]] = None,
+        output_spec: ty.Optional[ty.Union[SpecInfo, BaseSpec]] = None,
+        cont_dim=None,
+        messenger_args=None,
+        messengers=None,
+        rerun=False,
+        **kwargs,
+    ):
+        """Initialize a DataladInterface instance."""
+
+        self.input_spec = input_spec or SpecInfo(
+            name="Inputs", fields=input_fields, bases=(BaseSpec,)
+        )
+        self.output_spec = output_spec or SpecInfo(
+            name="Output", fields=output_fields, bases=(BaseSpec,)
+        )
+        self.output_spec = output_from_inputfields(self.output_spec, self.input_spec)
+        super().__init__(
+            name=name,
+            inputs=kwargs,
+            audit_flags=audit_flags,
+            cache_dir=cache_dir,
+            cache_locations=cache_locations,
+            cont_dim=cont_dim,
+            messenger_args=messenger_args,
+            messengers=messengers,
+            rerun=rerun,
+        )
+
+    def _run_task(self):
+        in_file = self.inputs.in_file
+        dataset_path = self.inputs.dataset_path
+
+        try:
+            import datalad.api as dl
+        except ImportError as e:
+            # narrow except; the former bare `except:` + unused `_dl_found` flag
+            # swallowed unrelated errors
+            raise ImportError("Datalad is not installed.") from e
+
+        # checking if the dataset is already downloaded
+        if not (Path(dataset_path) / ".datalad").exists():
+            logger.info("Datalad interface without dataset path defined.")
+            try:
+                dataset_url = self.inputs.dataset_url
+                os.makedirs(dataset_path, exist_ok=True)
+                # bind the installed dataset: previously the return value was
+                # discarded and `ds` was unbound in this branch (NameError below)
+                ds = dl.install(source=dataset_url, path=dataset_path)
+            except Exception as e:
+                logger.error(e)
+                # without a dataset there is nothing to `get`; re-raise instead
+                # of falling through to an unbound `ds`
+                raise
+        else:
+            ds = dl.Dataset(self.inputs.dataset_path)
+
+        # getting the file
+        ds.get(self.inputs.in_file)
+
+        # checking if the file was downloaded
+        if not Path(dataset_path, in_file).exists():
+            raise FileNotFoundError(f"File {in_file} not found in {dataset_path}")
+
+        _pth = Path(in_file)
+        if not _pth.is_absolute():
+            # dataset_path may be a plain str (Directory input), so wrap in Path
+            _pth = Path(dataset_path) / _pth
+
+        # a dangling symlink means the annexed content is not yet fetched
+        _datalad_candidate = _pth.is_symlink() and not _pth.exists()
+        if not _datalad_candidate:
+            logger.warning("datalad was required but not found")
+
+        if _datalad_candidate:
+            try:
+                result = dl.get(_pth, dataset=dataset_path)
+            except Exception as exc:
+                logger.warning(f"datalad get on {_pth} failed.")
+                ## discussed with @djarecka, we keep it commented here for now
+                ## do we still need it for pydra?
+                # if (
+                #     config.environment.exec_env == "docker"
+                #     and ("This repository is not initialized for use by git-annex, "
+                #     "but .git/annex/objects/ exists") in f"{exc}"
+                # ):
+                #     logger.warning(
+                #         "Execution seems containerized with Docker, please make sure "
+                #         "you are not running as root. To do so, please add the argument "
+                #         "``-u $(id -u):$(id -g)`` to your command line."
+                #     )
+                # else:
+                #     logger.warning(str(exc))
+            else:
+                if result[0]["status"] == "error":
+                    logger.warning(f"datalad get failed: {result}")
+
+        self.output_ = None
+        # absolute path of the fetched file inside the dataset; always a str here,
+        # so the None/tuple branches below are defensive only
+        output = os.path.abspath(
+            os.path.join(self.inputs.dataset_path, self.inputs.in_file)
+        )
+        output_names = [el[0] for el in self.output_spec.fields]
+        if output is None:
+            self.output_ = {nm: None for nm in output_names}
+        elif len(output_names) == 1:
+            # if only one element in the fields, everything should be returned together
+            self.output_ = {output_names[0]: output}
+        elif isinstance(output, tuple) and len(output_names) == len(output):
+            self.output_ = dict(zip(output_names, output))
+        else:
+            raise RuntimeError(
+                f"expected {len(self.output_spec.fields)} elements, "
+                f"but {output} were returned"
+            )
+
+    def _list_outputs(self):
+        """Return the output dict with ``out_file`` resolved to an absolute path."""
+        outputs = self.output_spec().get()
+        outputs["out_file"] = os.path.abspath(
+            os.path.join(self.inputs.dataset_path, self.inputs.in_file)
+        )
+        return outputs
diff --git a/pydra/tasks/tests/__init__.py b/pydra/tasks/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/pydra/tasks/tests/test_datalad.py b/pydra/tasks/tests/test_datalad.py
new file mode 100644
index 0000000000..b63dd30fac
--- /dev/null
+++ b/pydra/tasks/tests/test_datalad.py
@@ -0,0 +1,52 @@
+import typing as ty
+from pathlib import Path
+import os
+import sys
+import attr
+import pytest
+
+
+from ...tasks.datalad import DataladInterface
+from ...engine.core import Workflow
+from ...engine.submitter import Submitter
+from ...engine.helpers import hash_value
+from ...engine.tests.utils import need_gitannex
+
+
+@need_gitannex
+def test_datalad_interface(tmpdir):
+    """
+    Testing datalad interface
+    """
+    import datalad.api as dl
+
+    # change PosixPath to str
+    tmpdir = str(tmpdir)
+    # creating a dataset
+    ds = dl.Dataset(tmpdir).create()
+    ds.save()
+    ds_path = ds.pathobj
+
+    # creating a file to download
+    file_path = ds_path / "file.txt"
+    file_path.write_text("test")
+    ds.save()
+
+    tmpdir = Path(tmpdir)
+
+    # install the dataset to a new location
+    ds2 = dl.install(source=tmpdir, path=tmpdir / "ds2")
+    ds2_path = ds2.pathobj
+
+    # use datalad interface to download the file
+    dl_interface = DataladInterface(
+        name="dl_interface", in_file="file.txt", dataset_path=ds2_path
+    )
+    # running the task
+    res = dl_interface()
+
+    assert os.path.exists(res.output.out_file)
+    assert os.path.basename(res.output.out_file) == "file.txt"
diff --git a/pyproject.toml b/pyproject.toml
index ce7eb465dc..29bd9475db 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,6 +75,7 @@ test = [
     "tornado",
     "boutiques",
     "pympler",
+    "datalad",
 ]
 # Aliases
 tests = ["pydra[test]"]