diff --git a/datalad_osf/create_sibling_osf.py b/datalad_osf/create_sibling_osf.py index a5776bc..7531c83 100644 --- a/datalad_osf/create_sibling_osf.py +++ b/datalad_osf/create_sibling_osf.py @@ -56,7 +56,7 @@ class CreateSiblingOSF(Interface): node will - in opposition to the 'annex' - be human-readable. - For authentification with OSF, you can define environment variables: Either + For authentication with OSF, you can define environment variables: Either 'OSF_TOKEN', or both 'OSF_USERNAME' and 'OSF_PASSWORD'. If neither of these is defined, the tool will fall back to the datalad credential manager and inquire for credentials interactively. @@ -144,7 +144,7 @@ class CreateSiblingOSF(Interface): args=("--description",), metavar="TEXT", doc="""Description of the OSF node that will be displayed on - the associated project page. By default a description will + the associated project page. By default a description will be generated based on the mode the sibling is put into.""", constraints=EnsureStr() | EnsureNone()), ) diff --git a/docs/source/_static/clueless.gif b/docs/source/_static/clueless.gif new file mode 100644 index 0000000..53ea6cc Binary files /dev/null and b/docs/source/_static/clueless.gif differ diff --git a/docs/source/_static/datastore_sibling.png b/docs/source/_static/datastore_sibling.png new file mode 100644 index 0000000..59f29da Binary files /dev/null and b/docs/source/_static/datastore_sibling.png differ diff --git a/docs/source/_static/git-annex-osf-logo.png b/docs/source/_static/git-annex-osf-logo.png new file mode 100644 index 0000000..e218de2 Binary files /dev/null and b/docs/source/_static/git-annex-osf-logo.png differ diff --git a/docs/source/_static/public_annex_sibling.png b/docs/source/_static/public_annex_sibling.png new file mode 100644 index 0000000..3082d52 Binary files /dev/null and b/docs/source/_static/public_annex_sibling.png differ diff --git a/docs/source/_static/public_exportonly_sibling.png 
b/docs/source/_static/public_exportonly_sibling.png new file mode 100644 index 0000000..db5fc62 Binary files /dev/null and b/docs/source/_static/public_exportonly_sibling.png differ diff --git a/docs/source/acknowledgements.rst b/docs/source/acknowledgements.rst new file mode 100644 index 0000000..8129f8d --- /dev/null +++ b/docs/source/acknowledgements.rst @@ -0,0 +1,23 @@ +Acknowledgments +=============== + +DataLad development is being performed as part of a US-German collaboration in +computational neuroscience (CRCNS) project "DataGit: converging catalogues, +warehouses, and deployment logistics into a federated 'data distribution'" +(Halchenko_/Hanke_), co-funded by the US National Science Foundation (`NSF 1912266`_) and the German Federal Ministry of Education and Research (BMBF 01GQ1905). +Additional support is provided by the German federal state of +Saxony-Anhalt and the European Regional Development +Fund (ERDF), Project: `Center for Behavioral Brain Sciences`_, Imaging Platform. + +DataLad is built atop the git-annex_ software that is being developed and +maintained by `Joey Hess`_. + +The extension was created during the OHBM Hackathon 2020 and wouldn't have been possible without a `dedicated team of volunteers `_. + +.. _Halchenko: http://haxbylab.dartmouth.edu/ppl/yarik.html +.. _Hanke: http://www.psychoinformatics.de +.. _NSF 1912266: http://www.nsf.gov/awardsearch/showAward?AWD_ID=1912266 +.. _BMBF 01GQ1411: http://www.gesundheitsforschung-bmbf.de/de/2550.php +.. _Center for Behavioral Brain Sciences: http://cbbs.eu/en/ +.. _git-annex: http://git-annex.branchable.com +.. 
_Joey Hess: https://joeyh.name diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 0000000..423fb5d --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,12 @@ +Python API +========== + +``datalad-osf`` has two main commands that are exposed as functions via ``datalad.api`` and as methods of the ``Dataset`` class: ``osf_credentials`` for OSF authentication management, and ``create_sibling_osf`` for interactions with the OSF. +Find out more about each command below. + +.. currentmodule:: datalad.api +.. autosummary:: + :toctree: generated + + osf_credentials + create_sibling_osf \ No newline at end of file diff --git a/docs/source/cloneosfdata.rst b/docs/source/cloneosfdata.rst deleted file mode 100644 index f7ac937..0000000 --- a/docs/source/cloneosfdata.rst +++ /dev/null @@ -1,2 +0,0 @@ -Cloning an OSF dataset to your local PC -*************************************** diff --git a/docs/source/cmd.rst b/docs/source/cmd.rst new file mode 100644 index 0000000..618b92a --- /dev/null +++ b/docs/source/cmd.rst @@ -0,0 +1,13 @@ +.. _cmd: + +Command line reference +====================== + +``datalad-osf`` has two main commands: ``datalad osf-credentials`` for OSF authentication management, and ``datalad create-sibling-osf`` for interactions with the OSF. +Find out more about each command below. + +.. toctree:: + :maxdepth: 1 + + generated/man/datalad-create-sibling-osf + generated/man/datalad-osf-credentials diff --git a/docs/source/conf.py b/docs/source/conf.py index 1350cd8..a97c12e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -95,8 +95,8 @@ master_doc = 'index' # General information about the project. 
-project = u'Datalad Extension Template' -copyright = u'2018-{}, DataLad team'.format(datetime.datetime.now().year) +project = u'Datalad-OSF' +copyright = u'2020-{}, DataLad team'.format(datetime.datetime.now().year) author = u'DataLad team' # The version info for the project you're documenting, acts as replacement for diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst new file mode 100644 index 0000000..921f08b --- /dev/null +++ b/docs/source/contributing.rst @@ -0,0 +1,7 @@ +.. _contribute: + +Contributing +============ + +If you have any questions, comments, bug fixes or improvement suggestions, feel free to contact us via our `Github page `_. +Before contributing, be sure to read the `contributing guidelines `_. diff --git a/docs/source/exportdatacode.rst b/docs/source/exportdatacode.rst deleted file mode 100644 index bbccb06..0000000 --- a/docs/source/exportdatacode.rst +++ /dev/null @@ -1,87 +0,0 @@ -.. include:: ./links.inc - -Export version-controlled data to OSF and code to GitHub -******************************************************** - -Imagine you are a PhD student and want to collaborate on a fun little side -project with a student at another institute. It is quite obvious for the two of -you that your code will be hosted on GitHub_. And you also know enough about -DataLad_, that using it for the whole project will be really beneficial. - -But what about the data you are collecting? -The Dropbox is already full (`DataLad third party providers `_). -And Amazon services don't seem to be your best alternative. -Suddenly you remember, that you got an OSF_ account recently, and that there is this nice `Datalad extension `_ to set up a `Special Remote`_ on OSF_. - -Walk through ------------- - -Installation -^^^^^^^^^^^^ -For installation checkout the installation page of the documentation. - -.. 
toctree:: - - settingup.rst - - -Creating an Example Dataset -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -As a very first step you want to set up a DataLad dataset. For this you should -run. In all examples a `$` in front indicates a new line in the Bash-Shell, -copying it will prevent your code from execution. - -.. code-block:: bash - - $ datalad create collab_osf - -After having created the dataset we want to populate it with some content (just like in the `DataLad Handbook`_). -Importantly we don't want to upload this file on GitHub, only on OSF - in the real world this could be your data that is too large to upload to GitHub. - -.. code-block:: bash - - $ cd collab_osf - $ datalad download-url http://www.tldp.org/LDP/Bash-Beginners-Guide/Bash-Beginners-Guide.pdf \ - --dataset . \ - -m "add beginners guide on bash" \ - -O books/bash_guide.pdf - -And we also want to add a text file, which will be saved on GitHub_ - in your case this could be the code you are using. - -.. code-block:: bash - - $ mkdir code - $ cd code - $ echo "This is just an example file just to show the different ways of saving data in a DataLad dataset." > example.txt - $ datalad save --to-git -m "created an example.txt" - -We now have a dataset with one file that can be worked on using GitHub and one that should be tracked using `git-annex`. - -Setting up the OSF Remote -^^^^^^^^^^^^^^^^^^^^^^^^^ - -To use OSF as a storage, you need to provide either your OSF credentials or an OSF access token. -You can create such a token in your account settings (`Personal access token` and then `Create token`), make sure to create a `full_write` token to be able to create OSF projects and upload data to OSF. - -.. code-block:: bash - - $ export OSF_TOKEN=YOUR_TOKEN_FROM_OSF.IO - -We are now going to use datalad to create a sibling dataset on OSF with name `osf` - this will create a new OSF project entitle `OSF_PROJECT_NAME` on the OSF account associated with the OSF token in `$OSF_TOKEN`. - -.. 
code-block:: bash - - $ datalad create-sibling-osf -s osf --title OSF_PROJECT_NAME - -Setting up GitHub Remote -^^^^^^^^^^^^^^^^^^^^^^^^ - -We can set-up a GitHub Remote with name `github` and include a publish dependency with OSF - that way, when we publish our dataset to GitHub, the data files get automatically uploaded to OSF. - -.. code-block:: bash - - $ datalad create-sibling-github REPRONAME -s github --github-login GITHUB_NAME --publish-depends osf - $ datalad publish . --to github --transfer-data all - -This will publish example.txt in code/ to GitHub and only add the folder structure and symbolic links for all other file; at the same time it will upload the data to OSF - this way you can let OSF handle your data and GitHub your code. diff --git a/docs/source/exporthumandata.rst b/docs/source/exporthumandata.rst deleted file mode 100644 index 9ea537e..0000000 --- a/docs/source/exporthumandata.rst +++ /dev/null @@ -1,82 +0,0 @@ -.. include:: ./links.inc - -Export a human-readable dataset to OSF -************************************** - -Imagine you have been creating a reproducible workflow using DataLad_ from the -get go. Everything is finished now, code, data, and paper are ready. Last thing -to do: Publish your data. - -Using datalad-osf makes this really convenient. - -Walk through ------------- - -Installation -^^^^^^^^^^^^ -For installation checkout the installation page of the documentation. - -.. toctree:: - - settingup.rst - - -Creating an Example Dataset -^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We will create a small example DataLad dataset to show the functionality. - -.. code-block:: bash - - $ datalad create collab_osf - # collab_osf being the name of your new dataset - # In all examples a `$` in front indicates a new line in the Bash-Shell - # Copying the $ will prevent your code from execution. - -After having created the dataset we want to populate it with some content (just -like in the `Datalad Handbook`_): - -.. 
code-block:: bash - - $ cd collab_osf - $ datalad download-url http://www.tldp.org/LDP/Bash-Beginners-Guide/Bash-Beginners-Guide.pdf \ - --dataset . \ - -m "add beginners guide on bash" \ - -O books/bash_guide.pdf - -Setting up the OSF Remote -^^^^^^^^^^^^^^^^^^^^^^^^^ - -To use OSF as a storage, you first need to provide either your OSF credentials (username and password) or an OSF access token. - -If you choose to use your credentials, proceed as follows: - -.. code-block:: bash - - export OSF_USERNAME=YOUR_USERNAME_FOR_OSF.IO - export OSF_PASSWORD=YOUR_PASSWORD_FOR_OSF.IO - -In this example, we are going to use an OSF access token instead. -You can create such a token in your account settings (`Personal access token` and then `Create token`). -Make sure to create a `full_write` token to be able to create OSF projects and upload data to OSF. - -.. code-block:: bash - - export OSF_TOKEN=YOUR_TOKEN_FROM_OSF.IO - -We are now going to use datalad to create a sibling dataset on OSF with name `OSF_PROJECT_NAME`. -This will create a new project entitled `OSF_PROJECT_NAME` on the OSF account associated with the OSF token in `$OSF_TOKEN`. - -Note that the ``-s NAME_OF_REMOTE>`` flag is used to specify how ``git`` internally refers to your OSF project with the name `OSF_PROJECT_NAME`. -It would be completely fine to use `OSF_PROJECT_NAME` also as a value for the ``-s`` flag. - -You can later on list your remotes from the command line using the ``git remote -v`` command. - -.. code-block:: bash - - $ datalad create-sibling-osf -s NAME_OF_REMOTE --title OSF_PROJECT_NAME --mode export - -After that we can export the current state (the `HEAD`) of our dataset in human readable form to OSF: - -.. code-block:: bash - - git annex export HEAD --to NAME_OF_REMOTE diff --git a/docs/source/git-annex-utils.rst b/docs/source/git-annex-utils.rst new file mode 100644 index 0000000..38144d7 --- /dev/null +++ b/docs/source/git-annex-utils.rst @@ -0,0 +1,13 @@ +.. 
include:: ./links.inc + +git-annex utilities +=================== + +``datalad-osf`` comes with a git-annex special remote implementation for the OSF. +Find out more by clicking on the module name below. + +.. currentmodule:: datalad_osf +.. autosummary:: + :toctree: generated + + annex_remote.OSFSpecialRemote diff --git a/docs/source/index.rst b/docs/source/index.rst index 58770ca..e1d8424 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,69 +1,39 @@ .. include:: ./links.inc -DataLad extension to interface with OSF -*************************************** +DataLad-OSF: Opening up the Open Science Framework for DataLad +-------------------------------------------------------------- -This extension enables DataLad_ to work with the Open Science Framework (OSF_). -Use it to publish your dataset's data to an OSF project to utilize the OSF for dataset data storage and easy dataset sharing. +This `DataLad extension `_ enables DataLad_ to work with the Open Science Framework (OSF_). +Use it to publish your dataset's data to an OSF project to utilize the OSF for dataset data storage and share and collaborate on datasets. -The extension was created during the OHBM Hackathon 2020. +.. figure:: _static/git-annex-osf-logo.png -If you have any questions, comments, bug fixes or improvement suggestions, feel free to contact us via our `Github page `_. -Before contributing, be sure to read the `contributing guidelines `_. +The extension was created during the `OHBM Hackathon 2020 `_ and wouldn't have been possible without a `dedicated team of volunteers `_. +If you want to get in touch or on board as well, please see our :ref:`contributing guidelines `. -.. toctree:: - -Documentation -============= +Documentation overview +^^^^^^^^^^^^^^^^^^^^^^ .. toctree:: :maxdepth: 2 intro settingup - exporthumandata - exportdatacode - cloneosfdata - - -API -=== - -High-level API commands ------------------------ - -.. currentmodule:: datalad.api -.. 
autosummary:: - :toctree: generated - - create_sibling_osf - osf_credentials - - -Command line reference ----------------------- + tutorial + contributing + acknowledgements .. toctree:: :maxdepth: 1 - generated/man/datalad-create-sibling-osf - generated/man/datalad-osf-credentials - - - -Git-annex utilities -------------------- - -.. currentmodule:: datalad_osf -.. autosummary:: - :toctree: generated - - annex_remote.OSFSpecialRemote + cmd + api + git-annex-utils Indices and tables -================== +^^^^^^^^^^^^^^^^^^ * :ref:`genindex` * :ref:`modindex` diff --git a/docs/source/intro.rst b/docs/source/intro.rst index a217121..eb55db5 100644 --- a/docs/source/intro.rst +++ b/docs/source/intro.rst @@ -1,24 +1,66 @@ .. include:: ./links.inc +.. _intro: Introduction ------------- +============ + +The Open Science Framework +-------------------------- + +The Open Science Framework (OSF_), developed and maintained by the Center for Open Science (COS), is a tool that promotes open, centralized workflows by enabling capture of different aspects and products of the research life-cycle, including developing a research idea, designing a study, storing and analyzing collected data, and writing and publishing reports or papers. +In the scientific community, it is commonly used for registered reports, as a preprint server, and for study archival and data sharing. +In order to use the OSF, a free registration is required. + +The core functionality of the OSF_ is its ability to create and develop *projects*, with a project being a private or public workspace for collaboration, data sharing, or data archival. +Projects have an associate storage (either via OSF storage or third party providers) for (large) data, and can have *components*, associated sub-projects. +Each OSF user, project, component, and file is given a unique, persistent uniform resource locator (URL) to enable sharing and promote attribution. 
+Projects can also be assigned digital object identifiers (DOIs) and archival resource keys (ARKs) if they are made publicly available. + +At the moment, the OSF storage provides virtually unlimited storage capacity. +As long as individual files are smaller than 5GB, any amount of data can be uploaded to the OSF. +This makes the OSF_ a powerful, accessible, and free data sharing and collaboration platform for researchers. + Goal of the extension -^^^^^^^^^^^^^^^^^^^^^ +--------------------- -This extension aims to allow DataLad_ to work with the Open Science Framework (OSF_). -This is done by transforming storage on the Open Science Framework (OSF) into a `git-annex`_ repository. +This extension allows DataLad_ to work with the Open Science Framework (OSF_) to make sharing and collaboration on data or DataLad datasets even easier. +It comes with several features that enable the following main use cases: + +#. Export existing datasets to the OSF +#. Clone published datasets from the OSF +#. Utilize OSF projects as a third party storage provider for annexed data +#. Export single-view snapshots of datasets to an OSF project + +To enable these use cases, a dataset is published as an OSF project, and its OSF storage is used as a `git-annex`_ `special remote`_ to publish (large) file contents. +Major OSF flexibility is exposed to control whether the resulting project is private (default) or public, and to attach meaningful metadata to it. +You can find demonstrations of these use cases in the :ref:`Tutorial`. What can I use this extension for? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +---------------------------------- + +You can use this extension to publish, store, collaborate on, or share your dataset (data) via the OSF_. +Here is some inspiration on what you could do: + +- Publish your study (including its version history, data, code, results, and provenance) as a DataLad dataset to the OSF. 
+ Share the project's OSF URL with colleagues and collaborators to give them easy access to your work with a single ``datalad clone``. +- Clone a friend's dataset -- from the OSF! +- Use the OSF as a `special remote`_ to store data in the annex of your dataset. + With this, you can publish a dataset to `GitHub`_ or similar Git repository hosting services, and have your data published to the OSF (via a publication dependency). + Your dataset will be exposed and available on GitHub, while data is stored on and retrieved from the OSF. +- Take a version snapshot of all of your dataset's files and export them to the OSF. + This publishes one version of your project in a human-readable fashion to the OSF to make it available to the outside world. + +``datalad-osf`` comes with a range of hidden convenience functions for OSF interactions. +Importantly, you will not need to create OSF projects via the OSF web interface -- given appropriate credentials, ``datalad create-sibling-osf`` will create new projects under your user account and report back the generated URL. -You can use this extension to use the OSF as a special remote to store data in the annex of a dataset. -With this, you can `datalad publish` a dataset to GitHub or similar services and the data to the OSF (via a publication dependency). -The extension is most beneficial for easy access to data stored on OSF via GitHub. -If you are sharing your dataset via OSF and code via GitHub, this will allow smooth integration of both along with unified version management provided by DataLad. What can I **not** use this extension for? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------------------------ -This tool does not work for data that is stored in a storage service other than OSF. -Please refer to the list of `special remotes`_ as hosted by the git-annex website for other storage services. 
+- This tool does not work for data that is stored in a storage service other than the OSF_, and within the OSF, only OSF storage, no other third party storage, is supported. + Please refer to the list of `special remotes`_ as hosted by the `git-annex`_ website for other storage services and how to use them with DataLad. +- Also, be mindful that OSF storage imposes a maximum file size of 5GB. + Individual files larger than 5GB can not be published with this extension. +- Finally, the starting point for working with this extension is a (published) DataLad dataset, not a regular OSF project. + This extension will not transform normal OSF projects into datasets, but expose DataLad datasets as OSF projects. \ No newline at end of file diff --git a/docs/source/links.inc b/docs/source/links.inc index 504f0cd..e102b19 100644 --- a/docs/source/links.inc +++ b/docs/source/links.inc @@ -15,6 +15,7 @@ .. _git-annex: git-annex.branchable.com/ .. _git: git-scm.com/ .. _OSF: https://www.osf.io/ +.. _pip: https://pypi.org/project/pip/ .. _Python: https://www.python.org/ .. _Special Remote: https://git-annex.branchable.com/special_remotes/ .. _Special Remotes: https://git-annex.branchable.com/special_remotes/ diff --git a/docs/source/settingup.rst b/docs/source/settingup.rst index ad6901b..84fcb68 100644 --- a/docs/source/settingup.rst +++ b/docs/source/settingup.rst @@ -1,29 +1,91 @@ .. include:: ./links.inc -Setting up +.. _install: + +Quickstart ========== Requirements ------------- +^^^^^^^^^^^^ -- DataLad +DataLad and ``datalad-osf`` are available for all major operating systems (Linux, MacOS, Windows 10 [#f1]_). +The relevant requirements are listed below. -Before being able to use the extension, you need to have DataLad installed, which relies on `git-annex`_, `git`_ and `Python`_. -If you don't have DataLad installed yet, please follow the instructions from `the datalad handbook `_. + An OSF_ account + You need an OSF account to be able to interact with it. 
If you don't have an account yet, `register here `_ -- its free! -- An account on the OSF + DataLad + If you don't have DataLad_ and its underlying tools (`git`_, `git-annex`_) installed yet, please follow the instructions from `the datalad handbook `_. -You need an OSF account to be able to interact with it. If you don't have an account yet, `register here `_. + [optional] An account on a Git repository hosting site + You should consider having an account on one or more repository hosting sites such as `GitHub `__ , `GitLab `_, `Bitbucket `_ or similar. -- An account on a git repository hosting site +Installation +^^^^^^^^^^^^ -You should consider having an account on one or more repository hosting sites such as `GitHub `__ , `GitLab `_, `Bitbucket `_ or similar" +``datalad-osf`` is a Python package available on `pypi `_ and installable via pip_. -Installation ------------- +.. code-block:: bash + + # create and enter a new virtual environment (optional) + $ virtualenv --python=python3 ~/env/dl-osf + $ . ~/env/dl-osf/bin/activate + # install from PyPi + $ pip install datalad-osf. + +Getting started +^^^^^^^^^^^^^^^ + +Here's the gist of some of this extension's functionality. +Checkout the :ref:`Tutorial` for more detailed demonstrations. + +First, :ref:`provide your credentials `: + +.. code-block:: bash + + # provide your OSF credentials, ideally as a token: + $ datalad osf-credentials + You need to authenticate with 'https://osf.io' credentials. https://osf.io/settings/tokens provides information on how to gain access + token: + You need to authenticate with 'https://osf.io' credentials. https://osf.io/settings/tokens provides information on how to gain access + token (repeat): + osf_credentials(ok): [authenticated as ] + +Next, create a sibling on the OSF for a DataLad dataset of your choice. 
+Choose between different sibling modes to adjust how much of your dataset can be published and how it will be displayed, adjust whether your project should be private or public, attach additional meta data, or configure local sibling properties. +The minimal example below will create a new (private) project with minimal metadata on the OSF and apply the necessary configurations to publish your complete dataset to it. + +.. code-block:: bash + + # inside of a DataLad dataset + $ datalad create-sibling-osf --title best-study-ever -s osf + create-sibling-osf(ok): https://osf.io/czgpf/ + [INFO ] Configure additional publication dependency on "osf-storage" + configure-sibling(ok): /home/me/mydataset (sibling) + +Afterwards, publish your dataset to the OSF sibling project to share it or collaborate with others: + +.. code-block:: bash + + $ datalad push --to osf + +Finally, you or others can clone it using its project ID. +All annexed data in this dataset will be available via ``datalad get``. + +.. code-block:: bash + + $ datalad clone osf://czgpf/ + +Curious to find out more? +Read on in the :ref:`tutorial` for more functionality and use cases. + + +.. admonition:: HELP! I'm new to this! + + If this is your reaction to reading the words DataLad dataset, sibling, or dataset publishing, please head over to the `DataLad Handbook`_ for an introduction to DataLad. -Before you can start using the extension, you have to install it. + .. image:: ./_static/clueless.gif -``datalad-osf`` is a package on `pypi `_, so you can open your shell and type: ``pip install datalad-osf``. +.. rubric:: Footnotes -If you want to use the most recent development version, use the following command instead: ``pip install -e git+https://github.com/datalad/datalad-osf#egg=datalad-osf`` +.. [#f1] While installable for Windows 10, the extension may not be able to perform all functionality documented here. Please get in touch if you are familiar with Windows `to help us fix bugs `_. 
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst new file mode 100644 index 0000000..35083e9 --- /dev/null +++ b/docs/source/tutorial.rst @@ -0,0 +1,14 @@ +.. _tutorial: + +Tutorial +======== + +.. toctree:: + :maxdepth: 2 + + tutorial/authentication + tutorial/osf-sibling + tutorial/codealong + tutorial/cloneosfdata + tutorial/exporthumandata + tutorial/exportdatacode diff --git a/docs/source/tutorial/authentication.rst b/docs/source/tutorial/authentication.rst new file mode 100644 index 0000000..b128d4e --- /dev/null +++ b/docs/source/tutorial/authentication.rst @@ -0,0 +1,86 @@ +.. include:: ../links.inc +.. _authenticate: + +Step 1: Authentication +====================== + +``datalad-osf`` needs to communicate with the OSF to create and modify projects under an associated user account. +To enable this, the associated user needs to be authenticated using the :command:`osf-credentials` command. +Therefore, as the very first step, ``datalad osf-credentials`` needs to be run to authenticate a user. +Unless credentials expire or change, this command needs to be run only once per user and system. + +Setting credentials +^^^^^^^^^^^^^^^^^^^ + +To set credentials, run ``datalad osf-credentials`` anywhere on your system. +This command prompts for user credentials and stores them in your system's secure credential store for OSF operations. + +.. code-block:: bash + + # the default authentication method is token + $ datalad osf-credentials + You need to authenticate with 'https://osf.io' credentials. https://osf.io/settings/tokens provides information on how to gain access + token: + You need to authenticate with 'https://osf.io' credentials. https://osf.io/settings/tokens provides information on how to gain access + token (repeat): + osf_credentials(ok): [authenticated as ] + +Two different methods of authentication are supported and can be set with the ``--method`` flag: + +- ``token``: A personal access token. 
This is the recommended authentication type and default. + Generate a personal access token under your user account at `osf.io/settings/tokens `_. Make sure to create a ``full_write`` token to be able to create OSF projects and upload data to OSF! +- ``userpassword``: Your username and password combination from the OSF_ web interface. + +.. code-block:: bash + + # authenticate with user name and password + $ datalad osf-credentials --method userpassword + You need to authenticate with 'https://osf.io' credentials. https://osf.io/settings/account provides information on how to gain access + user: + + password: + password (repeat): + osf_credentials(ok): [authenticated as ] + +The credentials are stored within a system's `encrypted keyring `_ and DataLad_ retrieves them automatically for all future interactions with the OSF. +Information on which user's credentials are stored can be found by re-running ``datalad osf-credentials``. + +.. code-block:: bash + + $ datalad osf-credentials + osf_credentials(ok): [authenticated as ] + +.. admonition:: Environment variables + + Alternatively, credentials can be set via environment variables: + ``OSF_TOKEN``, or both ``OSF_USERNAME`` and ``OSF_PASSWORD``, as in + + .. code-block:: bash + + export OSF_TOKEN=YOUR_TOKEN_FROM_OSF + + +Resetting credentials +^^^^^^^^^^^^^^^^^^^^^ + +If credentials change they can be re-set using the ``--reset`` flag: + +.. code-block:: bash + + # token method is used by default, use --method userpassword for user + password credentials + $ datalad osf-credentials --reset + You need to authenticate with 'https://osf.io' credentials. https://osf.io/settings/tokens provides information on how to gain access + token: + You need to authenticate with 'https://osf.io' credentials. 
https://osf.io/settings/tokens provides information on how to gain access + token (repeat): + osf_credentials(ok): [authenticated as ] + + +Invalid credentials +^^^^^^^^^^^^^^^^^^^ + +If you supply invalid credentials such as a mismatching user name and password combination or a wrong token, you will see the following error:: + + $ osf_credentials(error): None [Invalid credentials] + +Please check for spelling mistakes, check your user name and password combination under your user account, or regenerate a token, and reset your credentials to fix this. diff --git a/docs/source/tutorial/cloneosfdata.rst b/docs/source/tutorial/cloneosfdata.rst new file mode 100644 index 0000000..a0efdcd --- /dev/null +++ b/docs/source/tutorial/cloneosfdata.rst @@ -0,0 +1,60 @@ +.. include:: ../links.inc + +Use case 1: Publishing and cloning datasets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. admonition:: Problem statement + + Imagine you have been creating a reproducible workflow using DataLad_ from the + get go. Everything is finished now, code, data, and paper are ready. Last thing + to do: Publish your data, code, results, and workflows -- ideally, all together, easily accessible, and also fast. + + The solution: Publish the complete dataset to the OSF and let others clone the project to get access to data, code, version history, and workflows. + Therefore, you decide on the ``annex`` sibling mode. + +Creating the OSF sibling +"""""""""""""""""""""""" + +Given OSF credentials are set, we can create a sibling in ``annex`` mode. +We will also make the project public (``--public``), and attach some meta data (``--category``, ``--tag``) to it. + +The code below will create a new public OSF project called ``best-study-ever``, a dataset sibling called ``osf-annex``, and a readily configured storage sibling ``osf-annex-storage``. +The project on the OSF will have a description with details on how to clone it and some meta data. + +.. 
code-block:: bash + + # inside of the tutorial DataLad dataset + $ datalad create-sibling-osf --title best-study-ever \ + -s osf-annex \ + --category data \ + --tag reproducibility \ + --public + + create-sibling-osf(ok): https://osf.io// + [INFO ] Configure additional publication dependency on "osf-annex-storage" + configure-sibling(ok): /tmp/collab_osf (sibling) + +Publishing the dataset +"""""""""""""""""""""" + +Afterwards, all that's left to do is a ``datalad push`` to publish the dataset to the OSF. + +.. code-block:: bash + + $ datalad push --to osf-annex + +The resulting dataset has all data and its Git history, but is not as human-readable as on a local computer: + +.. image:: ../_static/public_annex_sibling.png + +Cloning the dataset +""""""""""""""""""" + +The dataset can be cloned with an ``osf://`` URL, where ID is the project ID assigned at project creation: + +.. code-block:: bash + + $ datalad clone osf://n6bgd/ best-study-ever + install(ok): /tmp/best-study-ever (dataset) + +All data can subsequently be obtained using ``datalad get``. \ No newline at end of file diff --git a/docs/source/tutorial/codealong.rst b/docs/source/tutorial/codealong.rst new file mode 100644 index 0000000..38a4370 --- /dev/null +++ b/docs/source/tutorial/codealong.rst @@ -0,0 +1,70 @@ +.. include:: ../links.inc + +.. _codealong: + +Walk-Trough +^^^^^^^^^^^ + +The upcoming use cases are walk-throughs and meant as code-along tutorials. +If you want, open a terminal and code along to try out each method. +If you have DataLad and ``datalad-osf`` installed, each tutorial will not take more than 5 minutes. + +As a general preparation, build an example dataset and configure OSF credentials for reuse in all usecases. +You can execute all following examples in this dataset. + +Create an Example Dataset +""""""""""""""""""""""""" + +For the sake of this tutorial, let's create a small example DataLad dataset. + +.. 
code-block:: bash + + # collab_osf being the name of your new dataset + $ datalad create collab_osf + + +After dataset creation, populate it with some content (just like in the `Datalad Handbook`_): + +.. code-block:: bash + + $ cd collab_osf + # add a PDF file to git-annex + $ datalad download-url http://www.tldp.org/LDP/Bash-Beginners-Guide/Bash-Beginners-Guide.pdf \ + --dataset . \ + -m "add beginners guide on bash" \ + -O books/bash_guide.pdf + + download_url(ok): /tmp/collab_osf/books/bash_guide.pdf (file) + add(ok): /tmp/collab_osf/books/bash_guide.pdf (file) + save(ok): /tmp/collab_osf dataset) + action summary: + add (ok: 1) + download_url (ok: 1) + save (ok: 1) + # add a text file to Git + $ mkdir code + $ echo "This is just an example file just to show the different ways of saving data in a DataLad dataset." > code/example.txt + $ datalad save --to-git -m "created an example.txt" + add(ok): /tmp/collab_osf/code/example.txt (file) + save(ok): /tmp/collab_osf(dataset) + action summary: + add (ok: 1) + save (ok: 1) + + +Authenticate +"""""""""""" + +First, if you haven't done so yet, configure either your OSF credentials (username and password) or an OSF access token, either as environment variables, or using ``datalad osf-credentials``. +Below, we use an OSF access token: + +.. code-block:: bash + + $ datalad osf-credentials + You need to authenticate with 'https://osf.io' credentials. https://osf.io/settings/tokens provides information on how to gain access + token: + You need to authenticate with 'https://osf.io' credentials. https://osf.io/settings/tokens provides information on how to gain access + token (repeat): + osf_credentials(ok): [authenticated as ] + +More information on authentication is detailed in the section :ref:`authenticate`. 
diff --git a/docs/source/tutorial/exportdatacode.rst b/docs/source/tutorial/exportdatacode.rst new file mode 100644 index 0000000..b85542a --- /dev/null +++ b/docs/source/tutorial/exportdatacode.rst @@ -0,0 +1,85 @@ +.. include:: ../links.inc + +Use case 3: Using the OSF as a data store for a GitHub-based project +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. admonition:: Problem statement + + Imagine you are a PhD student and want to collaborate on a fun little side project with a student at another institute. + You agree that your code will be hosted on GitHub_ due to its easier accessibility and greater feature selection. + But what about the data you are collecting? + The Dropbox is already full (`DataLad third party providers `_). + And Amazon services don't seem to be your best alternative. + Suddenly you remember, that you got an OSF_ account recently: You decide to publish your dataset to GitHub_ and your data to the OSF_ and link both via a publication dependency. + + Therefore, you go with a sibling in ``annex`` mode. + While others *can* clone it from the OSF, you mostly utilize it for data access. Others clone the dataset from GitHub and are unaware where your data is stored. + + +Creating the OSF sibling +"""""""""""""""""""""""" + +Given OSF credentials are set, we can create a sibling in ``annex`` mode. + +As in use case 1, the code below will create a new public OSF project called ``our-study-data``, a dataset sibling called ``osf-annex2``, and a readily configured storage sibling ``osf-annex2-storage``. + +.. 
code-block:: bash + + # inside of the tutorial DataLad dataset + $ datalad create-sibling-osf --title our-study-data \ + -s osf-annex2 \ + --category data \ + --tag reproducibility \ + --public + + create-sibling-osf(ok): https://osf.io// + [INFO ] Configure additional publication dependency on "osf-annex2-storage" + configure-sibling(ok): /tmp/collab_osf (sibling) + +Creating a sibling on GitHub +"""""""""""""""""""""""""""" + +As the goal is to use OSF for data storage and expose the dataset also via GitHub, we're not done yet. +We can set-up a GitHub Remote with name ``github`` and include a publication dependency to the OSF storage sibling -- that way, when we publish our dataset to GitHub, the data files get automatically uploaded to OSF. + +.. code-block:: bash + + $ datalad create-sibling-github our-study-data \ + -s github \ + --github-login LOGIN \ + --publish-depends osf-annex2-storage + + You need to authenticate with '@github' credentials. https://github.com/login provides information on how to gain access + password: + You need to authenticate with '@github' credentials. https://github.com/login provides information on how to gain access + password (repeat): + [INFO ] Configure additional publication dependency on "osf-annex2-storage" + .: github(-) [https://@github.com//our-study-data.git (git)] + 'https://@github.com//our-study-data.git' configured as sibling 'github' for Dataset(/tmp/collab_osf) + +Publish the dataset to GitHub and its data to OSF +""""""""""""""""""""""""""""""""""""""""""""""""" + +Because a publication dependency to the OSF is set up, a ``datalad push`` to GitHub is sufficient. + +.. code-block:: bash + + $ datalad push --to github + Push to 'github': [...] | 1.00/4.00 [00:00<00:00, 25.9k Steps/s] + Password for 'https://adswa@github.com': + copy(ok): /tmp/collab_osf/books/bash_guide.pdf (file) [to osf-annex2-storage...] + Push to 'github': [...] | 1.00/4.00 [00:33<01:41, 33.9s/ Steps] + Update availability for 'github': [...] 
| 3.00/4.00 [00:00<00:00, 60.5k Steps/s] + Password for 'https://adswa@github.com': + publish(ok): /tmp/collab_osf (dataset) [refs/heads/master->github:refs/heads/master [new branch]] + Update availability for 'github': [...] | 3.00/4.00 [00:15<00:05, 5.27s/ Steps] + Publish(ok): /tmp/collab_osf (dataset) [refs/heads/git-annex->github:refs/heads/git-annex [new branch]] + Update availability for 'github': [...] | 3.00/4.00 [00:15<00:05, 5.27s/ Steps] + +Afterwards, the dataset can be cloned from GitHub. +For a user, the experience will feel similar to use case 1: After cloning, the files in Git and all dataset history are available, all data stored in the annex is retrieved upon ``datalad get``. +The file content, though, will be retrieved from the OSF, which now serves as a data store for the GitHub repository. + +.. image:: ../_static/datastore_sibling.png + +This way you can let OSF handle your data, but still use GitHub to expose your dataset. diff --git a/docs/source/tutorial/exporthumandata.rst b/docs/source/tutorial/exporthumandata.rst new file mode 100644 index 0000000..2aef2b0 --- /dev/null +++ b/docs/source/tutorial/exporthumandata.rst @@ -0,0 +1,72 @@ +.. include:: ../links.inc +.. _export: + +Use case 2: Export a human-readable dataset to OSF +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + + +.. admonition:: Problem statement + + Imagine you have been collecting data and want to share it with others. + All your colleagues have only sparse experience with the command line, but are frequent OSF users. + Therefore, you place all data in a dataset and export the latest version of your data to the OSF in a human readable way, for others to view it conveniently in the web interface. + + As you want a human-readable representation and decide to not share the version history, but only the most recent state of the data, you pick the ``exportonly`` sibling mode. 
+ + +Creating the OSF sibling +"""""""""""""""""""""""" + +Given OSF credentials are set, we can create a sibling in ``export-only`` mode. +We will also make the project public (``--public``), and attach a custom description (``--description``) to it. + +The code below will create a new public OSF project called ``best-data-ever``, a dataset sibling called ``osf-exportonly-storage``. This sibling is a sole storage sibling -- in technical terms, a git-annex_ `special remote`_ -- and can only be used with pure git-annex commands. + +.. code-block:: bash + + # inside of the tutorial DataLad dataset + $ datalad create-sibling-osf --title best-data-ever \ + --mode exportonly \ + -s osf-export \ + --description "This carefully acquired data will bring science forward" \ + --public + + create-sibling-osf(ok): https://osf.io// + # note that the sibling name as an appended "storage" suffix! + $ datalad siblings + .: here(+) [git] + .: osf-annex(-) [osf://n6bgd (git)] + .: osf-export-storage(+) [osf] # created in this example + .: osf-annex-storage(+) [osf] + + +Publishing the dataset +"""""""""""""""""""""" + +In ``exportonly`` mode, the only way to publish data is via ``git annex export`` (more on this command `here `_). +This command is a pure git-annex_ command and can export a *tree* (a branch, tag, or any other tree-ish accepted by Git including e.g., ``master:subdir``) to the OSF without obscuring filenames. + +To export the current state (the ``HEAD``) of the dataset with this command we run: + +.. code-block:: bash + + $ git-annex export HEAD --to osf-export-storage + export osf-export-storage .datalad/.gitattributes + ok + export osf-export-storage .datalad/config + ok + export osf-export-storage .gitattributes + ok + export osf-export-storage books/bash_guide.pdf + ok + export osf-export-storage code/example.txt + ok + (recording state in git...) 
+ +The resulting project has a human readable structure, and all its data can be viewed and downloaded via the OSF interface. +It is not possible to clone this dataset with DataLad, however potential users can still download it from the standard OSF interface. + +.. image:: ../_static/public_exportonly_sibling.png + + diff --git a/docs/source/tutorial/osf-sibling.rst b/docs/source/tutorial/osf-sibling.rst new file mode 100644 index 0000000..9f5fd0e --- /dev/null +++ b/docs/source/tutorial/osf-sibling.rst @@ -0,0 +1,98 @@ +.. include:: ../links.inc +.. _osfsibling: + +Step 2: Create an OSF sibling +============================= + +Once authenticated, DataLad can -- if called from within a DataLad dataset -- create and modify projects on the OSF and publish annexed data, a single version view, or the complete dataset to it. +The command that enables this is :command:`datalad create-sibling-osf`. +It supports different modes, exposes a large number of features from the OSF web interface and yields a custom dataset configuration for your use case at hand. +This section introduces the command and its functionality, and the upcoming use cases demonstrate several workflow types it can be used for. + +What's a sibling? +^^^^^^^^^^^^^^^^^ + +A sibling is a dataset clone that a given DataLad dataset knows about. +In most cases, changes can be retrieved and pushed between a dataset and its sibling. +It is the equivalent of a *remote* in Git. +The :command:`datalad create-sibling-osf` command can create a dataset clone under an authenticated user account on the OSF as a new project. + +General command +^^^^^^^^^^^^^^^ + +When relying on default parameters, ``create-sibling-osf`` requires only a project name for the resulting OSF project (``--title``) and a sibling name (``-s/--name``, which defaults to ``osf``). + +.. 
code-block:: bash + + # within a dataset mydataset + $ datalad create-sibling-osf --title -s + create-sibling-osf(ok): https://osf.io/czgpf/ + [INFO ] Configure additional publication dependency on "-storage" + configure-sibling(ok): (sibling) + +In the default modes of operation and in most other modes, this command will create one project on the OSF (reported under its URL in the command summary) and two dataset siblings: One sibling to publish Git history and files stored in Git to, and a *storage* sibling to which annexed data will be published. + + +.. admonition:: dataset publishing + + Note that data still needs to be *published* to be available on the OSF after sibling creation. + The relevant commands for this are dependent of the sibling *mode*. + If used in the default mode, both siblings will be automatically configured such that a single ``datalad push`` is sufficient to publish the complete dataset. + For general information on publishing datasets, please refer to `this Handbook chapter `_, and for more information on publishing depending on sibling mode, please pay close attention to the paragraph :ref:`siblingmode` and the upcoming use cases. + +Sibling configuration +^^^^^^^^^^^^^^^^^^^^^ + +``create-sibling-osf`` has a number of parameters that expose the underlying flexibility of DataLad, git-annex, and the OSF. +The most important one is the *mode* (``--mode``) parameter. +Depending on the mode for your OSF sibling, you will be able to publish different aspects of your dataset to the OSF, and each mode requires different commands for publishing. +Other important parameters include the ``--public`` flag (for access control), and ``--tag``, ``--category`` and ``--description`` for additional project meta data. +For a complete overview of all parameters, see the :ref:`command documentation `. + +.. 
_siblingmode: + +Sibling modes +""""""""""""" + +``create-sibling-osf`` supports several modes that determine the functionality and usage of the resulting sibling. + +- ``annex`` (default): **You can publish the complete dataset to the resulting OSF project**. This includes all Git history and annexed data. Afterwards, the OSF project URL can be cloned to retrieve the dataset, and ``datalad get`` will be able retrieve all file contents, even older versions. This mode is the most convenient if you aim to share complete datasets with all data and version history. Note that the dataset representation in the OSF project is not as readable as in a local dataset (clone), but a non-human readable representation [#f1]_ tuned to enable cloning. Publishing the dataset requires only ``datalad push``. +- ``export``: **You can push the Git history of a dataset as well as one snapshot of its data to the resulting OSF project**. Afterwards, the OSF project URL can be cloned to retrieve the dataset and ``datalad get`` will be able to retrieve all file contents *in one version*. Compared to the ``annex`` mode, the dataset representation on the OSF is human-readable, but only one version of each file can be published. This mode is convenient if you want to share a dataset and its history in a human-readable way but only make one version of it available. Publishing Git history requires ``git push`` or ``datalad push``, and exporting a single view of the data must be done via ``git-annex export``. +- ``gitonly``: **You can push the Git history of a dataset, but no annexed data to the resulting OSF project**. Afterwards, the OSF project URL can be cloned to retrieve the dataset, but ``datalad get`` will not be able to retrieve file contents. This can be convenient if you want to use the OSF as an alternative to GitHub_. Note that the representation of the dataset is not human-readable, but tuned for cloning. Publishing Git history requires ``git push`` or ``datalad push``. 
+- ``exportonly``: **You can export the dataset in a human-readable way in one version**. Note that this type of sibling can not be cloned from the OSF. This option is the most convenient if you want to make one snapshot of your dataset available via the OSF. Exporting needs to be done via ``git-annex export`` and your dataset will only get a storage sibling. + + +In deciding which mode suits your use case you can consider the following questions: + +#. Do you want collaborators to be able to ``datalad clone`` your project? If yes, go for ``annex``, ``export``, or ``gitonly`` +#. Do you want to share your data? If yes, go for ``annex``, or -- if you're okay with sharing only a one version per file -- ``export`` and ``export only`` +#. Do you care how data looks like on the OSF? If not, go for ``annex``, if yes, use one of the ``export`` modes. Find out more about this in the :ref:`tutorial on exporting data `. + + + + +Access Management: Public or private projects +""""""""""""""""""""""""""""""""""""""""""""" + +By default, any new project created with ``create-sibling-osf`` is a `private OSF project `_ that can only be accessed by its creator and collaborators added via OSF's interface. +To make a project public, you can either transform it into a public project via the web interface, or use the ``--public`` flag of ``create-sibling-osf`` to create it publicly from the very start. +This constitutes a convenient access management system for your dataset. + +OSF project metadata +"""""""""""""""""""" + +Meta data helps to make your project discoverable and understandable. +The OSF provides several means of attaching meta data to a project: Tags and Categories. +By default, two tags are created upon project creation: "DataLad dataset" and the unique ID of your dataset. +Any amount of arbitrary additional tags can be specified with one or more ``--tag`` options. +Note that each tag needs to be specified with its own ``--tag`` parameter. 
The category of a project determines the small icon displayed in a project and helps with search and organization.
You can choose one out of several categories ("analysis", "communication", "data", "hypothesis",