From 0be48f97f30aeeb528943e541d92104b4a447d79 Mon Sep 17 00:00:00 2001
From: AlexVCaron
Date: Wed, 18 Dec 2024 17:52:10 +0000
Subject: [PATCH] revamp test data loader, now handles cache for real. fix
 nf-test diff rendering

---
 .devcontainer/devops/devcontainer.json       |   4 +-
 docs/cspell/neuroscience.txt                 |   3 +-
 poetry.lock                                  |   4 +-
 pyproject.toml                               |   1 +
 subworkflows/nf-neuro/load_test_data/main.nf | 161 ++++++++++++++++---
 tests/nextflow.config                        |   3 +-
 tests/test_data.json                         |   1 -
 7 files changed, 144 insertions(+), 33 deletions(-)

diff --git a/.devcontainer/devops/devcontainer.json b/.devcontainer/devops/devcontainer.json
index ad6788aa..9c044d5a 100755
--- a/.devcontainer/devops/devcontainer.json
+++ b/.devcontainer/devops/devcontainer.json
@@ -4,7 +4,9 @@
         "dockerfile": "Dockerfile",
         "args": {
             "NFTEST_VERSION": "0.9.0",
-            "POETRY_VERSION": "1.8.*"
+            "POETRY_VERSION": "1.8.*",
+            "NFT_DIFF": "pdiff",
+            "NFT_DIFF_ARGS": "--line-numbers --width 120 --expand-tabs=2"
         }
     },
     "forwardPorts": [3000],
diff --git a/docs/cspell/neuroscience.txt b/docs/cspell/neuroscience.txt
index 3008e2f8..2b27901f 100644
--- a/docs/cspell/neuroscience.txt
+++ b/docs/cspell/neuroscience.txt
@@ -29,6 +29,7 @@
 *metrics
 *morph*
 *mov*
+*neuro*
 *normalise
 *pack
 *par
@@ -64,11 +65,11 @@ fsl*
 gagnon*
 interp
 medde
+mkdirs
 mppca
 mrdegibbs
 mrtrix
 msmt
-neuro*
 nextflow
 nf*
 nifti
diff --git a/poetry.lock b/poetry.lock
index 8100f7b8..185cdd26 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
 
 [[package]]
 name = "annotated-types"
@@ -2429,4 +2429,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "<3.11,>=3.9"
-content-hash = "86ff61aa15873147e59b28a6ff0b1ac405796dde631e28216829299332fdfa26"
+content-hash = "84934e125505bfaca09ac4232c63d5c53fa66696503f122fcf1feef8d306e4e8"
diff --git a/pyproject.toml b/pyproject.toml
index 155550ea..fc149a48 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,3 +38,4 @@ python = "<3.11,>=3.9"
 nf-core = "~2.14.1"
 black = "^24.1.1"
 isort = "^5.13.2"
+pdiff = "^1.1.4"
diff --git a/subworkflows/nf-neuro/load_test_data/main.nf b/subworkflows/nf-neuro/load_test_data/main.nf
index 612c68a7..5a1dbcc3 100644
--- a/subworkflows/nf-neuro/load_test_data/main.nf
+++ b/subworkflows/nf-neuro/load_test_data/main.nf
@@ -1,36 +1,147 @@
-import java.nio.file.Files
 
-def fetch_archive ( name, destination, remote, database, data_identifiers ) {
-    // Find cache location for test archives
+
+def locate_local_cache () {
+    // Find cache location for test archives, in order of preference:
+    //   1. Using environment variable $NFNEURO_TEST_DATA_HOME
+    //   2. Using environment variable $XDG_DATA_HOME
+    //   3. Using default location $HOME/.local/share
+    //
+    // The selected location is appended with 'nf-neuro-test-archives'.
+    // If the location does not exist, it is created.
+
     def storage = file( System.getenv('NFNEURO_TEST_DATA_HOME') ?: System.getenv('XDG_DATA_HOME') ?: "${System.getenv('HOME')}/.local/share" )
     def cache_location = file("$storage/nf-neuro-test-archives")
-    if ( !cache_location.exists() ) cache_location.mkdirs()
 
-    // Fetch file from remote if not present in cache
-    def data_id = data_identifiers[name]
-    if ( !data_id ) {
-        error "Invalid test data identifier supplied: $name"
+    if ( !cache_location.exists() ) {
+        try {
+            cache_location.mkdirs()
+        }
+        catch (Exception _e) {
+            error "Failed to create cache location: $cache_location"
+        }
     }
 
+    return cache_location
+}
+
+def locate_remote_cache () {
+    return "$params.test_data_remote/$params.test_database_path"
+}
+
+def load_manifest () {
+    // Load test data associations from params.test_data_associations,
+    // which must be a map of test data identifiers [filename: identifier]
+
+    if ( ! params.test_data_associations ) {
+        error """
+        No test data associations provided, cannot create cache manifest. Please
+        provide a map of test data identifiers [filename: identifier] using
+        params.test_data_associations.
+        """
+    }
+
+    return params.test_data_associations
+}
+
+def validate_cache_entry ( name, manager ) {
+    // Check if the cache entry is present in the manifest
+
+    if ( !manager.manifest[name] ) {
+        error "Invalid cache entry supplied: $name"
+    }
+
+}
+
+def add_cache_entry ( name, manager ) {
+    // Add the test data archive as an entry in the cache. The archive is
+    // fetched from the remote location and stored in the cache location.
+    // The given name is validated against the manifest before adding.
+
+    manager.validate_entry(name)
+
+    def identifier = "${manager.manifest[name]}"
+    def cache_entry = file("${manager.cache_location}/$identifier")
+    def remote_subpath = "${identifier[0..1]}/${identifier[2..-1]}"
+    def remote_entry = file("$manager.remote_location/$remote_subpath")
+
+    try {
+        remote_entry.copyTo(cache_entry)
+    }
+    catch (Exception _e) {
+        manager.delete_entry(name)
+        error "Failed to fetch test data archive: $name | $_e"
     }
 
-    def cache_entry = file("$cache_location/$data_id")
-    if ( !cache_entry.exists() ) {
+    return cache_entry
+}
+
+def get_cache_entry ( name, manager ) {
+    // Retrieve the cache entry for the given test data archive name.
+    // If the entry does not exist, it is added to the cache. The add
+    // operation will validate the name against the manifest.
+
+    def identifier = "${manager.manifest[name]}"
+    def cache_entry = file("${manager.cache_location}/$identifier")
+
+    if ( !cache_entry.exists() ) manager.add_entry(name)
+
+    return cache_entry
+}
+
+def delete_cache_entry ( name, manager ) {
+    // Delete the cache entry for the given test data archive name.
+
+    def identifier = "${manager.manifest[name]}"
+    def cache_entry = file("${manager.cache_location}/$identifier")
+    if ( cache_entry.exists() ) {
         try {
-            def remote_entry = "${data_id[0..1]}/${data_id[2..-1]}"
-            file("$remote/$database/$remote_entry").copyTo(cache_entry)
+            cache_entry.delete()
         }
-        catch (Exception e) {
-            error "Failed to fetch test data archive: $name"
-            file("$remote/$database/$remote_entry").delete()
+        catch (Exception _e) {
+            error "Failed to delete test data archive: $name"
         }
     }
+}
 
+def update_cache_entry ( name, manager ) {
+    // Update the cache entry for the given test data archive name. The
+    // procedure uses add to carry out the update, but deletes the entry
+    // first if it exists. The add operation will validate the name
+    // against the manifest.
+
+    manager.delete_entry(name)
+    manager.add_entry(name)
+}
+
+def setup_cache () {
+    // Build a cache manager to encapsulate interaction with the test data cache.
+    // The manager follows simple CRUD operations to handle the update and
+    // retrieval of test data archives from the cache and the remote location.
+
+    def cache_manager = new Expando(
+        remote_location: locate_remote_cache(),
+        cache_location: locate_local_cache(),
+        manifest: load_manifest()
+    )
+    cache_manager.validate_entry = { v -> validate_cache_entry( v, cache_manager ) }
+    cache_manager.add_entry = { v -> add_cache_entry(v, cache_manager) }
+    cache_manager.get_entry = { v -> get_cache_entry(v, cache_manager) }
+    cache_manager.delete_entry = { v -> delete_cache_entry(v, cache_manager) }
+    cache_manager.update_entry = { v -> update_cache_entry(v, cache_manager) }
+
+    return cache_manager
+}
+
+
+def fetch_archive ( name, destination, manager ) {
     // Unzip all archive content to destination
-    def content = new java.util.zip.ZipFile("$cache_entry")
+    def content = null
     try {
+        content = new java.util.zip.ZipFile("${manager.get_entry(name)}")
         content.entries().each{ entry ->
             def local_target = file("$destination/${entry.getName()}")
             if (entry.isDirectory()) {
@@ -42,11 +153,14 @@ def fetch_archive ( name, destination, remote, database, data_identifiers ) {
                 }
             }
         }
+        content.close()
 
         return destination.resolve("${name.take(name.lastIndexOf('.'))}")
     }
-    finally {
-        content.close()
+    catch (Exception _e) {
+        if (content) content.close()
+        manager.delete_entry(name)
+        error "Failed to extract test data archive: $name | $_e"
     }
 }
 
@@ -57,16 +171,11 @@ workflow LOAD_TEST_DATA {
     take:
         ch_archive
         test_data_prefix
     main:
+        manager = setup_cache()
-        ch_versions = Channel.empty()
-        test_data_path = Files.createTempDirectory("$test_data_prefix")
+        test_data_path = java.nio.file.Files.createTempDirectory("$test_data_prefix")
 
         ch_test_data_directory = ch_archive.map{ archive ->
-            fetch_archive(
-                archive, test_data_path,
-                params.test_data_remote,
-                params.test_database_path,
-                params.test_data_associations
-            )
+            fetch_archive(archive, test_data_path, manager)
         }
     emit:
diff --git a/tests/nextflow.config b/tests/nextflow.config
index c7b2182a..77959c0a 100644
--- a/tests/nextflow.config
+++ b/tests/nextflow.config
@@ -1,4 +1,3 @@
-import groovy.json.JsonSlurper
 
 params {
     outdir = "output/"
@@ -7,7 +6,7 @@ params {
     test_data_remote = "https://scil.usherbrooke.ca"
     test_database_path = "scil_test_data/dvc-store/files/md5"
-    test_data_associations = new JsonSlurper().parse(
+    test_data_associations = new groovy.json.JsonSlurper().parse(
         new File("$projectDir/tests/test_data.json")
     )
 }
diff --git a/tests/test_data.json b/tests/test_data.json
index c7a1bdb5..cd5e7f52 100644
--- a/tests/test_data.json
+++ b/tests/test_data.json
@@ -57,6 +57,5 @@
     "freesurfer_nifti.zip": "adb5ac4cf5c45040339e04e7c142e8c9",
     "transform.zip": "148afd665ddbd2bb80493208480571a9",
     "dicom.zip": "234913cbad53c19aa19aef9eda0a3839",
-    "freesurfer_nifti.zip": "adb5ac4cf5c45040339e04e7c142e8c9",
     "TOPUP.zip": "da11914087a1a4ed1d21d478540d41b0"
 }
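
---

Notes (illustrative sketches, not part of the applied diff):

locate_local_cache() resolves the cache root from the environment, preferring
$NFNEURO_TEST_DATA_HOME, then $XDG_DATA_HOME, then $HOME/.local/share, and
suffixes it with 'nf-neuro-test-archives'. A minimal standalone Groovy sketch
of that fallback chain, shown in isolation from the subworkflow:

    // Same resolution order as locate_local_cache().
    def storage = System.getenv('NFNEURO_TEST_DATA_HOME') ?:   // explicit override
        System.getenv('XDG_DATA_HOME') ?:                      // XDG data directory
        "${System.getenv('HOME')}/.local/share"                // default fallback
    println "test archives cached under: $storage/nf-neuro-test-archives"

Exporting NFNEURO_TEST_DATA_HOME before a run therefore relocates the whole
cache without touching the subworkflow.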
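add_cache_entry() addresses the remote store the way DVC lays out its md5 file
database: the first two characters of an archive's identifier become a
directory, the rest is the file name. A worked example using the identifier
recorded for transform.zip in tests/test_data.json:

    // Remote subpath derivation, as done in add_cache_entry().
    def identifier = "148afd665ddbd2bb80493208480571a9"   // transform.zip
    def remote_subpath = "${identifier[0..1]}/${identifier[2..-1]}"
    assert remote_subpath == "14/8afd665ddbd2bb80493208480571a9"
    // Joined with the remote root from tests/nextflow.config, this resolves to:
    // https://scil.usherbrooke.ca/scil_test_data/dvc-store/files/md5/14/8afd665ddbd2bb80493208480571a9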
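The subworkflow take channels (ch_archive, test_data_prefix) are unchanged, so
callers only lose the need to thread params through fetch_archive. A
hypothetical caller follows, assuming the include path is resolved from the
repository root; the emit block falls outside the diff context, so output
wiring is omitted here:

    include { LOAD_TEST_DATA } from './subworkflows/nf-neuro/load_test_data/main'

    workflow test_load {
        // Any key of tests/test_data.json is a valid archive name.
        archives = Channel.of( "transform.zip" )
        LOAD_TEST_DATA( archives, "nf-neuro-test" )
    }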