some test on merged authors

esg-epfl-apc · May 23, 2024 · 533bf39 · 533bf39
1 parent 2e5bff3
commit 533bf39
Show file tree

Hide file tree

Showing 4 changed files with 289 additions and 6 deletions.
diff --git a/tests/test_absolutize.py b/tests/test_absolutize.py
@@ -19,7 +19,7 @@ def get_test_data_file(filename=""):
 
 class TestAbsolutizePaths:  # (unittest.TestCase):
     # NOTE: ids can not be found, like 634, or forbidden, like 678
-    @pytest.mark.parametrize("workflow_id", [41, 552, 883])
+    @pytest.mark.parametrize("workflow_id", [41, 31, 552, 883])
     def test_make_paths_absolute(self, workflow_id):
         with patch_rdflib_urlopen(get_test_data_file(), write_cache=False):
             with open(
@@ -42,11 +42,19 @@ def test_make_paths_absolute(self, workflow_id):
     def test_merged(self):
         G = merge_all_files(
             get_test_data_file("[0-9]*ro-crate*.json"),
-            cache_base_dir=get_test_data_file(),
-            write_cache=False,
+            cache_kwargs=dict(
+                cache_base_dir=get_test_data_file(),
+                write_cache=False,
+            ),
         )
 
         assert is_all_absolute(G)
 
-        for s, o in G.subject_objects(rdflib.URIRef("http://schema.org/author")):
-            print(s, o)
+        authors = set(
+            [o for s, o in G.subject_objects(rdflib.URIRef("http://schema.org/author"))]
+        )
+
+        # checking that we got some useful data about the authors
+        assert authors == set(
+            [rdflib.term.Literal("Arnaud Meng, Maxim Scheremetjew, Michael Crusoe")]
+        )
diff --git a/tests/test_data/31_ro-crate-metadata.json b/tests/test_data/31_ro-crate-metadata.json
@@ -0,0 +1,253 @@
+{
+  "@context": "https://w3id.org/ro/crate/1.0/context",
+  "@graph": [
+    {
+      "@id": "ro-crate-metadata.jsonld",
+      "@type": "CreativeWork",
+      "about": {
+        "@id": "./"
+      }
+    },
+    {
+      "@id": "ro-crate-preview.html",
+      "@type": "CreativeWork",
+      "about": {
+        "@id": "./"
+      }
+    },
+    {
+      "@id": "./",
+      "@type": "Dataset",
+      "name": "TranscriptsAnnotation-i5only-wf",
+      "description": "Transcripts annotation workflow",
+      "license": "https://www.apache.org/licenses/LICENSE-2.0",
+      "author": "Arnaud Meng, Maxim Scheremetjew, Michael Crusoe",
+      "publisher": "EMBL - European Bioinformatics Institute, 2018",
+      "url": "https://github.com/EBI-Metagenomics/workflow-is-cwl/blob/master/workflows/TranscriptsAnnotation-i5only-wf.cwl",
+      "mainEntity": {
+        "@id": "workflows/TranscriptsAnnotation-i5only-wf.cwl"
+      },
+      "hasPart": [
+        {
+          "@id": "workflows/TranscriptsAnnotation-i5only-wf.cwl"
+        },
+        {
+          "@id": "diagram.svg"
+        },
+        {
+          "@id": "workflows/TransDecoder-v5-wf-2steps.cwl"
+        },
+        {
+          "@id": "tools/TransDecoder/TransDecoder.Predict-v5.cwl"
+        },
+        {
+          "@id": "tools/TransDecoder/TransDecoder-v5-genetic_codes.yaml"
+        },
+        {
+          "@id": "tools/TransDecoder/TransDecoder.LongOrfs-v5.cwl"
+        },
+        {
+          "@id": "workflows/InterProScan-v5-chunked-wf.cwl"
+        },
+        {
+          "@id": "tools/InterProScan/InterProScan-v5.cwl"
+        },
+        {
+          "@id": "tools/InterProScan/InterProScan-apps.yaml"
+        },
+        {
+          "@id": "tools/InterProScan/InterProScan-protein_formats.yaml"
+        },
+        {
+          "@id": "utils/fasta_chunker.cwl"
+        },
+        {
+          "@id": "utils/concatenate.cwl"
+        },
+        {
+          "@id": "utils/clean_fasta_headers.cwl"
+        },
+        {
+          "@id": "tools/Diamond/Diamon.blastx-v0.9.21.cwl"
+        },
+        {
+          "@id": "tools/Diamond/Diamond-strand_values.yaml"
+        },
+        {
+          "@id": "tools/Diamond/Diamond-output_formats.yaml"
+        },
+        {
+          "@id": "utils/esl-reformat.cwl"
+        },
+        {
+          "@id": "utils/esl-reformat-replace.yaml"
+        },
+        {
+          "@id": "utils/cut_fasta_headers.cwl"
+        },
+        {
+          "@id": "tools/BUSCO/BUSCO-assessment_modes.yaml"
+        },
+        {
+          "@id": "workflows/cmsearch-multimodel-wf.cwl"
+        },
+        {
+          "@id": "tools/cmsearch-deoverlap/cmsearch-deoverlap-v0.02.cwl"
+        },
+        {
+          "@id": "tools/Infernal/cmsearch/infernal-cmsearch-v1.1.2.cwl"
+        },
+        {
+          "@id": "tools/BUSCO/BUSCO-v3.cwl"
+        }
+      ]
+    },
+    {
+      "@id": "workflows/TranscriptsAnnotation-i5only-wf.cwl",
+      "@type": [
+        "File",
+        "SoftwareSourceCode",
+        "Workflow"
+      ],
+      "programmingLanguage": {
+        "@id": "#cwl"
+      },
+      "name": "TranscriptsAnnotation-i5only-wf",
+      "contentSize": 5490,
+      "image": {
+        "@id": "diagram.svg"
+      }
+    },
+    {
+      "@id": "diagram.svg",
+      "@type": [
+        "File",
+        "ImageObject",
+        "WorkflowSketch"
+      ],
+      "contentSize": 34556
+    },
+    {
+      "@id": "workflows/TransDecoder-v5-wf-2steps.cwl",
+      "@type": "File",
+      "contentSize": 1547
+    },
+    {
+      "@id": "tools/TransDecoder/TransDecoder.Predict-v5.cwl",
+      "@type": "File",
+      "contentSize": 5722
+    },
+    {
+      "@id": "tools/TransDecoder/TransDecoder-v5-genetic_codes.yaml",
+      "@type": "File",
+      "contentSize": 415
+    },
+    {
+      "@id": "tools/TransDecoder/TransDecoder.LongOrfs-v5.cwl",
+      "@type": "File",
+      "contentSize": 3920
+    },
+    {
+      "@id": "workflows/InterProScan-v5-chunked-wf.cwl",
+      "@type": "File",
+      "contentSize": 2940
+    },
+    {
+      "@id": "tools/InterProScan/InterProScan-v5.cwl",
+      "@type": "File",
+      "contentSize": 4556
+    },
+    {
+      "@id": "tools/InterProScan/InterProScan-apps.yaml",
+      "@type": "File",
+      "contentSize": 283
+    },
+    {
+      "@id": "tools/InterProScan/InterProScan-protein_formats.yaml",
+      "@type": "File",
+      "contentSize": 78
+    },
+    {
+      "@id": "utils/fasta_chunker.cwl",
+      "@type": "File",
+      "contentSize": 2178
+    },
+    {
+      "@id": "utils/concatenate.cwl",
+      "@type": "File",
+      "contentSize": 1217
+    },
+    {
+      "@id": "utils/clean_fasta_headers.cwl",
+      "@type": "File",
+      "contentSize": 928
+    },
+    {
+      "@id": "tools/Diamond/Diamon.blastx-v0.9.21.cwl",
+      "@type": "File",
+      "contentSize": 4441
+    },
+    {
+      "@id": "tools/Diamond/Diamond-strand_values.yaml",
+      "@type": "File",
+      "contentSize": 54
+    },
+    {
+      "@id": "tools/Diamond/Diamond-output_formats.yaml",
+      "@type": "File",
+      "contentSize": 79
+    },
+    {
+      "@id": "utils/esl-reformat.cwl",
+      "@type": "File",
+      "contentSize": 1408
+    },
+    {
+      "@id": "utils/esl-reformat-replace.yaml",
+      "@type": "File",
+      "contentSize": 381
+    },
+    {
+      "@id": "utils/cut_fasta_headers.cwl",
+      "@type": "File",
+      "contentSize": 970
+    },
+    {
+      "@id": "tools/BUSCO/BUSCO-assessment_modes.yaml",
+      "@type": "File",
+      "contentSize": 63
+    },
+    {
+      "@id": "workflows/cmsearch-multimodel-wf.cwl",
+      "@type": "File",
+      "contentSize": 1892
+    },
+    {
+      "@id": "tools/cmsearch-deoverlap/cmsearch-deoverlap-v0.02.cwl",
+      "@type": "File",
+      "contentSize": 1925
+    },
+    {
+      "@id": "tools/Infernal/cmsearch/infernal-cmsearch-v1.1.2.cwl",
+      "@type": "File",
+      "contentSize": 4376
+    },
+    {
+      "@id": "tools/BUSCO/BUSCO-v3.cwl",
+      "@type": "File",
+      "contentSize": 9661
+    },
+    {
+      "@id": "#cwl",
+      "@type": "ComputerLanguage",
+      "name": "Common Workflow Language",
+      "alternateName": "CWL",
+      "identifier": {
+        "@id": "https://w3id.org/cwl/v1.0/"
+      },
+      "url": {
+        "@id": "https://www.commonwl.org/"
+      }
+    }
+  ]
+}
diff --git a/workflowhub_graph/cachedurlopen.py b/workflowhub_graph/cachedurlopen.py
@@ -8,6 +8,12 @@
 
 
 def url_to_filename(url):
+    """
+    Converts a URL to a filename by replacing non-alphanumeric characters with dashes.
+    :param url: The URL to convert.
+    :return: The filename.
+    """
+
     parsed = urlparse(url)
     if parsed.scheme not in ["http", "https"]:
         raise ValueError(f"Unsupported scheme {parsed.scheme}")
@@ -21,6 +27,13 @@ def patch_rdflib_urlopen(
     write_cache=True,
     allowed_urls_pattern=r"https://w3id.org/ro/crate/1\.[01]/context",
 ):
+    """
+    Context manager to patch rdflib.parser.urlopen to cache and return the content of a URL.
+    :param cache_base_dir: The base directory to store the cached files.
+    :param write_cache: Whether to write the cache if the file is not found.
+    :param allowed_urls_pattern: A regex pattern to match the allowed URLs to cache.
+    """
+
     allowed_urls_re = re.compile(allowed_urls_pattern)
     if cache_base_dir is None:
         cache_base_dir = "cached_urlopen"

diff --git a/workflowhub_graph/merge.py b/workflowhub_graph/merge.py
@@ -2,6 +2,7 @@
 import glob
 import json
 import os
+from typing import Optional
 
 import rdflib
 
@@ -11,8 +12,16 @@
 
 
 def merge_all_files(
-    pattern="data/*.json", base_url=BASE_URL, **cache_kwargs
+    pattern="data/*.json", base_url: str = BASE_URL, cache_kwargs: Optional[dict] = None
 ) -> rdflib.Graph:
+    """
+    Merges all JSON-LD files in the given pattern into a single RDF graph.
+    :param pattern: The pattern to match the files.
+    :param base_url: The base URL for the WorkflowHub.
+    :param cache_kwargs: Optional dict of keyword arguments for the URL cache (e.g. cache_base_dir, write_cache).
+    :return: The merged RDF graph.
+    """
+
     G = rdflib.Graph()
 
     filenames = glob.glob(pattern)