Commit

Backport PR #837 to 1.0.x (#842)
* Concatenate_h5mu: fix writing multi-dimensional annotation frames. (#837)

* Concatenate_h5mu: fix writing multidimensional annotation frames.

* Undo some changes

* Update test

* Add PR number

* Trigger CI

* Update CHANGELOG

* deploy: 57add3f

* Update CHANGELOG

---------

Co-authored-by: DriesSchaumont <[email protected]>
DriesSchaumont and DriesSchaumont authored Jul 22, 2024
1 parent 0ead6c6 commit 2963adb
Showing 573 changed files with 2,368 additions and 2,251 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,12 @@
# openpipelines 1.0.2

## BUG FIXES

* `dataflow/concatenate_h5mu`: fix writing out multidimensional annotation dataframes (e.g. `.varm`) whose
dtype changed as a result of adding more observations during concatenation, causing a `TypeError`.
One notable example is when a multidimensional annotation dataframe is present in one sample but missing
from another, causing the missing values to be filled with `NA` (PR #842, backported from PR #837).

# openpipelines 1.0.1

## BUG FIXES
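To illustrate the failure mode described in the changelog entry above, here is a minimal, self-contained pandas sketch. It does not use the component's actual code; the frame, column name and values are invented for illustration. NA-filling a boolean annotation column during a join promotes it to `object` dtype, which the HDF5-backed writers reject, and casting it to a nullable pandas dtype makes it writable again.

```python
# Hypothetical reproduction of the dtype issue (not the component's code);
# the frame, column name and values below are invented for illustration.
import pandas as pd

varm_df = pd.DataFrame({"is_marker": [True, False]}, index=["gene_a", "gene_b"])
print(varm_df["is_marker"].dtype)  # bool

# After concatenation, the .varm frame is reindexed to the union of var names;
# the missing row is filled with NaN and the column is promoted to `object`.
union_index = pd.Index(["gene_a", "gene_b", "gene_c"])
joined = varm_df.reindex(union_index)
print(joined["is_marker"].dtype)  # object -> not writable to HDF5 as-is

# Casting to a nullable pandas dtype restores a writable column.
fixed = joined["is_marker"].astype(pd.BooleanDtype())
print(fixed.dtype)  # boolean
```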
10 changes: 9 additions & 1 deletion src/dataflow/concatenate_h5mu/script.py
@@ -220,7 +220,15 @@ def split_conflicts_modalities(n_processes: int, samples: dict[str, anndata.AnnD
output_index = getattr(output, matrix_name).index
conflicts, concatenated_matrix = concatenate_matrices(n_processes, matrices, output_index)
if concatenated_matrix.empty:
concatenated_matrix.index = output_index
concatenated_matrix.index = output_index

# Even though the varm and obsm matrices that were already present were not modified,
# joining the observations might have changed their dtype as well,
# so they also need to be cast to a writable dtype.
for multidim_name, multidim_data in getattr(output, f"{matrix_name}m").items():
new_data = cast_to_writeable_dtype(multidim_data) if isinstance(multidim_data, pd.DataFrame) else multidim_data
getattr(output, f"{matrix_name}m")[multidim_name] = new_data

# Write the conflicts to the output
for conflict_name, conflict_data in conflicts.items():
getattr(output, f"{matrix_name}m")[conflict_name] = conflict_data
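The loop above calls the component's `cast_to_writeable_dtype` helper, whose implementation is not shown in this diff. As a rough idea of what such a helper could do (an assumption, not the repository's actual implementation), a sketch might convert every column of a DataFrame to its nullable pandas equivalent so that mudata can serialize it:

```python
# Illustrative sketch only -- NOT the repository's cast_to_writeable_dtype.
# It shows the general idea: convert columns that were promoted to `object`
# or NaN-filled floats back to nullable pandas dtypes that can be written.
import pandas as pd


def cast_to_writeable_dtype_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` whose columns use nullable (writable) dtypes."""
    result = df.copy()
    for column in result.columns:
        # convert_dtypes() picks nullable equivalents (Int64, boolean, string, ...)
        # where possible and leaves already-writable columns untouched.
        result[column] = result[column].convert_dtypes()
    return result
```

In the actual component, the result of this cast is assigned back into the `.varm`/`.obsm` slots, as the loop in the diff shows.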
62 changes: 57 additions & 5 deletions src/dataflow/concatenate_h5mu/test.py
@@ -478,9 +478,14 @@ def test_concat_different_columns_per_modality_and_per_sample(run_component, sam
non_shared_features = data_sample1.var_names.difference(data_sample2.var_names)
assert concatenated_data.var.loc[non_shared_features, 'mod2:Feat4'].isna().all()

@pytest.mark.parametrize("test_value,expected", [("bar", "bar"), (True, True), (0.1, 0.1), (np.nan, pd.NA)])
@pytest.mark.parametrize("test_value,test_value_dtype,expected", [("bar", "str", "bar"),
(True, pd.BooleanDtype(), True),
(1, pd.Int16Dtype(), 1),
(0.1, float, 0.1),
(0.1, np.float64, 0.1),
(np.nan, np.float64, pd.NA)])
def test_concat_remove_na(run_component, sample_1_h5mu, sample_2_h5mu,
write_mudata_to_file, random_h5mu_path, test_value, expected,
write_mudata_to_file, random_h5mu_path, test_value, test_value_dtype, expected,
change_column_contents):
"""
Test concatenation of samples where the column from one sample contains NA values
@@ -492,7 +497,7 @@ def test_concat_remove_na(run_component, sample_1_h5mu, sample_2_h5mu,
"""
change_column_contents(sample_1_h5mu, 'var', 'Shared_feat', {'mod1': np.nan, 'mod2': np.nan})
change_column_contents(sample_2_h5mu, 'var', 'Shared_feat', {'mod1': test_value, 'mod2': np.nan})

sample_2_h5mu.var['Shared_feat'] = sample_2_h5mu.var['Shared_feat'].astype(test_value_dtype)
output_path = random_h5mu_path()

run_component([
@@ -547,9 +552,17 @@ def test_concat_invalid_h5_error_includes_path(run_component, tmp_path,
err.value.stdout.decode('utf-8'))


@pytest.mark.parametrize("test_value_1,test_value_2,expected", [(1, "1", pd.CategoricalDtype(categories=['1.0', '1']))])
@pytest.mark.parametrize("test_value_1,value_1_dtype,test_value_2,value_2_dtype,expected",
[(1, float, "1", str, pd.CategoricalDtype(categories=['1.0', '1'])),
(1, np.float64, "1", str, pd.CategoricalDtype(categories=['1.0', '1'])),
(1, pd.Int16Dtype(), 2.0, pd.Int16Dtype(), pd.Int64Dtype()),
(True, bool, False, bool, pd.BooleanDtype()),
(True, pd.BooleanDtype(), False, bool, pd.BooleanDtype()),
("foo", str, "bar", str, pd.CategoricalDtype(categories=['bar', 'foo'])),
]
)
def test_concat_dtypes_per_modality(run_component, write_mudata_to_file, change_column_contents,
sample_1_h5mu, sample_2_h5mu, test_value_1, test_value_2,
sample_1_h5mu, sample_2_h5mu, test_value_1, value_1_dtype, test_value_2, value_2_dtype,
expected, random_h5mu_path):
"""
Test joining columns with different dtypes to make sure that the result is writable.
@@ -561,7 +574,10 @@ def test_concat_dtypes_per_modality(run_component, write_mudata_to_file, change_
for the test column in mod2 is still writable.
"""
change_column_contents(sample_1_h5mu, "var", "test_col", {"mod1": test_value_1, "mod2": test_value_1})
sample_1_h5mu.var['test_col'] = sample_1_h5mu.var['test_col'].astype(value_1_dtype)
change_column_contents(sample_2_h5mu, "var", "test_col", {"mod1": test_value_2, "mod2": test_value_2})
sample_2_h5mu.var['test_col'] = sample_2_h5mu.var['test_col'].astype(value_2_dtype)

output_file = random_h5mu_path()
run_component([
"--input_id", "sample1;sample2",
@@ -573,6 +589,40 @@ def test_concat_dtypes_per_modality(run_component, write_mudata_to_file, change_
concatenated_data = md.read(output_file)
assert concatenated_data['mod2'].var['test_col'].dtype == expected


@pytest.mark.parametrize("test_value,value_dtype,expected",
[(1, float, pd.Int64Dtype()),
(1, np.float64, pd.Int64Dtype()),
(1, pd.Int16Dtype(), pd.Int16Dtype()),
(True, bool, pd.BooleanDtype()),
(True, pd.BooleanDtype(), pd.BooleanDtype()),
("foo", str, pd.CategoricalDtype(categories=['foo'])),
]
)
def test_concat_dtypes_per_modality_multidim(run_component, write_mudata_to_file,
sample_1_h5mu, sample_2_h5mu, test_value, value_dtype,
expected, random_h5mu_path):
"""
Test that the result of concatenation is still writable when the input already contains
data in .varm and this data is kept. Because observations are joined, the dtype of this
data may change and the result might no longer be writable.
"""

sample_1_h5mu['mod1'].varm['test_df'] = pd.DataFrame(index=sample_1_h5mu['mod1'].var_names)
sample_1_h5mu['mod1'].varm['test_df']['test_col'] = test_value
sample_1_h5mu['mod1'].varm['test_df']['test_col'] = sample_1_h5mu['mod1'].varm['test_df']['test_col'].astype(value_dtype)

output_file = random_h5mu_path()
run_component([
"--input_id", "sample1;sample2",
"--input", write_mudata_to_file(sample_1_h5mu),
"--input", write_mudata_to_file(sample_2_h5mu),
"--output", output_file,
"--other_axis_mode", "move"
])
concatenated_data = md.read(output_file)
assert concatenated_data['mod1'].varm['test_df']['test_col'].dtype == expected

@pytest.mark.parametrize("test_value_1,test_value_2,expected", [(1, "1", pd.CategoricalDtype(categories=['1.0', '1']))])
def test_concat_dtypes_global(run_component, write_mudata_to_file, change_column_contents,
sample_1_h5mu, sample_2_h5mu, test_value_1, test_value_2,
@@ -622,6 +672,8 @@ def test_non_overlapping_modalities(run_component, sample_2_h5mu, sample_3_h5mu,
"--output", output_path,
"--other_axis_mode", "move"
])
output_data = md.read(output_path)
assert set(output_data.mod.keys()) == {"mod1", "mod2", "mod3"}


def test_resolve_annotation_conflict_missing_column(run_component, sample_1_h5mu,
6 changes: 3 additions & 3 deletions target/docker/annotate/popv/.config.vsh.yaml
@@ -1,7 +1,7 @@
functionality:
name: "popv"
namespace: "annotate"
version: "1.0.1"
version: "1.0.2"
authors:
- name: "Matthias Beyens"
roles:
@@ -352,6 +352,6 @@ info:
output: "/home/runner/work/openpipeline/openpipeline/target/docker/annotate/popv"
executable: "/home/runner/work/openpipeline/openpipeline/target/docker/annotate/popv/popv"
viash_version: "0.8.6"
git_commit: "8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
git_commit: "57add3ff137ac44d67bc7456cb9146bc6dc633cc"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "1.0.0-1-g8ba584e550"
git_tag: "1.0.1-3-g57add3ff13"
12 changes: 6 additions & 6 deletions target/docker/annotate/popv/popv
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# popv 1.0.1
# popv 1.0.2
#
# This wrapper script is auto-generated by viash 0.8.6 and is thus a derivative
# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -174,7 +174,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"

# ViashHelp: Display helpful explanation about this executable
function ViashHelp {
echo "popv 1.0.1"
echo "popv 1.0.2"
echo ""
echo "Performs popular major vote cell typing on single cell sequence data using"
echo "multiple algorithms. Note that this is a one-shot version of PopV."
@@ -503,10 +503,10 @@ RUN cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git && \
LABEL org.opencontainers.image.authors="Matthias Beyens, Robrecht Cannoodt"
LABEL org.opencontainers.image.description="Companion container for running component annotate popv"
LABEL org.opencontainers.image.created="2024-06-18T14:14:56Z"
LABEL org.opencontainers.image.created="2024-07-22T07:16:09Z"
LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
LABEL org.opencontainers.image.revision="8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
LABEL org.opencontainers.image.version="1.0.1"
LABEL org.opencontainers.image.revision="57add3ff137ac44d67bc7456cb9146bc6dc633cc"
LABEL org.opencontainers.image.version="1.0.2"
VIASHDOCKER
}
@@ -661,7 +661,7 @@ while [[ $# -gt 0 ]]; do
shift 1
;;
--version)
echo "popv 1.0.1"
echo "popv 1.0.2"
exit
;;
--input)
6 changes: 3 additions & 3 deletions target/docker/cluster/leiden/.config.vsh.yaml
@@ -1,7 +1,7 @@
functionality:
name: "leiden"
namespace: "cluster"
version: "1.0.1"
version: "1.0.2"
authors:
- name: "Dries De Maeyer"
roles:
@@ -230,6 +230,6 @@ info:
output: "/home/runner/work/openpipeline/openpipeline/target/docker/cluster/leiden"
executable: "/home/runner/work/openpipeline/openpipeline/target/docker/cluster/leiden/leiden"
viash_version: "0.8.6"
git_commit: "8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
git_commit: "57add3ff137ac44d67bc7456cb9146bc6dc633cc"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "1.0.0-1-g8ba584e550"
git_tag: "1.0.1-3-g57add3ff13"
12 changes: 6 additions & 6 deletions target/docker/cluster/leiden/leiden
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# leiden 1.0.1
# leiden 1.0.2
#
# This wrapper script is auto-generated by viash 0.8.6 and is thus a derivative
# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -173,7 +173,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"

# ViashHelp: Display helpful explanation about this executable
function ViashHelp {
echo "leiden 1.0.1"
echo "leiden 1.0.2"
echo ""
echo "Cluster cells using the [Leiden algorithm] [Traag18] implemented in the [Scanpy"
echo "framework] [Wolf18]."
@@ -460,10 +460,10 @@ RUN pip install --upgrade pip && \
LABEL org.opencontainers.image.authors="Dries De Maeyer"
LABEL org.opencontainers.image.description="Companion container for running component cluster leiden"
LABEL org.opencontainers.image.created="2024-06-18T14:14:57Z"
LABEL org.opencontainers.image.created="2024-07-22T07:16:05Z"
LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
LABEL org.opencontainers.image.revision="8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
LABEL org.opencontainers.image.version="1.0.1"
LABEL org.opencontainers.image.revision="57add3ff137ac44d67bc7456cb9146bc6dc633cc"
LABEL org.opencontainers.image.version="1.0.2"
VIASHDOCKER
}
@@ -618,7 +618,7 @@ while [[ $# -gt 0 ]]; do
shift 1
;;
--version)
echo "leiden 1.0.1"
echo "leiden 1.0.2"
exit
;;
--input)
6 changes: 3 additions & 3 deletions target/docker/compression/compress_h5mu/.config.vsh.yaml
@@ -1,7 +1,7 @@
functionality:
name: "compress_h5mu"
namespace: "compression"
version: "1.0.1"
version: "1.0.2"
authors:
- name: "Dries Schaumont"
roles:
@@ -171,6 +171,6 @@ info:
output: "/home/runner/work/openpipeline/openpipeline/target/docker/compression/compress_h5mu"
executable: "/home/runner/work/openpipeline/openpipeline/target/docker/compression/compress_h5mu/compress_h5mu"
viash_version: "0.8.6"
git_commit: "8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
git_commit: "57add3ff137ac44d67bc7456cb9146bc6dc633cc"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "1.0.0-1-g8ba584e550"
git_tag: "1.0.1-3-g57add3ff13"
12 changes: 6 additions & 6 deletions target/docker/compression/compress_h5mu/compress_h5mu
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# compress_h5mu 1.0.1
# compress_h5mu 1.0.2
#
# This wrapper script is auto-generated by viash 0.8.6 and is thus a derivative
# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -173,7 +173,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"

# ViashHelp: Display helpful explanation about this executable
function ViashHelp {
echo "compress_h5mu 1.0.1"
echo "compress_h5mu 1.0.2"
echo ""
echo "Compress a MuData file."
echo ""
@@ -423,10 +423,10 @@ RUN pip install --upgrade pip && \
LABEL org.opencontainers.image.authors="Dries Schaumont"
LABEL org.opencontainers.image.description="Companion container for running component compression compress_h5mu"
LABEL org.opencontainers.image.created="2024-06-18T14:14:58Z"
LABEL org.opencontainers.image.created="2024-07-22T07:16:06Z"
LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
LABEL org.opencontainers.image.revision="8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
LABEL org.opencontainers.image.version="1.0.1"
LABEL org.opencontainers.image.revision="57add3ff137ac44d67bc7456cb9146bc6dc633cc"
LABEL org.opencontainers.image.version="1.0.2"
VIASHDOCKER
}
@@ -581,7 +581,7 @@ while [[ $# -gt 0 ]]; do
shift 1
;;
--version)
echo "compress_h5mu 1.0.1"
echo "compress_h5mu 1.0.2"
exit
;;
--input)
6 changes: 3 additions & 3 deletions target/docker/compression/tar_extract/.config.vsh.yaml
@@ -1,7 +1,7 @@
functionality:
name: "tar_extract"
namespace: "compression"
version: "1.0.1"
version: "1.0.2"
arguments:
- type: "file"
name: "--input"
@@ -107,6 +107,6 @@ info:
output: "/home/runner/work/openpipeline/openpipeline/target/docker/compression/tar_extract"
executable: "/home/runner/work/openpipeline/openpipeline/target/docker/compression/tar_extract/tar_extract"
viash_version: "0.8.6"
git_commit: "8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
git_commit: "57add3ff137ac44d67bc7456cb9146bc6dc633cc"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "1.0.0-1-g8ba584e550"
git_tag: "1.0.1-3-g57add3ff13"
12 changes: 6 additions & 6 deletions target/docker/compression/tar_extract/tar_extract
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# tar_extract 1.0.1
# tar_extract 1.0.2
#
# This wrapper script is auto-generated by viash 0.8.6 and is thus a derivative
# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
@@ -170,7 +170,7 @@ VIASH_META_TEMP_DIR="$VIASH_TEMP"

# ViashHelp: Display helpful explanation about this executable
function ViashHelp {
echo "tar_extract 1.0.1"
echo "tar_extract 1.0.2"
echo ""
echo "Extract files from a tar archive"
echo ""
@@ -421,10 +421,10 @@ ENTRYPOINT []
RUN :
LABEL org.opencontainers.image.description="Companion container for running component compression tar_extract"
LABEL org.opencontainers.image.created="2024-06-18T14:14:58Z"
LABEL org.opencontainers.image.created="2024-07-22T07:16:06Z"
LABEL org.opencontainers.image.source="https://github.com/openpipelines-bio/openpipeline"
LABEL org.opencontainers.image.revision="8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
LABEL org.opencontainers.image.version="1.0.1"
LABEL org.opencontainers.image.revision="57add3ff137ac44d67bc7456cb9146bc6dc633cc"
LABEL org.opencontainers.image.version="1.0.2"
VIASHDOCKER
}
@@ -579,7 +579,7 @@ while [[ $# -gt 0 ]]; do
shift 1
;;
--version)
echo "tar_extract 1.0.1"
echo "tar_extract 1.0.2"
exit
;;
--input)
6 changes: 3 additions & 3 deletions target/docker/convert/from_10xh5_to_h5mu/.config.vsh.yaml
@@ -1,7 +1,7 @@
functionality:
name: "from_10xh5_to_h5mu"
namespace: "convert"
version: "1.0.1"
version: "1.0.2"
authors:
- name: "Robrecht Cannoodt"
roles:
@@ -277,6 +277,6 @@ info:
output: "/home/runner/work/openpipeline/openpipeline/target/docker/convert/from_10xh5_to_h5mu"
executable: "/home/runner/work/openpipeline/openpipeline/target/docker/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu"
viash_version: "0.8.6"
git_commit: "8ba584e550ac93c484a8b6ddea0c44618d02cf3f"
git_commit: "57add3ff137ac44d67bc7456cb9146bc6dc633cc"
git_remote: "https://github.com/openpipelines-bio/openpipeline"
git_tag: "1.0.0-1-g8ba584e550"
git_tag: "1.0.1-3-g57add3ff13"
