Merge pull request #19 from RIVM-bioinformatics/parse_libraries

Parse libraries
RIVM-bioinformatics · Feb 9, 2024 · 72da214 · 72da214
2 parents b681346 + 6c10ef4
commit 72da214
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,6 @@ build/
 
 # Conda
 conda
+
+# config files
+config/sample_sheet.yaml
diff --git a/juno_library/juno_library.py b/juno_library/juno_library.py
@@ -29,7 +29,7 @@
     get_commit_git,
     get_repo_url,
 )
-from typing import Any, Optional, Dict, cast
+from typing import Any, Optional, Dict, Tuple, cast
 import argparse
 
 
@@ -398,17 +398,35 @@ def __enlist_fastq_samples(self, dir: Path) -> None:
         # because they get confused with the identifiers of forward and reverse
         # reads.
         pattern = re.compile(
-            r"(.*?)(?:_S\d+_|_S\d+.|_|\.)(?:_L555_)?(?:p)?R?(1|2)(?:_.*\.|\..*\.|\.)f(ast)?q(\.gz)?"
+            r"(.*?)(?:_S\d+_|_)(?:L\d{3}_)?(?:p)?R?(1|2)(?:_.*|\..*)?\.f(ast)?q(\.gz)?"
         )
+        observed_combinations: Dict[Tuple[str, str], str] = {}
+        errors = []
         for file_ in dir.iterdir():
+            filepath_ = str(file_.resolve())
             if validate_file_has_min_lines(file_, self.min_num_lines):
                 if match := pattern.fullmatch(file_.name):
                     sample_name = match.group(1)
                     read_group = match.group(2)
                     if sample_name in self.excluded_samples:
                         continue
+                    # Check whether this (sample_name, read_group) combination has already been seen.
+                    # If it has, the sample may be spread over multiple sequencing lanes.
+                    if (sample_name, read_group) in observed_combinations:
+                        observed_file = observed_combinations[sample_name, read_group]
+                        errors.append(
+                            KeyError(
+                                f"Multiple fastq files ({observed_file} and {filepath_}) matching the same sample ({sample_name}) and read group ({read_group}). This pipeline expects only one fastq file per sample and read group."
+                            )
+                        )
+                    else:
+                        observed_combinations[(sample_name, read_group)] = filepath_
                     sample = self.sample_dict.setdefault(match.group(1), {})
-                    sample[f"R{read_group}"] = str(file_.resolve())
+                    sample[f"R{read_group}"] = filepath_
+        if len(errors) == 1:
+            raise errors[0]
+        elif len(errors) > 1:
+            raise KeyError(errors)
 
     def __enlist_fasta_samples(self, dir: Path) -> None:
         """Function to enlist the fasta files found in the input directory.

diff --git a/tests/library_tests.py b/tests/library_tests.py
@@ -171,7 +171,8 @@ def setUpClass(cls) -> None:
             "fake_dir_juno/clean_fastq",
             "fake_dir_juno/de_novo_assembly_filtered",
             "fake_dir_juno/identify_species",
-            "fake_wrong_fastq_names",
+            "fake_1_in_fastqname",
+            "fake_multiple_library_samples",
         ]
 
         fake_files = [
@@ -198,8 +199,12 @@ def setUpClass(cls) -> None:
             "fake_dir_juno/clean_fastq/1234_R1.fastq.gz",
             "fake_dir_juno/clean_fastq/1234_R2.fastq.gz",
             "fake_dir_juno/de_novo_assembly_filtered/1234.fasta",
-            "fake_wrong_fastq_names/1234_S001_PE_R1.fastq.gz",
-            "fake_wrong_fastq_names/1234_S001_PE_R2.fastq.gz",
+            "fake_1_in_fastqname/1234_1_R1.fastq.gz",
+            "fake_1_in_fastqname/1234_1_R2.fastq.gz",
+            "fake_multiple_library_samples/sample5_S1_L001_R1.fastq.gz",
+            "fake_multiple_library_samples/sample5_S1_L001_R2.fastq.gz",
+            "fake_multiple_library_samples/sample5_S1_L002_R1.fastq.gz",
+            "fake_multiple_library_samples/sample5_S1_L002_R2.fastq.gz",
         ]
 
         for folder in fake_dirs:
@@ -231,7 +236,8 @@ def tearDownClass(cls) -> None:
             "fake_dir_juno/clean_fastq",
             "fake_dir_juno/de_novo_assembly_filtered",
             "fake_dir_juno/identify_species",
-            "fake_wrong_fastq_names",
+            "fake_1_in_fastqname",
+            "fake_multiple_library_samples",
         ]
 
         for folder in fake_dirs:
@@ -329,12 +335,14 @@ def test_excludefile(self) -> None:
         pipeline.setup()
         self.assertDictEqual(pipeline.sample_dict, expected_output)
 
-    def test_correctdir_fastq_with_L555_in_filename(self) -> None:
+    def test_correctdir_fastq_with_library_in_filename(self) -> None:
         """Testing the pipeline startup accepts fastq and fastq.gz files"""
 
         input_dir = Path("fake_dir_wsamples").resolve()
-        make_non_empty_file(input_dir.joinpath("12345_S182_L555_R1_001.fastq.gz"))
-        make_non_empty_file(input_dir.joinpath("12345_S182_L555_R2_001.fastq.gz"))
+        make_non_empty_file(input_dir.joinpath("sample3_S182_L555_R1_001.fastq.gz"))
+        make_non_empty_file(input_dir.joinpath("sample3_S182_L555_R2_001.fastq.gz"))
+        make_non_empty_file(input_dir.joinpath("sample4_S183_L001_R1_001.fastq.gz"))
+        make_non_empty_file(input_dir.joinpath("sample4_S183_L001_R2_001.fastq.gz"))
 
         expected_output = {
             "sample1": {
@@ -345,9 +353,13 @@ def test_correctdir_fastq_with_L555_in_filename(self) -> None:
                 "R1": str(input_dir.joinpath("sample2_R1_filt.fq")),
                 "R2": str(input_dir.joinpath("sample2_R2_filt.fq.gz")),
             },
-            "12345": {
-                "R1": str(input_dir.joinpath("12345_S182_L555_R1_001.fastq.gz")),
-                "R2": str(input_dir.joinpath("12345_S182_L555_R2_001.fastq.gz")),
+            "sample3": {
+                "R1": str(input_dir.joinpath("sample3_S182_L555_R1_001.fastq.gz")),
+                "R2": str(input_dir.joinpath("sample3_S182_L555_R2_001.fastq.gz")),
+            },
+            "sample4": {
+                "R1": str(input_dir.joinpath("sample4_S183_L001_R1_001.fastq.gz")),
+                "R2": str(input_dir.joinpath("sample4_S183_L001_R2_001.fastq.gz")),
             },
         }
         pipeline = Pipeline(
@@ -497,13 +509,23 @@ def test_junodir_wnumericsamplenames(self) -> None:
             pipeline.juno_metadata, expected_metadata, pipeline.juno_metadata
         )
 
-    def test_fail_with_wrong_fastq_naming(self) -> None:
+    def test_fail_with_1_in_fastqname(self) -> None:
         """Testing the pipeline startup fails with wrong fastq naming (name
         contains _1_ in the sample name)"""
         with self.assertRaises(KeyError):
             pipeline = Pipeline(
                 **default_args,
-                argv=["-i", "fake_wrong_fastq_names"],
+                argv=["-i", "fake_1_in_fastqname"],
+                input_type="fastq",
+            )
+            pipeline.setup()
+
+    def test_fail_with_multiple_libraries_per_sample(self) -> None:
+        """Testing the pipeline startup fails with wrong fastq naming (multiple libraries per sample)"""
+        with self.assertRaises(KeyError):
+            pipeline = Pipeline(
+                **default_args,
+                argv=["-i", "fake_multiple_library_samples"],
                 input_type="fastq",
             )
             pipeline.setup()
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,6 @@ build/
 
 # Conda
 conda
+
+# config files
+config/sample_sheet.yaml