diff --git a/.gitignore b/.gitignore index d192bd2..c6a3724 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ build/ # Conda conda + +# config files +config/sample_sheet.yaml \ No newline at end of file diff --git a/juno_library/juno_library.py b/juno_library/juno_library.py index e0d8e9d..65a227f 100644 --- a/juno_library/juno_library.py +++ b/juno_library/juno_library.py @@ -29,7 +29,7 @@ get_commit_git, get_repo_url, ) -from typing import Any, Optional, Dict, cast +from typing import Any, Optional, Dict, Tuple, cast import argparse @@ -398,17 +398,35 @@ def __enlist_fastq_samples(self, dir: Path) -> None: # because they get confused with the identifiers of forward and reverse # reads. pattern = re.compile( - r"(.*?)(?:_S\d+_|_S\d+.|_|\.)(?:_L555_)?(?:p)?R?(1|2)(?:_.*\.|\..*\.|\.)f(ast)?q(\.gz)?" + r"(.*?)(?:_S\d+_|_)(?:L\d{3}_)?(?:p)?R?(1|2)(?:_.*|\..*)?\.f(ast)?q(\.gz)?" ) + observed_combinations: Dict[Tuple[str, str], str] = {} + errors = [] for file_ in dir.iterdir(): + filepath_ = str(file_.resolve()) if validate_file_has_min_lines(file_, self.min_num_lines): if match := pattern.fullmatch(file_.name): sample_name = match.group(1) read_group = match.group(2) if sample_name in self.excluded_samples: continue + # check if sample_name and read_group combination is already seen before + # if this happens, it might be that the sample is spread over multiple sequencing lanes + if (sample_name, read_group) in observed_combinations: + observed_file = observed_combinations[sample_name, read_group] + errors.append( + KeyError( + f"Multiple fastq files ({observed_file} and {filepath_}) matching the same sample ({sample_name}) and read group ({read_group}). This pipeline expects only one fastq file per sample and read group." + ) + ) + else: + observed_combinations[(sample_name, read_group)] = filepath_ sample = self.sample_dict.setdefault(match.group(1), {}) - sample[f"R{read_group}"] = str(file_.resolve()) + sample[f"R{read_group}"] = filepath_ + if len(errors) == 1: + raise errors[0] + elif len(errors) > 1: + raise KeyError(errors) def __enlist_fasta_samples(self, dir: Path) -> None: """Function to enlist the fasta files found in the input directory. diff --git a/tests/library_tests.py b/tests/library_tests.py index 2a5bd0e..af61f4a 100644 --- a/tests/library_tests.py +++ b/tests/library_tests.py @@ -171,7 +171,8 @@ def setUpClass(cls) -> None: "fake_dir_juno/clean_fastq", "fake_dir_juno/de_novo_assembly_filtered", "fake_dir_juno/identify_species", - "fake_wrong_fastq_names", + "fake_1_in_fastqname", + "fake_multiple_library_samples", ] fake_files = [ @@ -198,8 +199,12 @@ def setUpClass(cls) -> None: "fake_dir_juno/clean_fastq/1234_R1.fastq.gz", "fake_dir_juno/clean_fastq/1234_R2.fastq.gz", "fake_dir_juno/de_novo_assembly_filtered/1234.fasta", - "fake_wrong_fastq_names/1234_S001_PE_R1.fastq.gz", - "fake_wrong_fastq_names/1234_S001_PE_R2.fastq.gz", + "fake_1_in_fastqname/1234_1_R1.fastq.gz", + "fake_1_in_fastqname/1234_1_R2.fastq.gz", + "fake_multiple_library_samples/sample5_S1_L001_R1.fastq.gz", + "fake_multiple_library_samples/sample5_S1_L001_R2.fastq.gz", + "fake_multiple_library_samples/sample5_S1_L002_R1.fastq.gz", + "fake_multiple_library_samples/sample5_S1_L002_R2.fastq.gz", ] for folder in fake_dirs: @@ -231,7 +236,8 @@ def tearDownClass(cls) -> None: "fake_dir_juno/clean_fastq", "fake_dir_juno/de_novo_assembly_filtered", "fake_dir_juno/identify_species", - "fake_wrong_fastq_names", + "fake_1_in_fastqname", + "fake_multiple_library_samples", ] for folder in fake_dirs: @@ -329,12 +335,14 @@ def test_excludefile(self) -> None: pipeline.setup() self.assertDictEqual(pipeline.sample_dict, expected_output) - def test_correctdir_fastq_with_L555_in_filename(self) -> None: + def test_correctdir_fastq_with_library_in_filename(self) -> None: """Testing the pipeline startup accepts fastq and fastq.gz files""" input_dir = Path("fake_dir_wsamples").resolve() - make_non_empty_file(input_dir.joinpath("12345_S182_L555_R1_001.fastq.gz")) - make_non_empty_file(input_dir.joinpath("12345_S182_L555_R2_001.fastq.gz")) + make_non_empty_file(input_dir.joinpath("sample3_S182_L555_R1_001.fastq.gz")) + make_non_empty_file(input_dir.joinpath("sample3_S182_L555_R2_001.fastq.gz")) + make_non_empty_file(input_dir.joinpath("sample4_S183_L001_R1_001.fastq.gz")) + make_non_empty_file(input_dir.joinpath("sample4_S183_L001_R2_001.fastq.gz")) expected_output = { "sample1": { @@ -345,9 +353,13 @@ def test_correctdir_fastq_with_L555_in_filename(self) -> None: "R1": str(input_dir.joinpath("sample2_R1_filt.fq")), "R2": str(input_dir.joinpath("sample2_R2_filt.fq.gz")), }, - "12345": { - "R1": str(input_dir.joinpath("12345_S182_L555_R1_001.fastq.gz")), - "R2": str(input_dir.joinpath("12345_S182_L555_R2_001.fastq.gz")), + "sample3": { + "R1": str(input_dir.joinpath("sample3_S182_L555_R1_001.fastq.gz")), + "R2": str(input_dir.joinpath("sample3_S182_L555_R2_001.fastq.gz")), + }, + "sample4": { + "R1": str(input_dir.joinpath("sample4_S183_L001_R1_001.fastq.gz")), + "R2": str(input_dir.joinpath("sample4_S183_L001_R2_001.fastq.gz")), }, } pipeline = Pipeline( @@ -497,13 +509,23 @@ def test_junodir_wnumericsamplenames(self) -> None: pipeline.juno_metadata, expected_metadata, pipeline.juno_metadata ) - def test_fail_with_wrong_fastq_naming(self) -> None: + def test_fail_with_1_in_fastqname(self) -> None: """Testing the pipeline startup fails with wrong fastq naming (name contains _1_ in the sample name)""" with self.assertRaises(KeyError): pipeline = Pipeline( **default_args, - argv=["-i", "fake_wrong_fastq_names"], + argv=["-i", "fake_1_in_fastqname"], + input_type="fastq", + ) + pipeline.setup() + + def test_fail_with_multiple_libraries_per_sample(self) -> None: + """Testing the pipeline startup fails with wrong fastq naming (multiple libraries per sample)""" + with self.assertRaises(KeyError): + pipeline = Pipeline( + **default_args, + argv=["-i", "fake_multiple_library_samples"], input_type="fastq", ) pipeline.setup()