Skip to content

Commit

Permalink
Merge pull request #19 from RIVM-bioinformatics/parse_libraries
Browse files Browse the repository at this point in the history
Parse libraries
  • Loading branch information
boasvdp authored Feb 9, 2024
2 parents b681346 + 6c10ef4 commit 72da214
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ build/

# Conda
conda

# config files
config/sample_sheet.yaml
24 changes: 21 additions & 3 deletions juno_library/juno_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
get_commit_git,
get_repo_url,
)
from typing import Any, Optional, Dict, cast
from typing import Any, Optional, Dict, Tuple, cast
import argparse


Expand Down Expand Up @@ -398,17 +398,35 @@ def __enlist_fastq_samples(self, dir: Path) -> None:
# because they get confused with the identifiers of forward and reverse
# reads.
pattern = re.compile(
r"(.*?)(?:_S\d+_|_S\d+.|_|\.)(?:_L555_)?(?:p)?R?(1|2)(?:_.*\.|\..*\.|\.)f(ast)?q(\.gz)?"
r"(.*?)(?:_S\d+_|_)(?:L\d{3}_)?(?:p)?R?(1|2)(?:_.*|\..*)?\.f(ast)?q(\.gz)?"
)
observed_combinations: Dict[Tuple[str, str], str] = {}
errors = []
for file_ in dir.iterdir():
filepath_ = str(file_.resolve())
if validate_file_has_min_lines(file_, self.min_num_lines):
if match := pattern.fullmatch(file_.name):
sample_name = match.group(1)
read_group = match.group(2)
if sample_name in self.excluded_samples:
continue
# Check whether this (sample_name, read_group) combination has been seen before.
# If it has, the sample may be spread over multiple sequencing lanes.
if (sample_name, read_group) in observed_combinations:
observed_file = observed_combinations[sample_name, read_group]
errors.append(
KeyError(
f"Multiple fastq files ({observed_file} and {filepath_}) matching the same sample ({sample_name}) and read group ({read_group}). This pipeline expects only one fastq file per sample and read group."
)
)
else:
observed_combinations[(sample_name, read_group)] = filepath_
sample = self.sample_dict.setdefault(match.group(1), {})
sample[f"R{read_group}"] = str(file_.resolve())
sample[f"R{read_group}"] = filepath_
if len(errors) == 1:
raise errors[0]
elif len(errors) > 1:
raise KeyError(errors)

def __enlist_fasta_samples(self, dir: Path) -> None:
"""Function to enlist the fasta files found in the input directory.
Expand Down
46 changes: 34 additions & 12 deletions tests/library_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def setUpClass(cls) -> None:
"fake_dir_juno/clean_fastq",
"fake_dir_juno/de_novo_assembly_filtered",
"fake_dir_juno/identify_species",
"fake_wrong_fastq_names",
"fake_1_in_fastqname",
"fake_multiple_library_samples",
]

fake_files = [
Expand All @@ -198,8 +199,12 @@ def setUpClass(cls) -> None:
"fake_dir_juno/clean_fastq/1234_R1.fastq.gz",
"fake_dir_juno/clean_fastq/1234_R2.fastq.gz",
"fake_dir_juno/de_novo_assembly_filtered/1234.fasta",
"fake_wrong_fastq_names/1234_S001_PE_R1.fastq.gz",
"fake_wrong_fastq_names/1234_S001_PE_R2.fastq.gz",
"fake_1_in_fastqname/1234_1_R1.fastq.gz",
"fake_1_in_fastqname/1234_1_R2.fastq.gz",
"fake_multiple_library_samples/sample5_S1_L001_R1.fastq.gz",
"fake_multiple_library_samples/sample5_S1_L001_R2.fastq.gz",
"fake_multiple_library_samples/sample5_S1_L002_R1.fastq.gz",
"fake_multiple_library_samples/sample5_S1_L002_R2.fastq.gz",
]

for folder in fake_dirs:
Expand Down Expand Up @@ -231,7 +236,8 @@ def tearDownClass(cls) -> None:
"fake_dir_juno/clean_fastq",
"fake_dir_juno/de_novo_assembly_filtered",
"fake_dir_juno/identify_species",
"fake_wrong_fastq_names",
"fake_1_in_fastqname",
"fake_multiple_library_samples",
]

for folder in fake_dirs:
Expand Down Expand Up @@ -329,12 +335,14 @@ def test_excludefile(self) -> None:
pipeline.setup()
self.assertDictEqual(pipeline.sample_dict, expected_output)

def test_correctdir_fastq_with_L555_in_filename(self) -> None:
def test_correctdir_fastq_with_library_in_filename(self) -> None:
"""Testing the pipeline startup accepts fastq files whose names contain a library identifier (e.g. _L001_)"""

input_dir = Path("fake_dir_wsamples").resolve()
make_non_empty_file(input_dir.joinpath("12345_S182_L555_R1_001.fastq.gz"))
make_non_empty_file(input_dir.joinpath("12345_S182_L555_R2_001.fastq.gz"))
make_non_empty_file(input_dir.joinpath("sample3_S182_L555_R1_001.fastq.gz"))
make_non_empty_file(input_dir.joinpath("sample3_S182_L555_R2_001.fastq.gz"))
make_non_empty_file(input_dir.joinpath("sample4_S183_L001_R1_001.fastq.gz"))
make_non_empty_file(input_dir.joinpath("sample4_S183_L001_R2_001.fastq.gz"))

expected_output = {
"sample1": {
Expand All @@ -345,9 +353,13 @@ def test_correctdir_fastq_with_L555_in_filename(self) -> None:
"R1": str(input_dir.joinpath("sample2_R1_filt.fq")),
"R2": str(input_dir.joinpath("sample2_R2_filt.fq.gz")),
},
"12345": {
"R1": str(input_dir.joinpath("12345_S182_L555_R1_001.fastq.gz")),
"R2": str(input_dir.joinpath("12345_S182_L555_R2_001.fastq.gz")),
"sample3": {
"R1": str(input_dir.joinpath("sample3_S182_L555_R1_001.fastq.gz")),
"R2": str(input_dir.joinpath("sample3_S182_L555_R2_001.fastq.gz")),
},
"sample4": {
"R1": str(input_dir.joinpath("sample4_S183_L001_R1_001.fastq.gz")),
"R2": str(input_dir.joinpath("sample4_S183_L001_R2_001.fastq.gz")),
},
}
pipeline = Pipeline(
Expand Down Expand Up @@ -497,13 +509,23 @@ def test_junodir_wnumericsamplenames(self) -> None:
pipeline.juno_metadata, expected_metadata, pipeline.juno_metadata
)

def test_fail_with_wrong_fastq_naming(self) -> None:
def test_fail_with_1_in_fastqname(self) -> None:
"""Testing the pipeline startup fails with wrong fastq naming (name
contains _1_ in the sample name)"""
with self.assertRaises(KeyError):
pipeline = Pipeline(
**default_args,
argv=["-i", "fake_wrong_fastq_names"],
argv=["-i", "fake_1_in_fastqname"],
input_type="fastq",
)
pipeline.setup()

def test_fail_with_multiple_libraries_per_sample(self) -> None:
    """Pipeline startup must raise a KeyError when the input directory holds
    fastq files from multiple libraries for the same sample"""
    cli_arguments = ["-i", "fake_multiple_library_samples"]
    with self.assertRaises(KeyError):
        # Construction and setup both stay inside the context manager so a
        # KeyError from either step satisfies the assertion.
        Pipeline(
            **default_args,
            argv=cli_arguments,
            input_type="fastq",
        ).setup()
Expand Down

0 comments on commit 72da214

Please sign in to comment.