diff --git a/htsinfer/get_library_type.py b/htsinfer/get_library_type.py index 4d4257c..37dad4f 100644 --- a/htsinfer/get_library_type.py +++ b/htsinfer/get_library_type.py @@ -113,7 +113,7 @@ def _evaluate_mate_relationship( ids_2: As `ids_1` but for the putative second mate file. """ self.results.relationship = StatesTypeRelationship.not_mates - if ids_1 == ids_2: + if ids_1 and ids_2 and ids_1 == ids_2: if ( self.results.file_1 == StatesType.first_mate and self.results.file_2 == StatesType.second_mate @@ -127,13 +127,23 @@ def _evaluate_mate_relationship( self.mapping.library_type.relationship = ( StatesTypeRelationship.split_mates ) - else: + elif ( + self.library_source.file_1.short_name is not None or + self.library_source.file_2.short_name is not None + ): + LOGGER.debug("Determining mate relationship by alignment...") self.mapping.library_type.relationship \ = StatesTypeRelationship.not_available self.mapping.library_source = self.library_source self.mapping.paths = self.path_1, self.path_2 self.mapping.evaluate() self._align_mates() + else: + self.results.relationship = StatesTypeRelationship.not_available + LOGGER.debug( + "Sequence IDs and library source are not determined, " + "mate relationship cannot be inferred." + ) def _align_mates(self): """Decide mate relationship by alignment.""" @@ -144,19 +154,24 @@ def _align_mates(self): samfile1 = pysam.AlignmentFile(str(alignment_1), 'r') samfile2 = pysam.AlignmentFile(str(alignment_2), 'r') + seq_id1 = None + seq_id2 = None previous_seq_id1 = None previous_seq_id2 = None reads1 = [] # List to store alignments for one read from file1 - mate1 = [] # List to store alignments for each read + mate1 = [] # List to store alignments for each read from file1 reads2 = [] # List to store alignments for one read from file2 + mate2 = [] # List to store alignments for each read from file2 concordant = 0 for read1 in samfile1: seq_id1 = read1.query_name - if seq_id1 != previous_seq_id1 \ - and previous_seq_id1 is not None: + if ( + seq_id1 != previous_seq_id1 and + previous_seq_id1 is not None + ): mate1.append(reads1.copy()) reads1.clear() if read1.reference_end: @@ -167,35 +182,63 @@ def _align_mates(self): read_counter = 0 for read2 in samfile2: seq_id2 = read2.query_name - if seq_id2 != previous_seq_id2 \ - and previous_seq_id2 is not None: - if self._compare_alignments(mate1[read_counter], reads2): + if ( + seq_id2 != previous_seq_id2 and + previous_seq_id2 is not None + ): + mate2.append(reads2.copy()) + if self._compare_alignments( + mate1[read_counter], reads2 + ): concordant += 1 reads2.clear() read_counter += 1 if read2.reference_end: reads2.append(read2) previous_seq_id2 = seq_id2 - - if self._compare_alignments(mate1[read_counter], reads2): + mate2.append(reads2.copy()) + if self._compare_alignments( + mate1[read_counter], reads2 + ): concordant += 1 - if (concordant / read_counter) >= self.cutoff: - self.results.relationship = ( - StatesTypeRelationship.split_mates - ) - self.mapping.library_type.relationship \ - = StatesTypeRelationship.split_mates - self.mapping.mapped = False - self.mapping.star_dirs = [] - else: - self.results.relationship = ( - StatesTypeRelationship.not_mates - ) + aligned_mate1 = len(list(filter(None, mate1))) + aligned_mate2 = len(list(filter(None, mate2))) + + LOGGER.debug(f"Number of aligned reads file 1: {aligned_mate1}") + LOGGER.debug(f"Number of aligned reads file 2: {aligned_mate2}") + LOGGER.debug(f"Number of concordant reads: {concordant}") + + self._update_relationship( + concordant, min(aligned_mate1, aligned_mate2) + ) samfile1.close() samfile2.close() + def _update_relationship(self, concordant, aligned_reads): + """Helper function to update relationship based on alignment.""" + try: + ratio = concordant / aligned_reads + except ZeroDivisionError: + self.results.relationship = ( + StatesTypeRelationship.not_available + ) + else: + if ratio >= self.cutoff: + self.results.relationship = ( + StatesTypeRelationship.split_mates + ) + self.mapping.library_type.relationship = ( + StatesTypeRelationship.split_mates + ) + self.mapping.mapped = False + self.mapping.star_dirs = [] + else: + self.results.relationship = ( + StatesTypeRelationship.not_mates + ) + class AlignedSegment: """Placeholder class for mypy "Missing attribute" error in _compare_alignments(), the actual object used @@ -302,44 +345,47 @@ def evaluate(self) -> None: self.result = StatesType.not_available raise FileProblem(f"File is empty: {self.path}") from exc - if self.seq_id_format is None: + if self.seq_id_format is not None: + LOGGER.debug( + "Sequence identifier format: " + f"{self.seq_id_format.name}" + ) + else: self.result = StatesType.not_available - raise MetadataWarning( + LOGGER.debug( "Could not determine sequence identifier format." ) - LOGGER.debug( - f"Sequence identifier format: {self.seq_id_format.name}" - ) # Ensure that remaining records are compatible with sequence # identifier format and library type determined from first # record - LOGGER.debug( - "Checking consistency of remaining reads with initially " - "determined identifier format and library type..." - ) - for record in seq_iter: - records += 1 - try: - self._get_read_type( - seq_id=record[0], - regex=self.seq_id_format.value, - ) - except ( - InconsistentFastqIdentifiers, - UnknownFastqIdentifier, - ) as exc: - self.result = StatesType.not_available - raise MetadataWarning( - f"{type(exc).__name__}: {str(exc)}" - ) from exc + if self.seq_id_format is not None: + LOGGER.debug( + "Checking consistency of remaining reads with " + "initially determined identifier format " + "and library type..." + ) + for record in seq_iter: + records += 1 + try: + self._get_read_type( + seq_id=record[0], + regex=self.seq_id_format.value, + ) + except ( + InconsistentFastqIdentifiers, + UnknownFastqIdentifier, + ) as exc: + self.result = StatesType.not_available + raise MetadataWarning( + f"{type(exc).__name__}: {str(exc)}" + ) from exc + LOGGER.debug(f"Total records processed: {records}") except (OSError, ValueError) as exc: self.result = StatesType.not_available raise FileProblem(f"{type(exc).__name__}: {str(exc)}") from exc - LOGGER.debug(f"Total records processed: {records}") - def _get_read_type( self, seq_id: str, diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index d76b092..4b36e8b 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -75,7 +75,11 @@ def evaluate(self) -> ResultsOrientation: self.mapping.transcripts_file = self.transcripts_file self.mapping.tmp_dir = self.tmp_dir - if not self.mapping.mapped: + if not self.mapping.mapped and ( + self.library_source.file_1.short_name is not None or + self.library_source.file_2.short_name is not None + ): + LOGGER.debug("Determining read relationship by alignment...") self.mapping.evaluate() return self.process_alignments(star_dirs=self.mapping.star_dirs) diff --git a/htsinfer/mapping.py b/htsinfer/mapping.py index cb53f7a..52780f3 100644 --- a/htsinfer/mapping.py +++ b/htsinfer/mapping.py @@ -51,11 +51,7 @@ def __init__( self.star_dirs: List[Path] = [] def evaluate(self): - """Infer read orientation. - - Returns: - Orientation results object. - """ + """Align FASTQ files to reference transcripts with STAR.""" # get transcripts for current organims transcripts = self.subset_transcripts_by_organism() @@ -270,6 +266,10 @@ def prepare_star_alignment_commands( ) -> Dict[Path, List[str]]: """Prepare STAR alignment commands. + Input FASTQ files are assumed to be sorted according to reference names + or coordinates, the order of input reads is kept with the option + "PairedKeepInputOrder", no additional sorting of aligned reads is done. + Args: index_dir: Path to directory containing STAR index. @@ -299,6 +299,7 @@ def build_star_command( "--runThreadN", f"{str(self.threads_star)}", "--genomeDir", f"{str(index_dir)}", "--outFilterMultimapNmax", "50", + "--outSAMorder", "PairedKeepInputOrder", "--outSAMunmapped", "Within", "KeepPairs", ] cmd: List[str] = cmd_base[:] diff --git a/tests/test_get_library_type.py b/tests/test_get_library_type.py index dd52dfa..e1a0dce 100644 --- a/tests/test_get_library_type.py +++ b/tests/test_get_library_type.py @@ -13,7 +13,9 @@ GetFastqType, ) from htsinfer.models import ( + ResultsSource, ResultsType, + Source, SeqIdFormats, StatesType, StatesTypeRelationship, @@ -27,11 +29,11 @@ FILE_INCONSISTENT_IDS_SINGLE_OLD_NEW, FILE_MATE_1, FILE_MATE_2, + FILE_UNKNOWN_SEQ_ID, FILE_IDS_NOT_MATCH_1, FILE_IDS_NOT_MATCH_2, FILE_TRANSCRIPTS, FILE_SINGLE, - FILE_UNKNOWN_SEQ_ID, RaiseError, SEQ_ID_DUMMY, SEQ_ID_MATE_1, @@ -112,6 +114,35 @@ def test_evaluate_mate_relationship_split_mates(self): ) def test_evaluate_mate_relationship_not_mates(self, tmpdir): + """Test mate relationship evaluation logic with input files that are + mates, but the relationship is not enough to trigger split_mates. + """ + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_MATE_1, FILE_MATE_2) + MAPPING.transcripts_file = FILE_TRANSCRIPTS + MAPPING.tmp_dir = tmpdir + + test_instance = GetLibType(config=CONFIG, mapping=MAPPING) + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available + + # Set the cutoff such that it's not enough to trigger split_mates + test_instance.cutoff = 300 + + # Call the _evaluate_mate_relationship method + test_instance._evaluate_mate_relationship( + ids_1=["A", "B", "C"], ids_2=["A", "B", "C"] + ) + + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_mates + ) + + def test_evaluate_mate_relationship_not_available(self, tmpdir): """Test mate relationship evaluation logic with input files that are not mates from a paired-end library. """ @@ -119,18 +150,78 @@ def test_evaluate_mate_relationship_not_mates(self, tmpdir): CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS CONFIG.args.tmp_dir = tmpdir + CONFIG.results.library_source = ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606), + ) MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) MAPPING.transcripts_file = FILE_TRANSCRIPTS MAPPING.tmp_dir = tmpdir test_instance = GetLibType(config=CONFIG, mapping=MAPPING) - test_instance.results.file_1 = StatesType.first_mate - test_instance.results.file_2 = StatesType.second_mate + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available test_instance.evaluate() + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_available + ) + + def test_update_relationship_not_mates(self, tmpdir): + """Test update_relationship logic.""" + CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_MATE_2) + MAPPING.transcripts_file = FILE_TRANSCRIPTS + MAPPING.tmp_dir = tmpdir + + test_instance = GetLibType(config=CONFIG, mapping=MAPPING) + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available + + # Simulate a scenario where ratio is below the cutoff + concordant = 0 + read_counter = 20 + + # Call the _update_relationship method + test_instance._update_relationship(concordant, read_counter) + assert ( test_instance.results.relationship == StatesTypeRelationship.not_mates ) + assert ( + test_instance.mapping.library_type.relationship == + StatesTypeRelationship.not_available + ) + + def test_evaluate_mate_relationship_not_determined(self, tmpdir): + """Test mate relationship evaluation logic when + library source is not determined. + """ + CONFIG.args.path_1_processed = FILE_MATE_1 + CONFIG.args.path_2_processed = FILE_MATE_2 + CONFIG.args.t_file_processed = FILE_TRANSCRIPTS + CONFIG.args.tmp_dir = tmpdir + CONFIG.results.library_source = ResultsSource( + file_1=Source(), + file_2=Source(), + ) + test_instance = GetLibType(config=CONFIG, mapping=MAPPING) + test_instance.results.file_1 = StatesType.not_available + test_instance.results.file_2 = StatesType.not_available + + # Call the _evaluate_mate_relationship method + test_instance._evaluate_mate_relationship( + ids_1=["A", "B", "C"], ids_2=["D", "E", "F"] + ) + + assert ( + test_instance.results.relationship == + StatesTypeRelationship.not_available + ) def test_evaluate_split_mates_not_matching_ids(self, tmpdir): """Test mate relationship evaluation logic with input files that are @@ -138,6 +229,10 @@ def test_evaluate_split_mates_not_matching_ids(self, tmpdir): """ CONFIG.args.path_1_processed = FILE_IDS_NOT_MATCH_1 CONFIG.args.path_2_processed = FILE_IDS_NOT_MATCH_2 + CONFIG.results.library_source = ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606), + ) CONFIG.args.tmp_dir = tmpdir MAPPING.paths = (FILE_IDS_NOT_MATCH_1, FILE_IDS_NOT_MATCH_2) MAPPING.tmp_dir = tmpdir @@ -175,12 +270,6 @@ def test_evaluate_single(self): test_instance.evaluate() assert test_instance.result == StatesType.single - def test_evaluate_unknown_seq_id(self): - """Evaluate file with identifiers of an unknown format.""" - test_instance = GetFastqType(path=FILE_UNKNOWN_SEQ_ID) - with pytest.raises(MetadataWarning): - test_instance.evaluate() - def test_evaluate_inconsistent_identifiers_single_mate(self): """Raise ``MetadataWarning`` by passing a file with inconsistent identifiers, suggesting a single-end library first, then a paired-end @@ -272,6 +361,12 @@ def test_get_read_type_no_match(self): regex=SeqIdFormats['Casava >=1.8'].value, ) + def test_evaluate_unknown_identifier_format(self): + """Test scenario where seq_id format cannot be determined.""" + test_instance = GetFastqType(path=FILE_UNKNOWN_SEQ_ID) + test_instance.evaluate() + assert test_instance.result == StatesType.not_available + def test_get_read_type_single_pass(self): """Read identifier is consistent with previous state.""" test_instance = GetFastqType(path=FILE_DUMMY) diff --git a/tests/test_get_read_orientation.py b/tests/test_get_read_orientation.py index 1089433..b73b8e2 100644 --- a/tests/test_get_read_orientation.py +++ b/tests/test_get_read_orientation.py @@ -73,6 +73,10 @@ def test_init_all(self, tmpdir): CONFIG.args.path_2_processed = FILE_MATE_2 CONFIG.args.t_file_processed = FILE_TRANSCRIPTS CONFIG.args.tmp_dir = tmpdir + CONFIG.results.library_source = ResultsSource( + file_1=Source(), + file_2=Source() + ) test_instance = GetOrientation(config=CONFIG, mapping=MAPPING) assert test_instance.paths[0] == FILE_MATE_1 @@ -154,7 +158,10 @@ def test_evaluate_paired_unmapped(self, tmpdir): CONFIG.args.path_1_processed = FILE_UNMAPPED_PAIRED_1 CONFIG.args.path_2_processed = FILE_UNMAPPED_PAIRED_2 CONFIG.args.tmp_dir = tmpdir - CONFIG.results.library_source = ResultsSource() + CONFIG.results.library_source = ResultsSource( + file_1=Source(short_name="hsapiens", taxon_id=9606), + file_2=Source(short_name="hsapiens", taxon_id=9606) + ) CONFIG.results.library_type = ResultsType( relationship=StatesTypeRelationship.split_mates, ) @@ -259,7 +266,7 @@ def test_evaluate_paired_not_mates_unmapped(self, tmpdir): ) CONFIG.results.library_source = ResultsSource( file_1=Source(), - file_2=Source(), + file_2=Source(short_name="hsapiens", taxon_id=9606), ) CONFIG.args.tmp_dir = tmpdir MAPPING.mapped = False diff --git a/tests/test_mapping.py b/tests/test_mapping.py index 33798be..1bdecbb 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -187,7 +187,8 @@ def test_prepare_star_alignment_commands(self, tmpdir): file1_alignment_path = tmpdir / 'alignments/file_1' cmd = "STAR --alignIntronMax 1 --alignEndsType Local --runThreadN 1" \ + " --genomeDir " + str(index_dir) + " --outFilterMultimapNmax " \ - + "50 --outSAMunmapped Within KeepPairs --readFilesIn " \ + + "50 --outSAMorder PairedKeepInputOrder " \ + + "--outSAMunmapped Within KeepPairs --readFilesIn " \ + str(FILE_2000_RECORDS) + " --outFileNamePrefix " \ + str(file1_alignment_path) + "/" results = test_instance.prepare_star_alignment_commands(