Skip to content

Commit

Permalink
feature: adding converters and test data for AA translation (#87)
Browse files Browse the repository at this point in the history
* implement process/normalize_reads()

* WIP: bam_to_cleartext_alignment

* POC bam_to_cleartext_alignment()

* WIP insertions as JSON

* POC full alignment with padding and small data

* add note

* correct readme and cleanup of index files

* reformat

* refactor resources

* [refactor]: `silo` module (#80)

* refactor: move lapis script to module

* refactor: move wrangling to silo module

* refactor: move wrangling to silo module

* add docs

* Update src/sr2silo/silo/wrangle.py

Co-authored-by: Copilot <[email protected]>

* typos - GitHub Copilot

Co-authored-by: Copilot <[email protected]>

* refactor: move the URLs for lapis to env variables

* bug: auth should not be fetched when in CI

* silo

* silo

* OO in silo

* works

* WORKS

* CI issue

---------

Co-authored-by: Gordon J. Köhn <[email protected]>
Co-authored-by: Copilot <[email protected]>

Getting all Changes from the Nextclade Try on main for try with BlastX

* Apply suggestions from code review

Co-authored-by: Copilot <[email protected]>

* Update tests/data/bam/README.md

Co-authored-by: Copilot <[email protected]>

* reformat

* missing fixture tag

---------

Co-authored-by: Gordon J. Köhn <[email protected]>
Co-authored-by: Copilot <[email protected]>
  • Loading branch information
3 people authored Jan 30, 2025
1 parent 359df45 commit 27bd16e
Show file tree
Hide file tree
Showing 16 changed files with 16,287 additions and 32 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
<picture>
<source
media="(prefers-color-scheme: light)"
srcset="resources/logo.svg">
srcset="resources/graphics/logo.svg">
<source
media="(prefers-color-scheme: dark)"
srcset="resources/logo_dark_mode.svg">
srcset="resources/graphics/logo_dark_mode.svg">
<img alt="Logo" src="resources/logo.svg" width="15%" />
</picture>

Expand Down
File renamed without changes
File renamed without changes
429 changes: 429 additions & 0 deletions resources/sars-cov-2/NC_045512.2.fasta

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions resources/sars-cov-2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This folder contains data for the alignment of SARS-CoV-2.

The `NC_045512.2.fasta` is the one used by V-Pipe.

The `reference_genomes.json` is the one used by NextClade.
File renamed without changes.
9 changes: 7 additions & 2 deletions src/sr2silo/process/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@

from __future__ import annotations

from sr2silo.process.convert import bam_to_sam
from sr2silo.process.convert import bam_to_cleartext_alignment, bam_to_sam
from sr2silo.process.merge import pair_normalize_reads
from sr2silo.process.translation_aligment import translate

__all__ = ["bam_to_sam", "pair_normalize_reads", "translate"]
# Public API re-exported at the package level.
__all__ = [
    "bam_to_sam",
    "pair_normalize_reads",
    "translate",
    "bam_to_cleartext_alignment",
]
225 changes: 225 additions & 0 deletions src/sr2silo/process/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from __future__ import annotations

import json
import logging
import re
import tempfile
from pathlib import Path

Expand All @@ -27,3 +29,226 @@ def bam_to_sam(bam_file: Path) -> str:
temp_sam.seek(0)
temp_sam_content = temp_sam.read().decode()
return temp_sam_content


def parse_cigar(cigar: str) -> list[tuple[str, int]]:
    """Parse a CIGAR string into a list of (operation, length) tuples.

    Args:
        cigar: The CIGAR string of a SAM alignment record.

    Returns:
        A list of tuples where the first element is the operation
        type (one of "MIDNSHP=X") and the second is the length of
        the operation.

    Credits: adapted from David Gicev @davidgicev
    """
    pattern = re.compile(r"(\d+)([MIDNSHP=X])")

    parsed_cigar = pattern.findall(cigar)

    return [(op, int(length)) for length, op in parsed_cigar]


def normalize_reads(sam_data: str, output_fasta: Path, output_insertions: Path) -> None:
    """Normalize all reads in a SAM file to cleartext sequences using the CIGAR.

    Writes one FASTA record per read (header ">{qname}|{pos}") to
    ``output_fasta`` and one tab-separated "qname<TAB>insertions" line per
    read to ``output_insertions``.

    Note that the input SAM data must be available in its entirety before
    calling this function, whilst the output files are written
    incrementally.

    Args:
        sam_data: A string containing SAM formatted data.
        output_fasta: Path of the FASTA file to write.
        output_insertions: Path of the insertions file to write.

    Returns:
        None; results are written to the two output files.

    Credits: adapted from David Gicev @davidgicev
    """
    # Lazy %-formatting: the original message used "{output_insertions}"
    # without an f-prefix, so the placeholder was logged literally.
    logging.warning(
        "normalize_reads: nucleotide insertions are not yet fully supported; "
        "%s may be incomplete.",
        output_insertions,
    )

    with output_fasta.open("w") as fasta_file, output_insertions.open(
        "w"
    ) as insertions_file:
        for line in sam_data.splitlines():
            if line.startswith("@"):  # skip SAM header lines
                continue

            fields = line.strip().split("\t")

            qname = fields[0]  # Query template NAME
            pos = int(fields[3])  # 1-based leftmost mapping position
            cigar = parse_cigar(fields[5])  # CIGAR string
            seq = fields[9]  # segment sequence
            qual = fields[10]  # ASCII of Phred-scaled base quality + 33

            result_sequence = ""
            # NOTE(review): result_qual is built but never written anywhere;
            # kept for a future quality-aware output format.
            result_qual = ""
            index = 0  # current offset into seq/qual
            inserts = []

            for ops_type, count in cigar:
                if ops_type == "S":
                    # Soft clip: bases present in seq but not aligned.
                    index += count
                elif ops_type == "M":
                    result_sequence += seq[index : index + count]
                    result_qual += qual[index : index + count]
                    index += count
                elif ops_type == "D":
                    # Deletion: pad with gaps and the lowest Phred symbol.
                    result_sequence += "-" * count
                    result_qual += "!" * count
                elif ops_type == "I":
                    # Record the insertion; it is not part of the cleartext.
                    inserts.append((index + pos, seq[index : index + count]))
                    index += count

            fasta_file.write(f">{qname}|{pos}\n{result_sequence}\n")
            insertions_file.write(f"{qname}\t{inserts}\n")

    # The original version kept an always-empty "unpaired" dict and a dead
    # flush loop over it; both removed as they could never execute.


def bam_to_cleartext_alignment(
    bam_path: Path, output_fp: Path, reference: Path
) -> None:
    """Convert BAM to cleartext alignment and write it as NDJSON.

    One JSON object per read with elements:
    - read_id: read identifier
    - query_seq_aligned: the aligned sequence, padded with gaps ('-') to
      the reference length (insertions are kept inline)
    - inserts: list of insertions with elements
        - pos: position of the insertion
        - ins: insertion sequence, as a list of single characters

    Args:
        bam_path: Path to the input BAM file.
        output_fp: Path of the NDJSON output file to write.
        reference: Path to the reference FASTA (used for padding length).
    """

    # Get the reference length by summing all non-header FASTA lines.
    logging.info(reference)
    reference_length = 0
    with Path(reference).open() as ref_f:
        for line in ref_f:
            if line.startswith(">"):
                continue
            reference_length += len(line.strip())

    logging.info("Reference length: %d", reference_length)

    # Ensure the BAM file is indexed. pysam.index creates "<file>.bam.bai"
    # (see the cleanup below), so test for that name — with_suffix(".bai")
    # would check "<file>.bai" and always re-index.
    bam_path_str = str(bam_path)
    sorted_bam_path_str = None
    if not Path(bam_path_str + ".bai").exists():
        try:
            pysam.index(bam_path_str)
        except pysam.SamtoolsError as e:  # type: ignore
            # Indexing an unsorted BAM fails; sort to a sibling file,
            # index that, and read from it instead.
            logging.error(f"Error indexing BAM file: {e}")
            sorted_bam_path_str = bam_path_str.replace(".bam", ".sorted.bam")
            pysam.sort("-o", sorted_bam_path_str, bam_path_str)
            pysam.index(sorted_bam_path_str)
            bam_path_str = sorted_bam_path_str

    with Path(output_fp).open("w") as out_f:
        # Open the BAM file
        with pysam.AlignmentFile(bam_path_str, "rb") as samfile:
            for read in samfile.fetch():
                aligned = []  # To store the aligned sequence with gaps
                insertions = []  # To store the insertions
                seq_pos = 0  # current offset into the query sequence

                # Validate cigartuples, query_sequence, and reference_name.
                if (
                    not read.cigartuples
                    or not read.query_sequence
                    or not read.reference_name
                ):
                    logging.warning(
                        f"Skipping read {read.query_name} due to missing data"
                    )
                    continue

                for operation, length in read.cigartuples:
                    if operation == 0:  # Match (M)
                        aligned.append(read.query_sequence[seq_pos : seq_pos + length])
                        seq_pos += length
                    elif operation == 1:  # Insertion (I)
                        # Record the insertion and keep it inline in the
                        # aligned sequence as well.
                        insertions.append(
                            (seq_pos, read.query_sequence[seq_pos : seq_pos + length])
                        )
                        aligned.append(
                            read.query_sequence[seq_pos : seq_pos + length]
                        )
                        seq_pos += length
                    elif operation == 2:  # Deletion (D)
                        aligned.append("-" * length)  # gaps for deletions
                    elif operation == 4:  # Soft clip (S)
                        seq_pos += length
                    elif operation == 5:  # Hard clip (H)
                        pass  # Hard-clipped bases are absent from the query

                # Combine the aligned sequence
                aligned_str = "".join(aligned)

                # Pad to the reference length on both sides.
                # NOTE(review): len(aligned_str) includes inline insertions,
                # so reads with insertions get shorter right padding —
                # confirm this is the intended convention.
                left_padding = "-" * read.reference_start
                right_padding = "-" * (
                    reference_length - len(aligned_str) - read.reference_start
                )
                padded_alignment = left_padding + aligned_str + right_padding

                # Create a JSON object for the read
                read_json = {
                    "read_id": read.query_name,
                    "query_seq_aligned": padded_alignment,
                    "inserts": [
                        {"pos": pos + read.reference_start, "ins": list(seq)}
                        for pos, seq in insertions
                    ],
                }

                # Serialize with json.dumps so the output is valid NDJSON;
                # writing the dict's repr() emits single quotes, which JSON
                # parsers reject.
                out_f.write(json.dumps(read_json) + "\n")

    # Cleanup generated index / sorted files.
    if sorted_bam_path_str:
        Path(sorted_bam_path_str).unlink(missing_ok=True)
        Path(sorted_bam_path_str + ".bai").unlink(missing_ok=True)
    else:
        Path(bam_path_str + ".bai").unlink(missing_ok=True)
20 changes: 1 addition & 19 deletions src/sr2silo/process/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,9 @@
from __future__ import annotations

import logging
import re
from pathlib import Path


def parse_cigar(cigar: str) -> list[tuple[str, int]]:
"""
Parse a cigar string into a list of tuples.
Args:
cigar: A string representing the cigar string.
Returns:
A list of tuples where the first element is the operation
type and the second is the length of the operation.
Credits: adapted from David Gicev @davidgicev
"""
pattern = re.compile(r"(\d+)([MIDNSHP=X])")

parsed_cigar = pattern.findall(cigar)

return [(op, int(length)) for length, op in parsed_cigar]
from sr2silo.process.convert import parse_cigar


def pair_normalize_reads(
Expand Down
50 changes: 50 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,39 @@ def unit_test_mocks(monkeypatch: None):

# Define test data paths
TEST_DATA_DIR = Path(__file__).parent / "data"

INPUT_BAM_PATH = TEST_DATA_DIR / "REF_aln_trim_subsample.bam"
EXPECTED_SAM_PATH = TEST_DATA_DIR / "REF_aln_trim_subsample_expected.sam"


LARGE_TEST_DATA_DIR = (
TEST_DATA_DIR
/ "samples_large"
/ "A1_05_2024_10_08/20241024_2411515907"
/ "alignments"
)
INPUT_BAM_INSERTIONS_PATH = LARGE_TEST_DATA_DIR / "REF_aln_trim.bam"
EXPECTED_BAM_INSERTIONS_PATH_inserts = (
LARGE_TEST_DATA_DIR / "REF_aln_trim_subsample_insertions.fasta"
)
EXPECTED_BAM_INSERTIONS_PATH_cleartext = (
LARGE_TEST_DATA_DIR / "REF_aln_trim_subsample.fasta"
)


"""Returns a sample BAM data path and its corresponding SAM data as a string."""


@pytest.fixture
def bam_data() -> dict:
"""Return a sample BAM data path and its corresponding SAM data as a string."""
dict_data = dict()
# read in these files as test
dict_data["bam_path"] = INPUT_BAM_PATH
with open(EXPECTED_SAM_PATH) as f:
dict_data["sam_data"] = f.read()

return dict_data


@pytest.fixture
Expand All @@ -43,6 +75,24 @@ def sam_data():
return bam_to_sam(INPUT_BAM_PATH)


@pytest.fixture
def sam_with_insert_data():
    """Return a sample SAM data as a string with insertions.

    Keys: "bam_data_fp" (input BAM path), "sam_data" (converted SAM text),
    "insertions" (expected insertions file content), and "cleartext"
    (expected cleartext FASTA content).
    """
    return {
        "bam_data_fp": INPUT_BAM_INSERTIONS_PATH,
        "sam_data": bam_to_sam(INPUT_BAM_INSERTIONS_PATH),
        "insertions": EXPECTED_BAM_INSERTIONS_PATH_inserts.read_text(),
        "cleartext": EXPECTED_BAM_INSERTIONS_PATH_cleartext.read_text(),
    }


@pytest.fixture
def temp_dir():
"""Return a temporary directory as a Path object."""
Expand Down
10 changes: 10 additions & 0 deletions tests/data/bam/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
This combined.bam contains 43 reads from
real V-Pipe Sars Sequencing.

8 Reads contain insertions, and the rest 35
do not contain insertions.

This file was purposefully enriched with insertions
for good testing.

These 8 insertions were all that could be found in an entire sequencing run.
Binary file added tests/data/bam/combined.bam
Binary file not shown.
43 changes: 43 additions & 0 deletions tests/data/bam/expected_cleartext.ndjson

Large diffs are not rendered by default.

Loading

0 comments on commit 27bd16e

Please sign in to comment.