Skip to content

Commit

Permalink
feature: adding converters and test data for AA translation (#87)
Browse files Browse the repository at this point in the history
* implement process/normalize_reads()

* WIP: bam_to_cleartext_alignment

* POC bam_to_cleartext_alignment()

* WIP insertions as JSON

* POC full alignment with padding and small data

* add note

* correct readme and cleanup of index files

* reformat

* refactor resources

* [refactor]: `silo` module (#80)

* refactor: move lapis script to module

* refactor: move wrangling to silo module

* refactor: move wrangling to silo module

* add docs

* Update src/sr2silo/silo/wrangle.py

Co-authored-by: Copilot <[email protected]>

* typos - GitHub Copilot

Co-authored-by: Copilot <[email protected]>

* refactor: move the URLs for lapis to env variables

* bug: auth should not be fetched when in CI

* silo

* silo

* OO in silo

* works

* WORKS

* CI issue

---------

Co-authored-by: Gordon J. Köhn <[email protected]>
Co-authored-by: Copilot <[email protected]>

Getting all Changes from the Nextclade Try on main for try with BlastX

* Apply suggestions from code review

Co-authored-by: Copilot <[email protected]>

* Update tests/data/bam/README.md

Co-authored-by: Copilot <[email protected]>

* reformat

* missing fixture tag

---------

Co-authored-by: Gordon J. Köhn <[email protected]>
Co-authored-by: Copilot <[email protected]>
  • Loading branch information
3 people authored Jan 30, 2025
1 parent 359df45 commit 27bd16e
Show file tree
Hide file tree
Showing 16 changed files with 16,287 additions and 32 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
<picture>
<source
media="(prefers-color-scheme: light)"
srcset="resources/logo.svg">
srcset="resources/graphics/logo.svg">
<source
media="(prefers-color-scheme: dark)"
srcset="resources/logo_dark_mode.svg">
srcset="resources/graphics/logo_dark_mode.svg">
<img alt="Logo" src="resources/logo.svg" width="15%" />
</picture>

Expand Down
File renamed without changes
File renamed without changes
429 changes: 429 additions & 0 deletions resources/sars-cov-2/NC_045512.2.fasta

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions resources/sars-cov-2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This folder contains data for the alignment of SARS-CoV-2.

The `NC_045512.2.fasta` is the one used by V-Pipe.

The `reference_genomes.json` is the one used by NextClade.
File renamed without changes.
9 changes: 7 additions & 2 deletions src/sr2silo/process/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@

from __future__ import annotations

from sr2silo.process.convert import bam_to_sam
from sr2silo.process.convert import bam_to_cleartext_alignment, bam_to_sam
from sr2silo.process.merge import pair_normalize_reads
from sr2silo.process.translation_aligment import translate

__all__ = ["bam_to_sam", "pair_normalize_reads", "translate"]
# Public API re-exported at the package level.
__all__ = [
    "bam_to_sam",
    "pair_normalize_reads",
    "translate",
    "bam_to_cleartext_alignment",
]
225 changes: 225 additions & 0 deletions src/sr2silo/process/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from __future__ import annotations

import json
import logging
import re
import tempfile
from pathlib import Path

Expand All @@ -27,3 +29,226 @@ def bam_to_sam(bam_file: Path) -> str:
temp_sam.seek(0)
temp_sam_content = temp_sam.read().decode()
return temp_sam_content


def parse_cigar(cigar: str) -> list[tuple[str, int]]:
    """Parse a CIGAR string into a list of (operation, length) tuples.

    Args:
        cigar: The CIGAR string of a SAM alignment record.

    Returns:
        A list of tuples where the first element is the operation
        type (one of "MIDNSHP=X") and the second is the length of
        the operation.

    Credits: adapted from David Gicev @davidgicev
    """
    pattern = re.compile(r"(\d+)([MIDNSHP=X])")

    parsed_cigar = pattern.findall(cigar)

    return [(op, int(length)) for length, op in parsed_cigar]


def normalize_reads(sam_data: str, output_fasta: Path, output_insertions: Path) -> None:
    """Normalize all reads in a SAM file to cleartext sequences using the CIGAR.

    Writes one FASTA record per read (header ">{qname}|{pos}") to
    ``output_fasta`` and one tab-separated "qname<TAB>insertions" line per
    read to ``output_insertions``.

    Note that the input SAM data must be available in its entirety before
    calling this function, whilst the output files are written
    incrementally.

    Args:
        sam_data: A string containing SAM formatted data.
        output_fasta: Path of the FASTA file to write.
        output_insertions: Path of the insertions file to write.

    Returns:
        None; results are written to the two output files.

    Credits: adapted from David Gicev @davidgicev
    """
    # Lazy %-formatting: the original message used "{output_insertions}"
    # without an f-prefix, so the placeholder was logged literally.
    logging.warning(
        "normalize_reads: nucleotide insertions are not yet fully supported; "
        "%s may be incomplete.",
        output_insertions,
    )

    with output_fasta.open("w") as fasta_file, output_insertions.open(
        "w"
    ) as insertions_file:
        for line in sam_data.splitlines():
            if line.startswith("@"):  # skip SAM header lines
                continue

            fields = line.strip().split("\t")

            qname = fields[0]  # Query template NAME
            pos = int(fields[3])  # 1-based leftmost mapping position
            cigar = parse_cigar(fields[5])  # CIGAR string
            seq = fields[9]  # segment sequence
            qual = fields[10]  # ASCII of Phred-scaled base quality + 33

            result_sequence = ""
            # NOTE(review): result_qual is built but never written anywhere;
            # kept for a future quality-aware output format.
            result_qual = ""
            index = 0  # current offset into seq/qual
            inserts = []

            for ops_type, count in cigar:
                if ops_type == "S":
                    # Soft clip: bases present in seq but not aligned.
                    index += count
                elif ops_type == "M":
                    result_sequence += seq[index : index + count]
                    result_qual += qual[index : index + count]
                    index += count
                elif ops_type == "D":
                    # Deletion: pad with gaps and the lowest Phred symbol.
                    result_sequence += "-" * count
                    result_qual += "!" * count
                elif ops_type == "I":
                    # Record the insertion; it is not part of the cleartext.
                    inserts.append((index + pos, seq[index : index + count]))
                    index += count

            fasta_file.write(f">{qname}|{pos}\n{result_sequence}\n")
            insertions_file.write(f"{qname}\t{inserts}\n")

    # The original version kept an always-empty "unpaired" dict and a dead
    # flush loop over it; both removed as they could never execute.


def bam_to_cleartext_alignment(
    bam_path: Path, output_fp: Path, reference: Path
) -> None:
    """Convert BAM to cleartext alignment and write it as NDJSON.

    One JSON object per read with elements:
    - read_id: read identifier
    - query_seq_aligned: the aligned sequence, padded with gaps ('-') to
      the reference length (insertions are kept inline)
    - inserts: list of insertions with elements
        - pos: position of the insertion
        - ins: insertion sequence, as a list of single characters

    Args:
        bam_path: Path to the input BAM file.
        output_fp: Path of the NDJSON output file to write.
        reference: Path to the reference FASTA (used for padding length).
    """

    # Get the reference length by summing all non-header FASTA lines.
    logging.info(reference)
    reference_length = 0
    with Path(reference).open() as ref_f:
        for line in ref_f:
            if line.startswith(">"):
                continue
            reference_length += len(line.strip())

    logging.info("Reference length: %d", reference_length)

    # Ensure the BAM file is indexed. pysam.index creates "<file>.bam.bai"
    # (see the cleanup below), so test for that name — with_suffix(".bai")
    # would check "<file>.bai" and always re-index.
    bam_path_str = str(bam_path)
    sorted_bam_path_str = None
    if not Path(bam_path_str + ".bai").exists():
        try:
            pysam.index(bam_path_str)
        except pysam.SamtoolsError as e:  # type: ignore
            # Indexing an unsorted BAM fails; sort to a sibling file,
            # index that, and read from it instead.
            logging.error(f"Error indexing BAM file: {e}")
            sorted_bam_path_str = bam_path_str.replace(".bam", ".sorted.bam")
            pysam.sort("-o", sorted_bam_path_str, bam_path_str)
            pysam.index(sorted_bam_path_str)
            bam_path_str = sorted_bam_path_str

    with Path(output_fp).open("w") as out_f:
        # Open the BAM file
        with pysam.AlignmentFile(bam_path_str, "rb") as samfile:
            for read in samfile.fetch():
                aligned = []  # To store the aligned sequence with gaps
                insertions = []  # To store the insertions
                seq_pos = 0  # current offset into the query sequence

                # Validate cigartuples, query_sequence, and reference_name.
                if (
                    not read.cigartuples
                    or not read.query_sequence
                    or not read.reference_name
                ):
                    logging.warning(
                        f"Skipping read {read.query_name} due to missing data"
                    )
                    continue

                for operation, length in read.cigartuples:
                    if operation == 0:  # Match (M)
                        aligned.append(read.query_sequence[seq_pos : seq_pos + length])
                        seq_pos += length
                    elif operation == 1:  # Insertion (I)
                        # Record the insertion and keep it inline in the
                        # aligned sequence as well.
                        insertions.append(
                            (seq_pos, read.query_sequence[seq_pos : seq_pos + length])
                        )
                        aligned.append(
                            read.query_sequence[seq_pos : seq_pos + length]
                        )
                        seq_pos += length
                    elif operation == 2:  # Deletion (D)
                        aligned.append("-" * length)  # gaps for deletions
                    elif operation == 4:  # Soft clip (S)
                        seq_pos += length
                    elif operation == 5:  # Hard clip (H)
                        pass  # Hard-clipped bases are absent from the query

                # Combine the aligned sequence
                aligned_str = "".join(aligned)

                # Pad to the reference length on both sides.
                # NOTE(review): len(aligned_str) includes inline insertions,
                # so reads with insertions get shorter right padding —
                # confirm this is the intended convention.
                left_padding = "-" * read.reference_start
                right_padding = "-" * (
                    reference_length - len(aligned_str) - read.reference_start
                )
                padded_alignment = left_padding + aligned_str + right_padding

                # Create a JSON object for the read
                read_json = {
                    "read_id": read.query_name,
                    "query_seq_aligned": padded_alignment,
                    "inserts": [
                        {"pos": pos + read.reference_start, "ins": list(seq)}
                        for pos, seq in insertions
                    ],
                }

                # Serialize with json.dumps so the output is valid NDJSON;
                # writing the dict's repr() emits single quotes, which JSON
                # parsers reject.
                out_f.write(json.dumps(read_json) + "\n")

    # Cleanup generated index / sorted files.
    if sorted_bam_path_str:
        Path(sorted_bam_path_str).unlink(missing_ok=True)
        Path(sorted_bam_path_str + ".bai").unlink(missing_ok=True)
    else:
        Path(bam_path_str + ".bai").unlink(missing_ok=True)
20 changes: 1 addition & 19 deletions src/sr2silo/process/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,9 @@
from __future__ import annotations

import logging
import re
from pathlib import Path


def parse_cigar(cigar: str) -> list[tuple[str, int]]:
"""
Parse a cigar string into a list of tuples.
Args:
cigar: A string representing the cigar string.
Returns:
A list of tuples where the first element is the operation
type and the second is the length of the operation.
Credits: adapted from David Gicev @davidgicev
"""
pattern = re.compile(r"(\d+)([MIDNSHP=X])")

parsed_cigar = pattern.findall(cigar)

return [(op, int(length)) for length, op in parsed_cigar]
from sr2silo.process.convert import parse_cigar


def pair_normalize_reads(
Expand Down
50 changes: 50 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,39 @@ def unit_test_mocks(monkeypatch: None):

# Define test data paths
TEST_DATA_DIR = Path(__file__).parent / "data"

INPUT_BAM_PATH = TEST_DATA_DIR / "REF_aln_trim_subsample.bam"
EXPECTED_SAM_PATH = TEST_DATA_DIR / "REF_aln_trim_subsample_expected.sam"


LARGE_TEST_DATA_DIR = (
TEST_DATA_DIR
/ "samples_large"
/ "A1_05_2024_10_08/20241024_2411515907"
/ "alignments"
)
INPUT_BAM_INSERTIONS_PATH = LARGE_TEST_DATA_DIR / "REF_aln_trim.bam"
EXPECTED_BAM_INSERTIONS_PATH_inserts = (
LARGE_TEST_DATA_DIR / "REF_aln_trim_subsample_insertions.fasta"
)
EXPECTED_BAM_INSERTIONS_PATH_cleartext = (
LARGE_TEST_DATA_DIR / "REF_aln_trim_subsample.fasta"
)


"""Returns a sample BAM data path and its corresponding SAM data as a string."""


@pytest.fixture
def bam_data() -> dict:
"""Return a sample BAM data path and its corresponding SAM data as a string."""
dict_data = dict()
# read in these files as test
dict_data["bam_path"] = INPUT_BAM_PATH
with open(EXPECTED_SAM_PATH) as f:
dict_data["sam_data"] = f.read()

return dict_data


@pytest.fixture
Expand All @@ -43,6 +75,24 @@ def sam_data():
return bam_to_sam(INPUT_BAM_PATH)


@pytest.fixture
def sam_with_insert_data():
    """Return a sample SAM data as a string with insertions.

    Keys: "bam_data_fp" (input BAM path), "sam_data" (converted SAM text),
    "insertions" (expected insertions file content), and "cleartext"
    (expected cleartext FASTA content).
    """
    return {
        "bam_data_fp": INPUT_BAM_INSERTIONS_PATH,
        "sam_data": bam_to_sam(INPUT_BAM_INSERTIONS_PATH),
        "insertions": EXPECTED_BAM_INSERTIONS_PATH_inserts.read_text(),
        "cleartext": EXPECTED_BAM_INSERTIONS_PATH_cleartext.read_text(),
    }


@pytest.fixture
def temp_dir():
"""Return a temporary directory as a Path object."""
Expand Down
10 changes: 10 additions & 0 deletions tests/data/bam/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
This combined.bam contains 43 reads from
real V-Pipe Sars Sequencing.

8 Reads contain insertions, and the rest 35
do not contain insertions.

This file was purposefully enriched with insertions
for good testing.

These 8 insertions were all that could be found in an entire sequencing run.
Binary file added tests/data/bam/combined.bam
Binary file not shown.
43 changes: 43 additions & 0 deletions tests/data/bam/expected_cleartext.ndjson

Large diffs are not rendered by default.

Loading

0 comments on commit 27bd16e

Please sign in to comment.