Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add translation of alignment #78

Draft
wants to merge 29 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "nextclade"]
path = nextclade
url = https://github.com/gordonkoehn/nextclade
34 changes: 34 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Makefile for managing the sr2silo installation

# Variables
# NOTE(review): ENV_NAME is presumably the environment name declared inside
# $(ENV_FILE) — confirm they match, since `conda run -n` relies on it.
ENV_NAME = sr2silo
ENV_FILE = environment.yml

# Default target
.PHONY: all
all: install

# Create the Conda environment described in $(ENV_FILE)
.PHONY: create_env
create_env:
	conda env create -f $(ENV_FILE)

# Build the nextclade Rust package as a Python extension (pyo3 bindings)
.PHONY: build_nextclade
build_nextclade:
	conda run -n $(ENV_NAME) bash -c "cd nextclade/packages/nextclade && maturin develop --release -b pyo3"

# Build the silo_input_transformer Rust project as a Python extension
.PHONY: build_silo_input_transformer
build_silo_input_transformer:
	conda run -n $(ENV_NAME) bash -c "cd silo_input_transformer && maturin develop --release"

# Install the sr2silo package in editable mode.
# Only targets that have a rule are listed as prerequisites: a prerequisite
# with neither a rule nor a matching file makes `make` abort with
# "No rule to make target".
.PHONY: install
install: create_env
	conda run -n $(ENV_NAME) pip install -e .

# Remove the Conda environment
.PHONY: clean
clean:
	conda env remove -n $(ENV_NAME)
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
<picture>
<source
media="(prefers-color-scheme: light)"
srcset="resources/logo.svg">
srcset="resources/graphics/logo.svg">
<source
media="(prefers-color-scheme: dark)"
srcset="resources/logo_dark_mode.svg">
srcset="resources/graphics/logo_dark_mode.svg">
<img alt="Logo" src="resources/graphics/logo.svg" width="15%" />
</picture>

Expand Down
1 change: 1 addition & 0 deletions nextclade
Submodule nextclade added at cb5ebc
File renamed without changes
File renamed without changes
429 changes: 429 additions & 0 deletions resources/sars-cov-2/NC_045512.2.fasta

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions resources/sars-cov-2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This folder contains data for the alignment of SARS-CoV-2.

The `NC_045512.2.fasta` is the one used by V-Pipe.

The `reference_genomes.json` is the one used by NextClade.
100 changes: 100 additions & 0 deletions scripts/make_nextclade_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Make cleartext alignment from BAM file with reference sequence
for Nextclade input."""

from __future__ import annotations

import json
import time
from pathlib import Path
from statistics import mean, variance

import nextclade
from sr2silo.process.convert import bam_to_cleartext_alignment

# Reference genome FASTA; handed to `process` and later read back as the
# reference sequence for the translation step.
REFERENCE_PATH = Path("resources/sars-cov-2/NC_045512.2.fasta")
# Destination of the cleartext alignment written by `process`.
OUTPUT_PATH = Path("out/nextclade/nextclade_input.ndjson")
# BAM file used as the input of this script.
BAM_PATH = Path(
"tests/data/samples_large/A1_05_2024_10_08/20241024_2411515907/alignments/REF_aln_trim.bam"
)
# NOTE(review): not referenced anywhere in the visible part of this script —
# confirm whether it is still needed.
OUTPUT_PATH_NEXTCLADE = OUTPUT_PATH.parent / "nextclade_output.ndjson"


def process(bam_path: Path, output_fp: Path, reference: Path) -> None:
    """Write the reads of a BAM file as a cleartext alignment in ndjson form.

    The conversion itself is delegated to ``bam_to_cleartext_alignment``;
    this wrapper only makes sure the destination directory is present first.

    Each output line is one JSON record per read with the elements

    - read_id: read identifier
    - query_seq_aligned: the aligned sequence with gaps
    - inserts: list of insertions, each with
        - pos: position of the insertion
        - ins: insertion sequence

    Args:
        bam_path: BAM file to convert.
        output_fp: ndjson file to write.
        reference: reference sequence file passed on to the converter.
    """
    # Create the output directory (and any missing parents); an already
    # existing directory is not an error.
    target_dir = output_fp.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    bam_to_cleartext_alignment(bam_path, output_fp, reference)


if __name__ == "__main__":
    # Benchmark script: convert the sample BAM to a cleartext alignment, run
    # the nextclade amino-acid translation/alignment on up to N reads, report
    # per-read timing statistics and store the results as ndjson.
    N = 3000  # Maximum number of reads to load and process

    process(BAM_PATH, OUTPUT_PATH, REFERENCE_PATH)

    # Read at most the first N lines of the output file. Iterating the file
    # handle stops at end-of-file, so an alignment with fewer than N reads
    # does not feed empty strings into json.loads.
    with open(OUTPUT_PATH, "r") as f:
        raw_records = [line for _, line in zip(range(N), f) if line.strip()]

    if not raw_records:
        raise SystemExit(f"No reads found in {OUTPUT_PATH}")

    # NOTE(review): the quote replacement presumably compensates for records
    # written with single quotes — confirm against bam_to_cleartext_alignment.
    # Only the gapped query sequence ("query_seq_aligned") is needed here.
    qry_seqs = [
        json.loads(record.replace("'", '"'))["query_seq_aligned"]
        for record in raw_records
    ]

    # Load the reference sequence: skip the first line (FASTA header) and
    # join the remaining lines into one string without line breaks.
    # NOTE(review): assumes a single-record FASTA file — confirm.
    with open(REFERENCE_PATH, "r") as f:
        reference_seq = f.readlines()
    ref_seq = "".join(reference_seq[1:]).strip().replace("\n", "")

    # check that both sequences are the same length
    print(
        f"Reference and Aligned Query Seq have the same length {len(qry_seqs[0]) == len(ref_seq)}"
    )

    # gene ref path
    GENE_MAP_GFF = "nextclade/data/sars-cov-2/genemap.gff"

    # Translate/align every read and record how long each call takes.
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() can jump when the system clock is adjusted.
    alignments = []
    processing_times = []
    for qry_seq in qry_seqs:
        seq_start_time = time.perf_counter()
        alignments.append(nextclade.translate_aa_align(ref_seq, qry_seq, GENE_MAP_GFF))  # type: ignore
        seq_stop_time = time.perf_counter()
        processing_times.append(seq_stop_time - seq_start_time)

    # Calculate statistics
    mean_time = mean(processing_times)
    # statistics.variance needs at least two samples; report zero spread
    # when only a single read was processed.
    variance_time = variance(processing_times) if len(processing_times) > 1 else 0.0
    min_time = min(processing_times)
    max_time = max(processing_times)

    print(f"Mean processing time: {mean_time:.8f} seconds")
    std_dev_time = variance_time**0.5
    print(f"Standard deviation of processing time: {std_dev_time:.8f} seconds")
    print(f"Minimum processing time: {min_time:.8f} seconds")
    print(f"Maximum processing time: {max_time:.8f} seconds")

    # Extrapolate the mean per-read time linearly to 5 million reads
    estimated_time_5m_reads = mean_time * 5_000_000
    print(
        f"Estimated time for 5 million reads: {estimated_time_5m_reads / 3600:.2f} hours"
    )

    # Store the results, one JSON document per line; the "_py" suffix marks
    # the file as produced through the Python bindings.
    OUTPUT_PATH_NEXTCLADE_PY = OUTPUT_PATH.parent / "nextclade_output_py.ndjson"

    with open(OUTPUT_PATH_NEXTCLADE_PY, "w") as f:
        for result in alignments:
            f.write(json.dumps(result) + "\n")

    print(f"Processed {len(qry_seqs)} sequences")
3 changes: 2 additions & 1 deletion src/sr2silo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
"""sr2silo connects pairs, normalizes reads, and converts BAM to SAM files."""
from __future__ import annotations

import sr2silo.process as process
import sr2silo.vpipe as vpipe

__version__ = "0.0.2"

__all__ = ["vpipe"]
__all__ = ["vpipe", "process"]
9 changes: 7 additions & 2 deletions src/sr2silo/process/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@

from __future__ import annotations

from sr2silo.process.convert import bam_to_sam
from sr2silo.process.convert import bam_to_cleartext_alignment, bam_to_sam
from sr2silo.process.merge import pair_normalize_reads
from sr2silo.process.translation_aligment import translate

__all__ = ["bam_to_sam", "pair_normalize_reads", "translate"]
__all__ = [
"bam_to_sam",
"pair_normalize_reads",
"translate",
"bam_to_cleartext_alignment",
]
Loading
Loading