cbg-ethz · gordonkoehn · Jan 10, 2025 · Jan 16, 2025 · Jan 16, 2025 · Jan 16, 2025
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "nextclade"]
+	path = nextclade
+	url = https://github.com/gordonkoehn/nextclade
diff --git a/Makefile b/Makefile
@@ -0,0 +1,34 @@
+# Makefile for managing the sr2silo installation
+
+# Variables (simply expanded; override on the command line, e.g. `make ENV_NAME=dev`)
+ENV_NAME := sr2silo
+ENV_FILE := environment.yml
+
+# Default target: a bare `make` runs the full installation
+.PHONY: all
+all: install
+
+# Create the Conda environment from $(ENV_FILE) under the name the other targets use
+.PHONY: create_env
+create_env:
+	conda env create -n $(ENV_NAME) -f $(ENV_FILE)
+
+# Build the nextclade Rust crate as a Python extension (maturin, pyo3 bindings)
+.PHONY: build_nextclade
+build_nextclade:
+	conda run -n $(ENV_NAME) bash -c "cd nextclade/packages/nextclade && maturin develop --release -b pyo3"
+
+# Build the silo_input_transformer Rust project as a Python extension (maturin)
+.PHONY: build_silo_input_transformer
+build_silo_input_transformer:
+	conda run -n $(ENV_NAME) bash -c "cd silo_input_transformer && maturin develop --release"
+
+# Install the sr2silo package in editable mode into the Conda environment
+.PHONY: install
+install: create_env
+	conda run -n $(ENV_NAME) pip install -e .
+
+# Remove the Conda environment created by create_env
+.PHONY: clean
+clean:
+	conda env remove -n $(ENV_NAME)
diff --git a/README.md b/README.md
@@ -2,10 +2,10 @@
 <picture>
   <source
     media="(prefers-color-scheme: light)"
-    srcset="resources/logo.svg">
+    srcset="resources/graphics/logo.svg">
   <source
     media="(prefers-color-scheme: dark)"
-    srcset="resources/logo_dark_mode.svg">
+    srcset="resources/graphics/logo_dark_mode.svg">
-  <img alt="Logo" src="resources/logo.svg" width="15%" />
+  <img alt="Logo" src="resources/graphics/logo.svg" width="15%" />
 </picture>
 

diff --git a/nextclade b/nextclade
diff --git a/resources/logo.svg → resources/graphics/logo.svg b/resources/logo.svg → resources/graphics/logo.svg
diff --git a/resources/logo_dark_mode.svg → resources/graphics/logo_dark_mode.svg b/resources/logo_dark_mode.svg → resources/graphics/logo_dark_mode.svg
diff --git a/resources/sars-cov-2/NC_045512.2.fasta b/resources/sars-cov-2/NC_045512.2.fasta
diff --git a/resources/sars-cov-2/README.md b/resources/sars-cov-2/README.md
@@ -0,0 +1,5 @@
+This folder contains data for the alignment of SARS-CoV-2.
+
+The `NC_045512.2.fasta` is the one used by V-Pipe.
+
+The `reference_genomes.json` is the one used by Nextclade.
diff --git a/resources/reference_genomes.json → resources/sars-cov-2/reference_genomes.json b/resources/reference_genomes.json → resources/sars-cov-2/reference_genomes.json
diff --git a/scripts/make_nextclade_input.py b/scripts/make_nextclade_input.py
@@ -0,0 +1,100 @@
+"""Make cleartext alignment from BAM file with reference sequence
+    for Nextclade input."""
+
+from __future__ import annotations
+
+import json
+import time
+from pathlib import Path
+from statistics import mean, variance
+
+import nextclade
+from sr2silo.process.convert import bam_to_cleartext_alignment
+
+REFERENCE_PATH = Path("resources/sars-cov-2/NC_045512.2.fasta")  # reference FASTA
+OUTPUT_PATH = Path("out/nextclade/nextclade_input.ndjson")  # written by process()
+BAM_PATH = Path(  # input alignment that gets converted
+    "tests/data/samples_large/A1_05_2024_10_08/20241024_2411515907/alignments/REF_aln_trim.bam"
+)
+OUTPUT_PATH_NEXTCLADE = OUTPUT_PATH.parent / "nextclade_output.ndjson"  # not used below
+
+
+def process(bam_path: Path, output_fp: Path, reference: Path) -> None:
+    """Write the cleartext alignment of a BAM file to an ndjson file.
+
+    The conversion is delegated to ``bam_to_cleartext_alignment``. Each
+    output line is one JSON object per read with the elements
+    - read_id: read identifier
+    - query_seq_aligned: the aligned sequence with gaps
+    - inserts: list of insertions, each with
+            - pos: position of the insertion
+            - ins: insertion sequence
+    """
+    out_dir = output_fp.parent
+    out_dir.mkdir(parents=True, exist_ok=True)  # no-op if it already exists
+    bam_to_cleartext_alignment(bam_path, output_fp, reference)
+
+
+if __name__ == "__main__":
+    from itertools import islice
+    N = 3000  # Maximum number of reads to load and benchmark
+
+    # Step 1: convert the BAM file into the cleartext ndjson alignment.
+    process(BAM_PATH, OUTPUT_PATH, REFERENCE_PATH)
+
+    # Step 2: load at most N records. islice stops at end-of-file, so a file
+    # with fewer than N reads no longer feeds empty strings to json.loads.
+    # NOTE(review): the quote swap presumably patches Python-repr style output - verify.
+    with open(OUTPUT_PATH, "r") as f:
+        lines = [ln.replace("'", '"') for ln in islice(f, N) if ln.strip()]
+    qry_seqs = [json.loads(ln)["query_seq_aligned"] for ln in lines]
+    if not qry_seqs:
+        raise SystemExit(f"No reads found in {OUTPUT_PATH}")
+
+    # Step 3: load the reference, dropping the FASTA header and line breaks.
+    with open(REFERENCE_PATH, "r") as f:
+        reference_seq = f.readlines()
+    ref_seq = "".join(reference_seq[1:]).strip().replace("\n", "")
+
+    # Sanity check (first read only): aligned query and reference lengths.
+    print(
+        f"Reference and Aligned Query Seq have the same length {len(qry_seqs[0]) == len(ref_seq)}"
+    )
+
+    # Gene map passed to nextclade for the amino-acid translation
+    GENE_MAP_GFF = "nextclade/data/sars-cov-2/genemap.gff"
+
+    # Step 4: run nextclade on every read and time each call individually.
+    alignments = []
+    processing_times = []
+    for qry_seq in qry_seqs:
+        seq_start_time = time.perf_counter()  # monotonic, high-resolution clock
+        alignments.append(nextclade.translate_aa_align(ref_seq, qry_seq, GENE_MAP_GFF))  # type: ignore
+        processing_times.append(time.perf_counter() - seq_start_time)
+
+    # Step 5: summary statistics of the per-read processing time.
+    mean_time = mean(processing_times)
+    # statistics.variance raises for fewer than two samples; report 0 spread then.
+    variance_time = variance(processing_times) if len(processing_times) > 1 else 0.0
+    min_time = min(processing_times)
+    max_time = max(processing_times)
+
+    print(f"Mean processing time: {mean_time:.8f} seconds")
+    std_dev_time = variance_time**0.5
+    print(f"Standard deviation of processing time: {std_dev_time:.8f} seconds")
+    print(f"Minimum processing time: {min_time:.8f} seconds")
+    print(f"Maximum processing time: {max_time:.8f} seconds")
+
+    # Extrapolate linearly from the mean to a run of 5 million reads.
+    estimated_time_5m_reads = mean_time * 5_000_000
+    print(
+        f"Estimated time for 5 million reads: {estimated_time_5m_reads / 3600:.2f} hours"
+    )
+
+    # Step 6: write one JSON document per line with the nextclade results.
+    OUTPUT_PATH_NEXTCLADE_PY = OUTPUT_PATH.parent / "nextclade_output_py.ndjson"
+    with open(OUTPUT_PATH_NEXTCLADE_PY, "w") as f:
+        for result in alignments:
+            f.write(json.dumps(result) + "\n")
+
+    print(f"Processed {len(qry_seqs)} sequences")
diff --git a/src/sr2silo/__init__.py b/src/sr2silo/__init__.py
@@ -5,8 +5,9 @@
 """sr2silo connects pairs, normalizes reads, and converts BAM to SAM files."""
 from __future__ import annotations
 
+import sr2silo.process as process
 import sr2silo.vpipe as vpipe
 
 __version__ = "0.0.2"
 
-__all__ = ["vpipe"]
+__all__ = ["vpipe", "process"]
diff --git a/src/sr2silo/process/__init__.py b/src/sr2silo/process/__init__.py
@@ -5,8 +5,13 @@
 
 from __future__ import annotations
 
-from sr2silo.process.convert import bam_to_sam
+from sr2silo.process.convert import bam_to_cleartext_alignment, bam_to_sam
 from sr2silo.process.merge import pair_normalize_reads
 from sr2silo.process.translation_aligment import translate
 
-__all__ = ["bam_to_sam", "pair_normalize_reads", "translate"]
+__all__ = [
+    "bam_to_sam",
+    "pair_normalize_reads",
+    "translate",
+    "bam_to_cleartext_alignment",
+]