Skip to content

Commit

Permalink
vsnp3 version 3.23
Browse files Browse the repository at this point in the history
  • Loading branch information
stuber committed Jun 14, 2024
1 parent 34ca798 commit f461495
Show file tree
Hide file tree
Showing 27 changed files with 89 additions and 122 deletions.
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Tested with Python 3.8 - 3.9.
Anaconda [setup](./docs/instructions/conda_instructions.md)

```
conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.22
conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.23
```

## Installation test
Expand Down Expand Up @@ -81,7 +81,7 @@ Test step 2:
cd ~/vsnp3_test_dataset/NC_045512_test_files/step2
```
```
vsnp3_step2.py -a -t NC_045512_wuhan-hu-1
vsnp3_step2.py -a -t NC_045512_wuhan-hu-1 -remove
```

## Description
Expand Down Expand Up @@ -117,8 +117,15 @@ See `-h` option for detail and usage
- vsnp3_excel_merge_files.py
- vsnp3_filter_finder.py
- vsnp3_spoligotype.py

## Input Summary

[Additional Tools](./docs/instructions/additional_tools.md)
<!-- <img src="../dependencies/vsnp_inputs.png" alt="vSNP inputs" width="500"> -->
![vSNP inputs](docs/img/vsnp_inputs.png "vSNP inputs")
## Map

<!-- ![vSNP script usage](../dependencies/vsnp3_structure.jpg "Script structure") -->
![Script structure](docs/img/vsnp3_structure.png "Script structure")

## Version Enhancements

Expand Down Expand Up @@ -147,3 +154,5 @@ See `-h` option for detail and usage
- Detailed steps recorded in run_log.txt.
- Spoligotype for TB complex isolates not found by default. -s option required with step 1 or find using vsnp3_spoligotype.py.
- Brucella MLST removed from step 1. Find Brucella MLST using vsnp3_bruc_mlst.py.

[Additional Tools](./docs/instructions/additional_tools.md)
2 changes: 1 addition & 1 deletion bin/vsnp3_alignment_vcf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_annotation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import shutil
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_assembly.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_best_reference_sourmash.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_bruc_mlst.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import io
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_download_GCA_fasta_get_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_download_fasta_gbk_gff_by_acc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import argparse
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_excel_merge_defining_snps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import re
Expand Down
146 changes: 52 additions & 94 deletions bin/vsnp3_fasta_to_fastq.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#!/usr/bin/env python3

__version__ = "3.22"
__version__ = "3.23"

import gzip
import os
import argparse
import random
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import seq1
from Bio.Seq import Seq
import logging
from datetime import datetime

Expand All @@ -18,14 +18,19 @@
'K': ['G', 'T'], 'M': ['A', 'C'], 'B': ['C', 'G', 'T'], 'D': ['A', 'G', 'T'],
'H': ['A', 'C', 'T'], 'V': ['A', 'C', 'G'], 'N': ['A', 'T', 'C', 'G']
}
class Fasta_to_Paired_Fastq():

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class Fasta_to_Paired_Fastq:

def __init__(self, fasta_file, coverage, read_length):
self.fasta_file = fasta_file
self.coverage = coverage
self.read_length = read_length
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
self.process_fasta(fasta_file, coverage, read_length)

def fake_quality_scores(self, length):
"""Generate a fake quality score string of given length with varying quality scores."""
return ''.join(chr(random.randint(58, 72)) for _ in range(length))
return ''.join(chr(random.randint(58, 69)) for _ in range(length))

def calculate_total_genome_length(self, fasta_file):
"""Calculate the total genome length from the input FASTA file."""
Expand All @@ -34,117 +39,70 @@ def calculate_total_genome_length(self, fasta_file):
total_length += len(record.seq)
return total_length

def generate_paired_reads(self, sequence, num_reads, read_length):
def replace_ambiguities(self, sequence):
"""Replace ambiguity codes in the sequence with random nucleotides."""
return ''.join(random.choice(ambiguity_codes[base]) if base in ambiguity_codes else base for base in sequence)

def generate_paired_reads(self, sequence, num_reads, read_length):
"""Generate paired-end reads from the given sequence."""
seq_len = len(sequence)
reads = []
for _ in range(num_reads):
start_pos = random.randint(0, seq_len - read_length)
end_pos = start_pos + read_length
for i in range(num_reads):
adjusted_read_length = min(read_length, seq_len)
start_pos = random.randint(0, seq_len - adjusted_read_length)
end_pos = start_pos + adjusted_read_length

# Create read pair
seq_r1 = sequence[start_pos:start_pos + read_length // 2]
seq_r2 = sequence[end_pos - read_length // 2:end_pos]
seq_r1 = self.replace_ambiguities(sequence[start_pos:start_pos + adjusted_read_length // 2].upper())
seq_r2 = self.replace_ambiguities(sequence[end_pos - adjusted_read_length // 2:end_pos].upper())

# Pad reads if they are shorter than the read length
if len(seq_r1) < read_length // 2:
seq_r1 = seq_r1 + 'N' * (read_length // 2 - len(seq_r1))
if len(seq_r2) < read_length // 2:
seq_r2 = seq_r2 + 'N' * (read_length // 2 - len(seq_r2))

reads.append((seq_r1, seq_r2))

return reads

def is_valid_sequence(self, sequence):
global ambiguity_codes
"""Check if the sequence contains only valid nucleotide characters."""
valid_nucleotides = set('ATCG') | set(ambiguity_codes.keys())
invalid_chars = set(sequence.upper()) - valid_nucleotides
if invalid_chars:
logging.warning(f"Invalid characters found in sequence: {invalid_chars}. Treating them as 'N'.")
return False
return True

def handle_ambiguity(self, sequence):
global ambiguity_codes
"""Handle ambiguity in sequence by randomly choosing one of the possible nucleotides."""
new_sequence = []
for base in sequence:
if base in ambiguity_codes:
new_sequence.append(random.choice(ambiguity_codes[base]))
else:
new_sequence.append(base)
return ''.join(new_sequence)

def reverse_complement(self, sequence):
"""Generate the reverse complement of a sequence."""
complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
return ''.join(complement[base] for base in reversed(sequence))

def generate_barcode(self, length=6):
"""Generate a random barcode sequence of a given length."""
return ''.join(random.choice('ATCG') for _ in range(length))

def __init__(self, fasta_file, coverage, read_length):
"""Convert a FASTA file into paired-end FASTQ files with fake quality scores and specific coverage."""
print(f'Converting FASTA to FASTQ')
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# Pad reads if they are shorter than the adjusted read length
if len(seq_r1) < adjusted_read_length // 2:
seq_r1 = seq_r1 + 'N' * (adjusted_read_length // 2 - len(seq_r1))
if len(seq_r2) < adjusted_read_length // 2:
seq_r2 = seq_r2 + 'N' * (adjusted_read_length // 2 - len(seq_r2))

# Determine output file names based on input file name
base_name = os.path.splitext(os.path.basename(fasta_file))[0]
fastq_r1_file = base_name + "_from_FASTA_R1.fastq.gz"
fastq_r2_file = base_name + "_from_FASTA_R2.fastq.gz"
seq_r2 = str(Seq(seq_r2).reverse_complement()) # Reverse complement for R2
x_coord = start_pos + 1
y_coord = end_pos
reads.append((seq_r1, seq_r2, x_coord, y_coord, i + 1))

# Calculate total genome length
total_genome_length = self.calculate_total_genome_length(fasta_file)
return reads

# Calculate the number of reads needed for the desired coverage
num_reads = (coverage * total_genome_length) // read_length
def process_fasta(self, fasta_file, coverage, read_length):
"""Process the input FASTA file and generate paired-end FASTQ files."""
file_prefix = os.path.basename(fasta_file).split('.')[0]
total_length = self.calculate_total_genome_length(fasta_file)
num_reads = int((total_length * coverage) / read_length)

fastq_r1_file = f"{file_prefix}_R1.fastq.gz"
fastq_r2_file = f"{file_prefix}_R2.fastq.gz"
constant_overlap_seq = "TTCAAGTATG+CGATACCATC"

# Open output FASTQ files for writing
with gzip.open(fastq_r1_file, "wt") as r1, gzip.open(fastq_r2_file, "wt") as r2:
with gzip.open(fastq_r1_file, 'wt') as r1, gzip.open(fastq_r2_file, 'wt') as r2:
for record in SeqIO.parse(fasta_file, "fasta"):
sequence = str(record.seq).upper()
if not self.is_valid_sequence(sequence):
# Replace invalid characters with 'N'
sequence = ''.join([base if base in {'A', 'T', 'C', 'G', 'N'} else 'N' for base in sequence])

sequence = str(record.seq)
reads = self.generate_paired_reads(sequence, num_reads, read_length)

for i, (seq_r1, seq_r2) in enumerate(reads):
# Handle ambiguities in each read pair
seq_r1 = self.handle_ambiguity(seq_r1)
seq_r2 = self.handle_ambiguity(seq_r2)
seq_r2 = self.reverse_complement(seq_r2) # Reverse complement for R2

# Create varying quality scores between Phred 20 and 35
qual_r1 = self.fake_quality_scores(len(seq_r1))
qual_r2 = self.fake_quality_scores(len(seq_r2))

# Generate a random barcode sequence
barcode = self.generate_barcode()

# Create Illumina-like headers with barcode
header_base = f"{base_name}:{i+1}:N:0:1:{barcode}"
record_r1 = f"@{header_base}/1\n{seq_r1}\n+\n{qual_r1}\n"
record_r2 = f"@{header_base}/2\n{seq_r2}\n+\n{qual_r2}\n"

# Write to FASTQ files
for seq_r1, seq_r2, x_coord, y_coord, unique_id in reads:
header_r1 = f"@{file_prefix}:8:FASTQFROMFASTA:1:{x_coord}:{y_coord}:{unique_id} 1:N:0:{constant_overlap_seq}"
header_r2 = f"@{file_prefix}:8:FASTQFROMFASTA:1:{x_coord}:{y_coord}:{unique_id} 2:N:0:{constant_overlap_seq}"

record_r1 = f"{header_r1}\n{seq_r1}\n+\n{self.fake_quality_scores(len(seq_r1))}\n"
record_r2 = f"{header_r2}\n{seq_r2}\n+\n{self.fake_quality_scores(len(seq_r2))}\n"

r1.write(record_r1)
r2.write(record_r2)

self.fastq_r1_file = fastq_r1_file
self.fastq_r2_file = fastq_r2_file
self.fastq_r1_file = fastq_r1_file
self.fastq_r2_file = fastq_r2_file

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert a FASTA file into paired-end FASTQ files with fake quality scores and specific coverage. If FASTA is reporting ambiguity codes, they will be replaced with resepentative mix of nucleotides.")
parser = argparse.ArgumentParser(description="Convert a FASTA file into paired-end FASTQ files with fake quality scores and specific coverage. If FASTA is reporting ambiguity codes, they will be replaced with representative mix of nucleotides.")
parser.add_argument("-i", "--input", required=True, help="Input FASTA file")
parser.add_argument("-c", "--coverage", type=int, default=100, help="Desired coverage (default: 100X)")
parser.add_argument("-l", "--read_length", type=int, default=300, help="Read length (default: 300)")
parser.add_argument('-v', '--version', action='version', version=f'{os.path.basename(__file__)}: version {__version__}')
args = parser.parse_args()

fasta_to_paired_fastq = Fasta_to_Paired_Fastq(args.input, args.coverage, args.read_length)
print(f"Paired-end FASTQ files created: \n\t{fasta_to_paired_fastq.fastq_r1_file}\n\t{fasta_to_paired_fastq.fastq_r2_file}")
print(f"Paired-end FASTQ files created: \n\t{fasta_to_paired_fastq.fastq_r1_file}\n\t{fasta_to_paired_fastq.fastq_r2_file}")
2 changes: 1 addition & 1 deletion bin/vsnp3_fasta_to_snps_table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_fastq_stats_seqkit.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_file_setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import shutil
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_group_on_defining_snps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_group_reporter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import io
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_html_step2_summary.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os

Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_kernel_plots.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import re
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_path_adder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import glob
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_reference_options.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_remove_from_analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_spoligotype.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.22"
__version__ = "3.23"

import os
import gzip
Expand Down
Loading

0 comments on commit f461495

Please sign in to comment.