Skip to content

Commit cbe2d4f

Browse files
committed
working now
1 parent d2b5e3b commit cbe2d4f

File tree

1 file changed

+35
-48
lines changed

1 file changed

+35
-48
lines changed

workflow/scripts/extend_fragments.py

+35-48
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@
1111
2. For each read, check if the read is below a certain mapping quality threshold.
1212
3. If the read is below the threshold, skip to the next read.
1313
4. Get the chromosome, start and end position of the read.
14-
5. Get the closest GATC site to the end of the read (omit the ones that came before the end of the read).
14+
5. Get the closest GATC site to the end of the read
15+
(omit the ones that came before the end of the read).
1516
6. If the distance from the end of the read to the closest GATC site is less than n,
1617
set the length to add to the read to the distance.
1718
7. If the distance is greater than n, set the length to add to n.
1819
8. Update the CIGAR string of the read to reflect the new length.
19-
9. Create a string for the new fragment in SAM format.
20-
10. Write the new fragment to a new BAM file.
20+
9. Write the new read to a new BAM file.
21+
a. Use samtools view to write the read to a new BAM file.
2122
"""
2223

2324
import logging
@@ -27,9 +28,9 @@
2728
import numpy as np
2829
import gffpandas.gffpandas as gffpd
2930

30-
input_bam = snakemake.input[0]
31+
input_bam = snakemake.input["bam"]
32+
fai = snakemake.input["fai"]
3133
output_bam = snakemake.output[0]
32-
#output_sam = output_bam.replace(".bam", ".sam")
3334
gatc_gff = snakemake.input["gatc_gff"]
3435
n = snakemake.params["n"]
3536
q = snakemake.params["q"]
@@ -56,6 +57,16 @@
5657
# Convert to Numpy array for speed
5758
chr_gatc[chr] = np.sort(chr_gatc[chr])
5859

60+
# Map each 0-based line index of the fai file to the reference name in its
61+
# first column, so a read's numeric reference ID can be converted to its name
62+
references = {}
63+
index = 0
64+
with open(fai, "r") as f:
65+
for line in f:
66+
line = line.strip().split("\t")
67+
references[index] = line[0]
68+
index += 1
69+
5970
# Counter for number of sequences MQ filtered/extended
6071
# and total number of sequences
6172
extended_seqs = 0
@@ -66,10 +77,16 @@
6677

6778
logging.info(f"Extending reads up to {n} bases or nearest GATC site...")
6879
samfile_in = pysam.AlignmentFile(input_bam, "rb")
69-
#samfile_out = pysam.AlignmentFile(output_sam, "wb", template=samfile)
80+
81+
# Open pipe to samtools to write to new BAM file
82+
command = ["samtools", "view", "-Shb", "-t", fai, "-o", output_bam] # records piped in carry no @SQ header; -t supplies the reference list from the fai
83+
process = subprocess.Popen(command,
84+
stdin=subprocess.PIPE,
85+
#stdout=subprocess.PIPE,
86+
stderr=subprocess.PIPE,
87+
text=True)
7088

7189
for read in samfile_in.fetch():
72-
print(read)
7390
# First filter out reads with low mapping quality
7491
mq = read.mapping_quality
7592
if mq < q:
@@ -80,14 +97,13 @@
8097
chr_ = read.reference_name
8198
start = read.reference_start
8299
length = read.query_length
83-
print(length)
84100
end = start + length
85101

86102
# Get the GATC sites for the current chromosome
87103
tmp_gff = chr_gatc[chr_]
88104

89105
# Remove GATC sites that are at or before the read end
90-
tmp_gff = tmp_gff[tmp_gff >= end]
106+
tmp_gff = tmp_gff[tmp_gff > end]
91107

92108
# Find index of GATC site closest to the read end
93109
closest = np.argmin(abs(end - tmp_gff))
@@ -115,59 +131,30 @@
115131
else:
116132
cigar = f"{cigar}{length_to_add}M"
117133

118-
"""
119-
# Update quality scores
120-
old_qual = read.query_alignment_qualities
121-
122-
# Create updated read and write to file
123-
new_read = pysam.AlignedSegment()
124-
new_read.query_name = read.query_name
125-
new_read.query_sequence = "*"
126-
new_read.flag = read.flag
127-
new_read.reference_id = read.reference_id
128-
new_read.reference_start = read.reference_start
129-
new_read.mapping_quality = read.mapping_quality
130-
new_read.cigarstring = cigar
131-
new_read.next_reference_id = read.next_reference_id
132-
new_read.next_reference_start = read.next_reference_start
133-
new_read.template_length = length #new_sequence_length
134-
new_read.query_qualities = pysam.qualitystring_to_array("*")
135-
new_read.tags = read.tags
136-
#print(new_read)
137-
"""
138-
134+
# Build the output read as a tab-separated SAM record
139135
read_out = "\t".join([
140136
read.query_name,
141137
str(read.flag),
142-
str(read.reference_id),
143-
str(start),
138+
references[read.reference_id],
139+
str(start + 1), # pysam/BAM positions are 0-based, SAM POS is 1-based
144140
str(mq),
145141
cigar,
146142
"*",
147143
"0",
148144
"0",
149145
"*",
150-
"*",
151-
read.tags
146+
"*\n",
152147
])
153-
print(read_out)
154-
# Write to BAM file
155-
command = f"samtools view -Shb - > {output_bam}"
156-
process = subprocess.Popen(command, stdin=subprocess.PIPE, shell=True)
157-
process.stdin.write(bf"{read_out}\n")
148+
149+
process.stdin.write(read_out)
158150

159151
all_seqs += 1
160152
extended_seqs += 1
161-
break
162153

163154
process.stdin.close()
164155
process.wait()
165-
#samfile_in.close()
166-
#samfile_out.close()
167-
168-
# Convert to BAM
169-
#pysam.view("-@", threads, "-o", output_bam, samfile_out)
170156

171-
logging.info(f"Total sequences: {all_seqs}")
172-
logging.info(f"Sequences with MQ < {q}: {all_seqs - extended_seqs}")
173-
logging.info(f"Sequences extended: {extended_seqs}")
157+
# Write to log file
158+
logging.info("Total sequences: {}".format(all_seqs))
159+
logging.info("Sequences with MQ < {}: {}".format(q, all_seqs - extended_seqs))
160+
logging.info("Sequences extended: {}".format(extended_seqs))

0 commit comments

Comments
 (0)