11
11
2. For each read, check if the read is below a certain mapping quality threshold.
12
12
3. If the read is below the threshold, skip to the next read.
13
13
4. Get the chromosome, start and end position of the read.
14
- 5. Get the closest GATC site to the end of the read (omit the ones that came before the end of the read).
14
+ 5. Get the closest GATC site to the end of the read
15
+ (omit the ones that came before the end of the read).
15
16
6. If the distance from the end of the read to the closest GATC site is less than n,
16
17
set the length to add to the read to the distance.
17
18
7. If the distance is greater than n, set the length to add to n.
18
19
8. Update the CIGAR string of the read to reflect the new length.
19
- 9. Create a string for the new fragment in SAM format .
20
- 10. Write the new fragment to a new BAM file.
20
+ 9. Write the new read to a new BAM file.
21
+ a. Use samtools view to write the read to a new BAM file.
21
22
"""
22
23
23
24
import logging
27
28
import numpy as np
28
29
import gffpandas .gffpandas as gffpd
29
30
30
- input_bam = snakemake .input [0 ]
31
+ input_bam = snakemake .input ["bam" ]
32
+ fai = snakemake .input ["fai" ]
31
33
output_bam = snakemake .output [0 ]
32
- #output_sam = output_bam.replace(".bam", ".sam")
33
34
gatc_gff = snakemake .input ["gatc_gff" ]
34
35
n = snakemake .params ["n" ]
35
36
q = snakemake .params ["q" ]
56
57
# Convert to Numpy array for speed
57
58
chr_gatc [chr ] = np .sort (chr_gatc [chr ])
58
59
60
+ # Map each 0-based target ID (line order in the fai file) to the
61
+ # reference name stored in the first column of that line
62
+ references = {}
63
+ index = 0
64
+ with open (fai , "r" ) as f :
65
+ for line in f :
66
+ line = line .strip ().split ("\t " )
67
+ references [index ] = line [0 ]
68
+ index += 1
69
+
59
70
# Counter for number of sequences MQ filtered/extended
60
71
# and total number of sequences
61
72
extended_seqs = 0
66
77
67
78
logging .info (f"Extending reads up to { n } bases or nearest GATC site..." )
68
79
samfile_in = pysam .AlignmentFile (input_bam , "rb" )
69
- #samfile_out = pysam.AlignmentFile(output_sam, "wb", template=samfile)
80
+
81
+ # Open pipe to samtools to write to new BAM file
82
+ command = ["samtools" , "view" , "-Shb" , "-t" , fai , "-o" , output_bam ] # @SQ header is not written
83
+ process = subprocess .Popen (command ,
84
+ stdin = subprocess .PIPE ,
85
+ #stdout=subprocess.PIPE,
86
+ stderr = subprocess .PIPE ,
87
+ text = True )
70
88
71
89
for read in samfile_in .fetch ():
72
- print (read )
73
90
# First filter out reads with low mapping quality
74
91
mq = read .mapping_quality
75
92
if mq < q :
80
97
chr_ = read .reference_name
81
98
start = read .reference_start
82
99
length = read .query_length
83
- print (length )
84
100
end = start + length
85
101
86
102
# Get the GATC sites for the current chromosome
87
103
tmp_gff = chr_gatc [chr_ ]
88
104
89
105
# Keep only GATC sites located strictly after the read end
90
- tmp_gff = tmp_gff [tmp_gff >= end ]
106
+ tmp_gff = tmp_gff [tmp_gff > end ]
91
107
92
108
# Find index of GATC site closest to the read end
93
109
closest = np .argmin (abs (end - tmp_gff ))
115
131
else :
116
132
cigar = f"{ cigar } { length_to_add } M"
117
133
118
- """
119
- # Update quality scores
120
- old_qual = read.query_alignment_qualities
121
-
122
- # Create updated read and write to file
123
- new_read = pysam.AlignedSegment()
124
- new_read.query_name = read.query_name
125
- new_read.query_sequence = "*"
126
- new_read.flag = read.flag
127
- new_read.reference_id = read.reference_id
128
- new_read.reference_start = read.reference_start
129
- new_read.mapping_quality = read.mapping_quality
130
- new_read.cigarstring = cigar
131
- new_read.next_reference_id = read.next_reference_id
132
- new_read.next_reference_start = read.next_reference_start
133
- new_read.template_length = length #new_sequence_length
134
- new_read.query_qualities = pysam.qualitystring_to_array("*")
135
- new_read.tags = read.tags
136
- #print(new_read)
137
- """
138
-
134
+ # Build the output read as a tab-separated SAM record
139
135
read_out = "\t " .join ([
140
136
read .query_name ,
141
137
str (read .flag ),
142
- str ( read .reference_id ) ,
143
- str (start ),
138
+ references [ read .reference_id ] ,
139
+ str (start + 1 ), # SAM POS is 1-based; pysam's reference_start (BAM) is 0-based
144
140
str (mq ),
145
141
cigar ,
146
142
"*" ,
147
143
"0" ,
148
144
"0" ,
149
145
"*" ,
150
- "*" ,
151
- read .tags
146
+ "*\n " ,
152
147
])
153
- print (read_out )
154
- # Write to BAM file
155
- command = f"samtools view -Shb - > { output_bam } "
156
- process = subprocess .Popen (command , stdin = subprocess .PIPE , shell = True )
157
- process .stdin .write (bf"{ read_out } \n " )
148
+
149
+ process .stdin .write (read_out )
158
150
159
151
all_seqs += 1
160
152
extended_seqs += 1
161
- break
162
153
163
154
process .stdin .close ()
164
155
process .wait ()
165
- #samfile_in.close()
166
- #samfile_out.close()
167
-
168
- # Convert to BAM
169
- #pysam.view("-@", threads, "-o", output_bam, samfile_out)
170
156
171
- logging .info (f"Total sequences: { all_seqs } " )
172
- logging .info (f"Sequences with MQ < { q } : { all_seqs - extended_seqs } " )
173
- logging .info (f"Sequences extended: { extended_seqs } " )
157
+ # Write to log file
158
+ logging .info ("Total sequences: {}" .format (all_seqs ))
159
+ logging .info ("Sequences with MQ < {}: {}" .format (q , all_seqs - extended_seqs ))
160
+ logging .info ("Sequences extended: {}" .format (extended_seqs ))
0 commit comments