working now

niekwit · niekwit · commit cbe2d4f55999 · 2024-07-29T15:48:00.000+01:00
diff --git a/workflow/scripts/extend_fragments.py b/workflow/scripts/extend_fragments.py
@@ -11,13 +11,14 @@
 2. For each read, check if the read is below a certain mapping quality threshold.
 3. If the read is below the threshold, skip to the next read.
 4. Get the chromosome, start and end position of the read.
-5. Get the closest GATC site to the end of the read (omit the ones that came before the end of the read).
+5. Get the closest GATC site downstream of the read end
+   (sites located at or before the read end are ignored).
 6. If the distance from the end of the read to the closest GATC site is less than n,
     set the length to add to the read to the distance.
 7. If the distance is greater than n, set the length to add to n.
 8. Update the CIGAR string of the read to reflect the new length.
-9. Create a string for the new fragment in SAM format.
-10. Write the new fragment to a new BAM file.
+9. Write the new read to a new BAM file.
+    a. Each read is piped as a SAM record to samtools view, which writes the BAM.
 """
 
 import logging
@@ -27,9 +28,9 @@
 import numpy as np
 import gffpandas.gffpandas as gffpd
 
-input_bam = snakemake.input[0]
+input_bam = snakemake.input["bam"]
+fai = snakemake.input["fai"]
 output_bam = snakemake.output[0]
-#output_sam = output_bam.replace(".bam", ".sam")
 gatc_gff = snakemake.input["gatc_gff"]
 n = snakemake.params["n"]
 q = snakemake.params["q"]
@@ -56,6 +57,16 @@
     # Convert to Numpy array for speed
     chr_gatc[chr] = np.sort(chr_gatc[chr])
 
+# Map each 0-based target ID (the line's position in the fai file)
+# to its reference name (the first column of the fai file)
+references = {}
+index = 0
+with open(fai, "r") as f:
+    for line in f:
+        line = line.strip().split("\t")
+        references[index] = line[0]
+        index += 1
+
 # Counter for number of sequences MQ filtered/extended 
 # and total number of sequences
 extended_seqs = 0
@@ -66,10 +77,16 @@
 
 logging.info(f"Extending reads up to {n} bases or nearest GATC site...")
 samfile_in = pysam.AlignmentFile(input_bam, "rb")
-#samfile_out = pysam.AlignmentFile(output_sam, "wb", template=samfile)
+
+# Open pipe to samtools to write to new BAM file
+command = ["samtools", "view", "-Shb", "-t", fai, "-o", output_bam] # @SQ header is not written
+process = subprocess.Popen(command,
+                           stdin=subprocess.PIPE, 
+                           #stdout=subprocess.PIPE,
+                           stderr=subprocess.PIPE,
+                           text=True)
 
 for read in samfile_in.fetch():
-    print(read)
     # First filter out reads with low mapping quality
     mq = read.mapping_quality
     if mq < q:
@@ -80,14 +97,13 @@
     chr_ = read.reference_name
     start = read.reference_start
     length = read.query_length
-    print(length)
     end = start + length
     
     # Get the GATC sites for the current chromosome
     tmp_gff = chr_gatc[chr_]
     
     # Remove GATC sites that are before the read end
-    tmp_gff = tmp_gff[tmp_gff >= end]
+    tmp_gff = tmp_gff[tmp_gff > end]
     
     # Find index of GATC site closest to the read end
     closest = np.argmin(abs(end - tmp_gff))
@@ -115,59 +131,30 @@
     else:
         cigar = f"{cigar}{length_to_add}M"
     
-    """
-    # Update quality scores
-    old_qual = read.query_alignment_qualities
-    
-    # Create updated read and write to file
-    new_read = pysam.AlignedSegment()
-    new_read.query_name = read.query_name
-    new_read.query_sequence = "*"
-    new_read.flag = read.flag
-    new_read.reference_id = read.reference_id
-    new_read.reference_start = read.reference_start
-    new_read.mapping_quality = read.mapping_quality
-    new_read.cigarstring = cigar
-    new_read.next_reference_id = read.next_reference_id
-    new_read.next_reference_start = read.next_reference_start
-    new_read.template_length = length #new_sequence_length
-    new_read.query_qualities = pysam.qualitystring_to_array("*")
-    new_read.tags = read.tags
-    #print(new_read)
-    """
-    
+    # Build the output read as a tab-separated SAM record
     read_out = "\t".join([
         read.query_name,
         str(read.flag),
-        str(read.reference_id),
-        str(start),
+        references[read.reference_id],
+        str(start + 1), # pysam reference_start is 0-based, SAM POS is 1-based
         str(mq),
         cigar,
         "*",
         "0",
         "0",
         "*",
-        "*",
-        read.tags
+        "*\n",
         ])
-    print(read_out)
-    # Write to BAM file
-    command = f"samtools view -Shb - > {output_bam}"
-    process = subprocess.Popen(command, stdin=subprocess.PIPE, shell=True)
-    process.stdin.write(bf"{read_out}\n")
+    
+    process.stdin.write(read_out)
     
     all_seqs += 1
     extended_seqs += 1
-    break
 
 process.stdin.close()
 process.wait()
-#samfile_in.close()
-#samfile_out.close()
-
-# Convert to BAM
-#pysam.view("-@", threads, "-o", output_bam, samfile_out)
 
-logging.info(f"Total sequences: {all_seqs}")
-logging.info(f"Sequences with MQ < {q}: {all_seqs - extended_seqs}")
-logging.info(f"Sequences extended: {extended_seqs}")
+# Write to log file
+logging.info("Total sequences: {}".format(all_seqs))
+logging.info("Sequences with MQ < {}: {}".format(q, all_seqs - extended_seqs))
+logging.info("Sequences extended: {}".format(extended_seqs))