Split IDs with some redundancy

enormandeau · Feb 1, 2023 · a137b88 · a137b88
1 parent 67f4ece
commit a137b88
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 21 deletions.
diff --git a/00_archive/TODO.md b/00_archive/TODO.md
@@ -1,7 +1,7 @@
 # Things to do before publication
 
 ## Benchmark
-* Replace the split command so we don't loose SNPs at the file edges
+- Check if the new `sort -u` commands in reports take too long
 * Copy config file to log folder for each run
 - Test on a variety of genomes / vcfs
   - Same genome with VCFs of different sizes

diff --git a/01_scripts/01_extract_positions.sh b/01_scripts/01_extract_positions.sh
@@ -3,14 +3,9 @@
 
 # Global variables
 VCF="$1"
-SPLIT_BY="$2"
+NUM_LINES="$2"
+NCPUS="$3"
+NUM_NEIGHBOURS="${4:-$NUM_NEIGHBOURS}"  # 4th argument; falls back to the environment value if omitted
 
 # Extract
-grep -v "^#" "$VCF" | cut -f -2 | split -l "$SPLIT_BY" -a 2 -d - 06_liftover/positions.
-
-# Rename
-ls -1 06_liftover/positions.* | grep -P "\.\d{2,}" |
-    while read i
-    do
-        mv "$i" "$i".ids
-    done
+./01_scripts/util/split_positions.py "$VCF" "$NUM_LINES" "$NCPUS" "$NUM_NEIGHBOURS"
diff --git a/01_scripts/util/split_positions.py b/01_scripts/util/split_positions.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""Split positions from a VCF into ncpus files.
+
+Each file contains some lines from the end of the previous file
+and the beginning of the next file.
+
+Output files are named 06_liftover/positions.NN.ids and hold the first two
+tab-separated columns of every non-header VCF line.
+
+Usage:
+    <program> input_vcf num_lines ncpus num_neighbours
+"""
+
+# Modules
+import sys
+import os
+
+# Parse user input
+try:
+    input_vcf = sys.argv[1]
+    num_lines = int(sys.argv[2])
+    ncpus = int(sys.argv[3])
+    num_neighbours = int(sys.argv[4])
+except (IndexError, ValueError):
+    # Missing or non-integer argument: show usage and stop here instead of
+    # failing later with a NameError
+    print(__doc__)
+    sys.exit(1)
+
+if ncpus < 1:
+    print(__doc__)
+    sys.exit(1)
+
+# Output file naming
+folder = "06_liftover"
+stub = "positions."
+end = ".ids"
+
+# Iterate over VCF and write positions in appropriate files
+line_num = 0
+lines_per_file = 1 + num_lines // ncpus
+
+# A chunk only shares lines with its direct neighbours, so cap the overlap at
+# one full chunk; a larger value would make the adjacency tests below skip
+# lines that should be shared
+overlap = min(num_neighbours, lines_per_file)
+
+handles = {}
+
+try:
+    # Open output handles
+    for i in range(ncpus):
+        handles[i] = open(os.path.join(folder, stub + str(i).zfill(2) + end), "wt")
+
+    with open(input_vcf) as infile:
+        for line in infile:
+            if line.startswith("#"):
+                continue
+
+            fields = line.strip().split("\t")
+            out_line = "\t".join(fields[:2]) + "\n"
+
+            # Write line to proper file
+            output_num = line_num // lines_per_file
+            handles[output_num].write(out_line)
+
+            # Add first `overlap` lines to previous file
+            output_num_prev = (line_num - overlap) // lines_per_file
+            if output_num_prev >= 0 and output_num_prev == output_num - 1:
+                handles[output_num_prev].write(out_line)
+
+            # Add last `overlap` lines to next file
+            output_num_next = (line_num + overlap) // lines_per_file
+            if output_num_next < ncpus and output_num_next == output_num + 1:
+                handles[output_num_next].write(out_line)
+
+            line_num += 1
+finally:
+    # Close (and flush) every output file, even if an error occurred
+    for handle in handles.values():
+        handle.close()
diff --git a/02_infos/snplift_config.sh b/02_infos/snplift_config.sh
@@ -9,7 +9,7 @@ export OLD_VCF="04_input_vcf/old.vcf"
 export NEW_VCF="new.vcf"
 
 # Skipping genome indexing
-export SKIP_INDEXING=0      # Save time if genome already indexed with 'bwa index' [0, 1]
+export SKIP_INDEXING=1      # Save time if genome already indexed with 'bwa index' [0, 1]
 
 # Skip exploring features
 export SKIP_VISUALIZATION=1 # Avoid creating a plot to explore features

diff --git a/snplift b/snplift
@@ -54,12 +54,10 @@ else
 fi
 
 # SNPLift
-## Compute SPLIT_BY, the number of lines that each chunk should have
-export SPLIT_BY=$(grep -vc "^#" "$OLD_VCF" | awk -v ncpu="$NCPUS" '{print 1+$1/ncpu}' | cut -d "." -f 1)
-
 ## Get original coordinates
 echo -e "\nSNPLift: Extracting positions from VCF"
-./01_scripts/01_extract_positions.sh "$OLD_VCF" "$SPLIT_BY"
+export NUM_LINES=$(grep -vc "^#" "$OLD_VCF")
+./01_scripts/01_extract_positions.sh "$OLD_VCF" "$NUM_LINES" "$NCPUS" "$NUM_NEIGHBOURS"
 
 ## Extract flanking sequences around SNPs (100bp on each side)
 echo -e "\nSNPLift: Extracting flanking sequences around SNPs"
@@ -117,15 +115,16 @@ echo -e "\nSNPLift: Writing new VCF with updated coordinates"
 
 ## Report number of SNPs for each step
 echo -e "\nSNPLift: Number of SNPs treated at each step\n"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.ids | wc -l)"\tPositions"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.sam | grep -v "^@" | cut -f 1 | uniq | wc -l)"\tAlignments"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.features | grep -v "^@" | cut -f 1 | uniq | wc -l)"\tFeatures"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.scores | wc -l)"\tScores"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.corr | wc -l)"\tTransferable"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.ids | sort -u | wc -l)"\tPositions"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.sam | grep -v "^@" | cut -f 1 | sort -u | wc -l)"\tAlignments"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.features | grep -v "^@" | cut -f 1 | sort -u | wc -l)"\tFeatures"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.scores | sort -u | wc -l)"\tScores"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.corr | sort -u | wc -l)"\tTransferable"
 
 ## Report percentage of transfered SNPs
 echo
 echo "SNPLift: Percentage of transferred SNPs:"
-echo $(echo '100 *' $(cat "$LIFTOVER_FOLDER"/positions.*.corr | wc -l | awk '{print $1}') / $(cat "$LIFTOVER_FOLDER"/positions.*.ids | wc -l | awk '{print $1}') | bc -ql | cut -c -5)"%"
+# Count unique .corr lines, as the "Transferable" report does, so the numerator and denominator are both deduplicated
+echo $(echo '100 *' $(cat "$LIFTOVER_FOLDER"/positions.*.corr | sort -u | wc -l | awk '{print $1}') / $(cat "$LIFTOVER_FOLDER"/positions.*.ids | sort -u | wc -l | awk '{print $1}') | bc -ql | cut -c -5)"%"
 
 echo -e "\nSNPLift: Run completed\n"