Replace the `split`-based position extraction with
01_scripts/util/split_positions.py, which writes one positions.NN.ids file per
CPU and also copies the `num_neighbours` lines on each side of a chunk boundary
into the adjacent file. Report counts are de-duplicated with `sort -u` because
those boundary lines now appear in two files.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Blank context lines and the indentation of
context/removed lines are reconstructed from the hunk line counts - confirm
them against the working tree before applying. The `index` blob ids are those
of the original commit and do not describe the revised added lines.

diff --git a/00_archive/TODO.md b/00_archive/TODO.md
index f9c2b89..ff0b759 100644
--- a/00_archive/TODO.md
+++ b/00_archive/TODO.md
@@ -1,7 +1,7 @@
 # Things to do before publication
 
 ## Benchmark
-* Replace the split command so we don't loose SNPs at the file edges
+- Check if new `sort -u` commands in reports take too long
 * Copy config file to log folder for each run
 - Test on a variety of genomes / vcfs
     - Same genome with VCFs of different sizes
diff --git a/01_scripts/01_extract_positions.sh b/01_scripts/01_extract_positions.sh
index c013666..5e63dd3 100755
--- a/01_scripts/01_extract_positions.sh
+++ b/01_scripts/01_extract_positions.sh
@@ -3,14 +3,9 @@
 
 # Global variables
 VCF="$1"
-SPLIT_BY="$2"
+NUM_LINES="$2"
+NCPUS="$3"
+NUM_NEIGHBOURS="${4:-${NUM_NEIGHBOURS:-}}" # 4th argument, else value from the environment
 
 # Extract
-grep -v "^#" "$VCF" | cut -f -2 | split -l "$SPLIT_BY" -a 2 -d - 06_liftover/positions.
-
-# Rename
-ls -1 06_liftover/positions.* | grep -P "\.\d{2,}" |
-    while read i
-    do
-        mv "$i" "$i".ids
-    done
+./01_scripts/util/split_positions.py "$VCF" "$NUM_LINES" "$NCPUS" "$NUM_NEIGHBOURS"
diff --git a/01_scripts/util/split_positions.py b/01_scripts/util/split_positions.py
new file mode 100755
index 0000000..e4139d4
--- /dev/null
+++ b/01_scripts/util/split_positions.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Split positions from a VCF into num_cpus files.
+
+Each file contains some lines from the end of the previous file
+and the beginning of the next file.
+
+Usage:
+    input_vcf num_lines ncpus num_neighbours
+"""
+
+# Modules
+import sys
+import os
+
+# Parse user input
+try:
+    input_vcf = sys.argv[1]
+    num_lines = int(sys.argv[2])
+    ncpus = int(sys.argv[3])
+    num_neighbours = int(sys.argv[4])
+except (IndexError, ValueError):
+    # Missing or non-integer argument: show usage and stop here rather
+    # than crashing further down with a NameError
+    print(__doc__)
+    sys.exit(1)
+
+# Open output handles
+folder = "06_liftover"
+stub = "positions."
+end = ".ids"
+
+handles = {}
+
+try:
+    for i in range(ncpus):
+        handles[i] = open(os.path.join(folder, stub + str(i).zfill(2) + end), "wt")
+
+    # Iterate over VCF and write positions in appropriate files
+    line_num = 0
+    lines_per_file = 1 + num_lines // ncpus
+
+    with open(input_vcf) as infile:
+        for line in infile:
+            if line.startswith("#"):
+                continue
+
+            fields = line.strip().split("\t")
+            out_line = "\t".join(fields[:2]) + "\n"
+
+            # Write line to proper file
+            output_num = line_num // lines_per_file
+            handles[output_num].write(out_line)
+
+            # Add first `num_neighbours` lines to previous file
+            output_num_prev = (line_num - num_neighbours) // lines_per_file
+            if output_num_prev >= 0 and output_num_prev == output_num - 1:
+                handles[output_num_prev].write(out_line)
+
+            # Add last `num_neighbours` lines to next file
+            output_num_next = (line_num + num_neighbours) // lines_per_file
+            if output_num_next < ncpus and output_num_next == output_num + 1:
+                handles[output_num_next].write(out_line)
+
+            line_num += 1
+finally:
+    # Close every handle that was opened so buffered positions reach disk,
+    # even when reading the VCF fails part-way
+    for handle in handles.values():
+        handle.close()
diff --git a/02_infos/snplift_config.sh b/02_infos/snplift_config.sh
index b71b7ff..7ab9b7b 100644
--- a/02_infos/snplift_config.sh
+++ b/02_infos/snplift_config.sh
@@ -9,7 +9,7 @@ export OLD_VCF="04_input_vcf/old.vcf"
 export NEW_VCF="new.vcf"
 
 # Skipping genome indexing
-export SKIP_INDEXING=0 # Save time if genome already indexed with 'bwa index' [0, 1]
+export SKIP_INDEXING=1 # Save time if genome already indexed with 'bwa index' [0, 1]
 
 # Skip exploring features
 export SKIP_VISUALIZATION=1 # Avoid creating a plot to explore features
diff --git a/snplift b/snplift
index e796f1c..0304e75 100755
--- a/snplift
+++ b/snplift
@@ -54,12 +54,10 @@ else
 fi
 
 # SNPLift
-## Compute SPLIT_BY, the number of lines that each chunk should have
-export SPLIT_BY=$(grep -vc "^#" "$OLD_VCF" | awk -v ncpu="$NCPUS" '{print 1+$1/ncpu}' | cut -d "." -f 1)
-
 ## Get original coordinates
 echo -e "\nSNPLift: Extracting positions from VCF"
-./01_scripts/01_extract_positions.sh "$OLD_VCF" "$SPLIT_BY"
+export NUM_LINES=$(grep -vc "^#" "$OLD_VCF")
+./01_scripts/01_extract_positions.sh "$OLD_VCF" "$NUM_LINES" "$NCPUS" "$NUM_NEIGHBOURS"
 
 ## Extract flanking sequences around SNPs (100bp on each side)
 echo -e "\nSNPLift: Extracting flanking sequences around SNPs"
@@ -117,15 +115,15 @@ echo -e "\nSNPLift: Writing new VCF with updated coordinates"
 
 ## Report number of SNPs for each step
 echo -e "\nSNPLift: Number of SNPs treated at each step\n"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.ids | wc -l)"\tPositions"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.sam | grep -v "^@" | cut -f 1 | uniq | wc -l)"\tAlignments"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.features | grep -v "^@" | cut -f 1 | uniq | wc -l)"\tFeatures"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.scores | wc -l)"\tScores"
-echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.corr | wc -l)"\tTransferable"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.ids | sort -u | wc -l)"\tPositions"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.sam | grep -v "^@" | cut -f 1 | sort -u | wc -l)"\tAlignments"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.features | grep -v "^@" | cut -f 1 | sort -u | wc -l)"\tFeatures"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.scores | sort -u | wc -l)"\tScores"
+echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.corr | sort -u | wc -l)"\tTransferable"
 
 ## Report percentage of transfered SNPs
 echo
 echo "SNPLift: Percentage of transferred SNPs:"
-echo $(echo '100 *' $(cat "$LIFTOVER_FOLDER"/positions.*.corr | wc -l | awk '{print $1}') / $(cat "$LIFTOVER_FOLDER"/positions.*.ids | wc -l | awk '{print $1}') | bc -ql | cut -c -5)"%"
+echo $(echo '100 *' $(cat "$LIFTOVER_FOLDER"/positions.*.corr | sort -u | wc -l | awk '{print $1}') / $(cat "$LIFTOVER_FOLDER"/positions.*.ids | sort -u | wc -l | awk '{print $1}') | bc -ql | cut -c -5)"%"
 
 echo -e "\nSNPLift: Run completed\n"