Skip to content

Commit

Permalink
Split IDs with some redundancy
Browse files Browse the repository at this point in the history
  • Loading branch information
enormandeau committed Feb 1, 2023
1 parent 67f4ece commit a137b88
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 21 deletions.
2 changes: 1 addition & 1 deletion 00_archive/TODO.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Things to do before publication

## Benchmark
* Replace the split command so we don't lose SNPs at the file edges
- Check if new `sort -u` commands in reports takes too long
* Copy config file to log folder for each run
- Test on a variety of genomes / vcfs
- Same genome with VCFs of different sizes
Expand Down
12 changes: 3 additions & 9 deletions 01_scripts/01_extract_positions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@

# Global variables
VCF="$1"
SPLIT_BY="$2"
NUM_LINES="$2"
NCPUS="$3"

# Extract
grep -v "^#" "$VCF" | cut -f -2 | split -l "$SPLIT_BY" -a 2 -d - 06_liftover/positions.

# Rename
ls -1 06_liftover/positions.* | grep -P "\.\d{2,}" |
while read i
do
mv "$i" "$i".ids
done
./01_scripts/util/split_positions.py "$VCF" "$NUM_LINES" "$NCPUS" "$NUM_NEIGHBOURS"
60 changes: 60 additions & 0 deletions 01_scripts/util/split_positions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""Split positions from a VCF into num_cpus files.
Each file contains some lines from the end of the previous file
and the beginning of the next file.
Usage:
<program> input_vcf num_lines ncpus num_neighbours
"""

# Modules
import sys
import os

# Output files are named <folder>/<stub><zero-padded chunk index><end>
folder = "06_liftover"
stub = "positions."
end = ".ids"


def split_positions(input_vcf, num_lines, ncpus, num_neighbours):
    """Split the positions of a VCF into `ncpus` overlapping chunk files.

    For every non-header line of `input_vcf`, the first two tab-separated
    columns are written to one chunk file. The first `num_neighbours` lines
    of a chunk are also appended to the previous chunk file, and the last
    `num_neighbours` lines of a chunk are also written to the next one, so
    that adjacent files overlap at their edges.

    Parameters:
        input_vcf: path of the VCF file to read.
        num_lines: number of non-header lines expected in `input_vcf`.
        ncpus: number of chunk files to create (must be >= 1).
        num_neighbours: number of edge lines shared with each adjacent chunk.

    Raises:
        OSError: if the input or one of the output files cannot be opened.
    """
    # The extra line ensures that `ncpus` chunks can hold `num_lines` records
    lines_per_file = 1 + num_lines // ncpus

    handles = []

    try:
        # Open output handles
        for i in range(ncpus):
            path = os.path.join(folder, stub + str(i).zfill(2) + end)
            handles.append(open(path, "wt"))

        # Iterate over VCF and write positions in appropriate files
        line_num = 0

        with open(input_vcf) as infile:
            for line in infile:
                if line.startswith("#"):
                    continue

                fields = line.strip().split("\t")
                out_line = "\t".join(fields[:2]) + "\n"

                # Records beyond `num_lines` go to the last chunk instead of
                # failing on a chunk index that has no output file
                chunk = min(line_num // lines_per_file, ncpus - 1)
                offset = line_num - chunk * lines_per_file

                # Write line to proper file
                handles[chunk].write(out_line)

                # Add first `num_neighbours` lines to previous file.
                # Testing the offset inside the chunk (rather than the chunk
                # index of a shifted line number) keeps the overlap complete
                # when `num_neighbours` is larger than a chunk.
                if chunk > 0 and offset < num_neighbours:
                    handles[chunk - 1].write(out_line)

                # Add last `num_neighbours` lines to next file
                if (chunk + 1 < ncpus
                        and offset + num_neighbours >= lines_per_file):
                    handles[chunk + 1].write(out_line)

                line_num += 1

    finally:
        # Flush and release every output file, even after an error
        for handle in handles:
            handle.close()


def main(argv):
    """Parse the command line in `argv` and split the VCF positions.

    Prints the usage message and exits with status 1 when an argument is
    missing, is not an integer where one is expected, or is out of range.
    """
    # Parse user input
    try:
        input_vcf = argv[1]
        num_lines = int(argv[2])
        ncpus = int(argv[3])
        num_neighbours = int(argv[4])
    except (IndexError, ValueError):
        print(__doc__)
        sys.exit(1)

    # At least one output file is needed and a line count cannot be negative
    if ncpus < 1 or num_lines < 0:
        print(__doc__)
        sys.exit(1)

    split_positions(input_vcf, num_lines, ncpus, num_neighbours)


if __name__ == "__main__":
    main(sys.argv)
2 changes: 1 addition & 1 deletion 02_infos/snplift_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export OLD_VCF="04_input_vcf/old.vcf"
export NEW_VCF="new.vcf"

# Skipping genome indexing
export SKIP_INDEXING=0 # Save time if genome already indexed with 'bwa index' [0, 1]
export SKIP_INDEXING=1 # Save time if genome already indexed with 'bwa index' [0, 1]

# Skip exploring features
export SKIP_VISUALIZATION=1 # Avoid creating a plot to explore features
Expand Down
18 changes: 8 additions & 10 deletions snplift
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,10 @@ else
fi

# SNPLift
## Compute SPLIT_BY, the number of lines that each chunk should have
export SPLIT_BY=$(grep -vc "^#" "$OLD_VCF" | awk -v ncpu="$NCPUS" '{print 1+$1/ncpu}' | cut -d "." -f 1)

## Get original coordinates
echo -e "\nSNPLift: Extracting positions from VCF"
./01_scripts/01_extract_positions.sh "$OLD_VCF" "$SPLIT_BY"
export NUM_LINES=$(grep -vc "^#" "$OLD_VCF")
./01_scripts/01_extract_positions.sh "$OLD_VCF" "$NUM_LINES" "$NCPUS" "$NUM_NEIGHBOURS"

## Extract flanking sequences around SNPs (100bp on each side)
echo -e "\nSNPLift: Extracting flanking sequences around SNPs"
Expand Down Expand Up @@ -117,15 +115,15 @@ echo -e "\nSNPLift: Writing new VCF with updated coordinates"

## Report number of SNPs for each step
echo -e "\nSNPLift: Number of SNPs treated at each step\n"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.ids | wc -l)"\tPositions"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.sam | grep -v "^@" | cut -f 1 | uniq | wc -l)"\tAlignments"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.features | grep -v "^@" | cut -f 1 | uniq | wc -l)"\tFeatures"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.scores | wc -l)"\tScores"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.corr | wc -l)"\tTransferable"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.ids | sort -u | wc -l)"\tPositions"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.sam | grep -v "^@" | cut -f 1 | sort -u | wc -l)"\tAlignments"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.features | grep -v "^@" | cut -f 1 | sort -u | wc -l)"\tFeatures"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.scores | sort -u | wc -l)"\tScores"
echo -e $(cat "$LIFTOVER_FOLDER"/positions.*.corr | sort -u | wc -l)"\tTransferable"

## Report percentage of transferred SNPs
echo
echo "SNPLift: Percentage of transferred SNPs:"
echo $(echo '100 *' $(cat "$LIFTOVER_FOLDER"/positions.*.corr | wc -l | awk '{print $1}') / $(cat "$LIFTOVER_FOLDER"/positions.*.ids | wc -l | awk '{print $1}') | bc -ql | cut -c -5)"%"
echo $(echo '100 *' $(cat "$LIFTOVER_FOLDER"/positions.*.corr | wc -l | awk '{print $1}') / $(cat "$LIFTOVER_FOLDER"/positions.*.ids | sort -u | wc -l | awk '{print $1}') | bc -ql | cut -c -5)"%"

echo -e "\nSNPLift: Run completed\n"

0 comments on commit a137b88

Please sign in to comment.