Added parameters to config and scripts

enormandeau · Jan 13, 2023 · 71b12ab · 71b12ab
1 parent c7c6d1d
commit 71b12ab
Show file tree

Hide file tree

Showing 9 changed files with 64 additions and 38 deletions.
diff --git a/00_archive/TODO.md b/00_archive/TODO.md
@@ -1,35 +1,50 @@
 # Version name ideas
 
-Funny Tofu, Flying Carrot, Pumped Potato, Refreshing Tea, Shiny Zucchini
-Scalding Coffee, Sturdy Beetroot, Extravagant Pepper, Drifting Artichoke
-Sparkling Apple, Mindful Pear, Electric Bean, Jumping Squash, Whispering Corn
+Blushing.Pepper Elastic.Jujube Exuberant.Pear Flying.Carrot Refreshing.Tea
+Running.Popsicle Scalding.Coffee Shiny.Zucchini Spicy.Soup Spiny.Artichoke
+Squishy.Squid Sticky.Jam Sturdy.Beetroot Tangy.Miso
 
 # Things to do before publication
 
 ## Config file
-- Add parameters for the filters
+- Bump to v0.2.0 Funny Tofu
 
-## Validation script
-- Input files can be found
-- Scaffold names in old genome are the same as those in the VCF
-- Scaffold name formats are OK in all input files
+## Make snplift crash if one program crashes
+
+## Benchmark
+- Test on a variety of genomes / vcfs
+  - Same genome with VCFs of different sizes
+  - Collect genome-size, VCF-size, runtime
+  - Build a regression model with gsize and vsize + interaction
+  - Estimate runtime as a function
+    - runtime = ag + bv + cgv + d
+- Suggest dividing by 10 and multiplying by their test runtime
+- Run SNPLift on its result to try to get back the original (test dataset)
+  - Check that you recover the original positions
+- Test on new Linux computer / MacOS
 
-## Doc
+## Revise MS
+- Modify to reflect changes
+- Tell Davoud it is his turn
+
+## Documentation
 - Improve format using doc from barque, GAWN and stacks workflow
 - Add species name for the test dataset
 - Confirm all the dependencies
 - Explicitly describe VCF format (3 first columns)
 - Describe behaviour (eg: write lines with `#` without treatment)
+- Bump to v0.3.0 Careful Mango
 
-## Benchmark
-- Give time estimate based on genome size as well as number of SNPs and samples
-- Give estimate in minutes AND as a factor compared to test run
-- Run SNPLift on its result to try to get back the original (test dataset)
-  - See if you still lose some alignments
-    - We get 99.21% transfer
-  - Check that you recover the original positions
+## Validation script
+- Input files can be found
+- Scaffold names in old genome are the same as those in the VCF
+- Scaffold name formats are OK in all input files
+- Bump to v0.4.0 Jumping Squash
 
-## Test on new Linux computer / MacOS
+## Publish
+- Bump to v1.0.0 Mindful Peach
+- Publish on bioRxiv
+- Submit somewhere
 
 # Maybe
 - Check nucleotide distance of genomes using first chromosome

diff --git a/01_scripts/03_map_reads.sh b/01_scripts/03_map_reads.sh
@@ -4,5 +4,6 @@
 # Global variables
 GENOME="$1"
 POSITIONS="$2"
+NCPUS="$3"
 
-bwa mem -t 20 "$GENOME" "$POSITIONS"
+bwa mem -t "$NCPUS" "$GENOME" "$POSITIONS"
diff --git a/01_scripts/05_explore_features.R b/01_scripts/05_explore_features.R
@@ -35,8 +35,8 @@ subset = dd[dd$MappingFlag < 2000 &
 
 # Plot variables of interest
 set.seed(123)
-subset.random = subset[sample(nrow(subset), 50000)]
+subset.random = subset[sample(nrow(subset), 10000)]
 
-png("positions.features.png", width=1000, height=1000)
+png("06_liftover/positions.features.png", width=1000, height=1000)
 plot(subset.random, col="#00000011", pch=19)
 dev.off()
diff --git a/01_scripts/06_score_markers.py b/01_scripts/06_score_markers.py
@@ -5,29 +5,29 @@
     <program> input_features output_scores
 
 Criteria and quality penalty (as a fraction of 1):
-    Flag > 2000: -1.0
-    Quality < 10: -0.5
-    dd$SuppAlignMinDiff < 4: -0.8
-    dd$NumDiff > 0.05 * len(sequence): -0.3
-    dd$Softclip > 0.25 * len(sequence): -0.2
-    (dd$Match + dd$Softclip) < 0.9 * len(sequence): -0.3
-    (dd$Softclip - 0.05 * len(sequence)) / (dd$NumNs + 1) <= 1.1: -0.5
+    Flag > 2000: 1.0
+    Quality < 10: 0.5
+    dd$SuppAlignMinDiff < 4: 0.8
+    dd$NumDiff > 0.05 * len(sequence): 0.3
+    dd$Softclip > 0.25 * len(sequence): 0.2
+    (dd$Match + dd$Softclip) < 0.9 * len(sequence): 0.3
+    (dd$Softclip - 0.05 * len(sequence)) / (dd$NumNs + 1) <= 1.1: 0.5
 """
 
 # Modules
 import sys
 
-# Add to parameters if needed
-expected_length = 200
-
 # Parse user input
 try:
     input_features = sys.argv[1]
     output_scores = sys.argv[2]
+    window_size = int(sys.argv[3])
 except:
     print(__doc__)
     sys.exit(1)
 
+expected_length = 2 * window_size
+
 # Score away!
 with open(input_features, "rt") as infile:
     with open(output_scores, "wt") as outfile:

diff --git a/01_scripts/07_correspondence.py b/01_scripts/07_correspondence.py
@@ -75,11 +75,12 @@
                 # Compute useful neighbourhood metrics
                 scores = [float(x[0]) for x in infos]
                 average = round(sum(scores) / len(scores), 2)
-                num_negative = len([x for x in scores if x <= 0.0])
 
                 if average < 0.2:
                     continue
 
+                num_negative = len([x for x in scores if x <= 0.0])
+
                 if num_negative > window_size / 2:
                     continue
 

diff --git a/01_scripts/util/collinearity.sh b/01_scripts/util/collinearity.sh
@@ -4,10 +4,11 @@
 # Global variables
 NEW_GENOME="$1"
 OLD_GENOME="$2"
+NCPUS="$3"
 FOLDER="05_collinearity"
 
 # Align genomes with minimap
-minimap2 -t20 -x asm5 -o "$FOLDER"/correspondence.paf "$NEW_GENOME" "$OLD_GENOME"
+minimap2 -t"$NCPUS" -x asm10 -o "$FOLDER"/correspondence.paf "$NEW_GENOME" "$OLD_GENOME"
 
 # Plot collinearity with minidot
 minidot -m 1000 -i 0.5 -s 10000 -w 1000 -f 11 "$FOLDER"/correspondence.paf > "$FOLDER"/collinearity.eps && epstopdf "$FOLDER"/collinearity.eps

diff --git a/02_infos/snplift_config.sh b/02_infos/snplift_config.sh
@@ -9,9 +9,11 @@ export OLD_VCF="04_input_vcf/old.vcf"
 export NEW_VCF="new.vcf"
 
 # Skipping steps
-export SKIP_COLLINEARITY=1  # Skip alignment of both genomes and visual collinearity comparison [0, 1]
-export SKIP_INDEXING=1      # Skip indexing the genome if it is already indexed with `bwa index` [0, 1]
+export SKIP_COLLINEARITY=0  # Skip alignment of both genomes and visual collinearity comparison [0, 1]
+export SKIP_INDEXING=0      # Skip indexing the genome if it is already indexed with `bwa index` [0, 1]
 
 # Parameters
+export NCPUS=10             # Number of cores to use for mapping steps (minimap2, bwa mem)
 export WINDOW_LENGTH=100    # Window length in bp; passed by snplift to 06_score_markers.py
-export NUM_NEIGHBOURS=10    # Number of neigbour SNPs to consider when trying to recuperate more dubious SNPs
+export NUM_NEIGHBOURS=10    # Number of neighbour SNPs to consider when trying to recuperate
+                            #   more dubious SNPs using local correlations of positions
diff --git a/README.md b/README.md
@@ -116,6 +116,12 @@ of SNPs that can be transfered will go down. Whole or partial genome
 duplication will also have an impact on the capacity to transfer SNPs between
 assemblies.
 
+For SNPs located within 100bp (or the value of WINDOW_LENGTH in the
+configuration file) of a scaffold extremity, the reported position in the new
+VCF may be slightly off. Measures are taken to correct for this at the
+beginning of scaffolds and for alignments with some soft clipping, but SNPs
+at the end of scaffolds may be off by up to WINDOW_LENGTH nucleotides.
+
 ## License
 
 CC share-alike

diff --git a/snplift b/snplift
@@ -24,7 +24,7 @@ echo -e "\n$(head -1 README.md | cut -c 3-)"
 if [ "$SKIP_COLLINEARITY" == "0" ]
 then
     echo -e "\nSNPLift: Assessing collinearity of the two genomes with minimap2"
-    ./01_scripts/util/collinearity.sh "$NEW_GENOME" "$OLD_GENOME"
+    ./01_scripts/util/collinearity.sh "$NEW_GENOME" "$OLD_GENOME" "$NCPUS"
 
 elif [ "$SKIP_COLLINEARITY" == "1" ]
 then
@@ -64,7 +64,7 @@ echo -e "\nSNPLift: Extracting flanking sequences around SNPs in old genome"
 
 ## Map reads with bwa (keep best hit)
 echo -e "\nSNPLift: Mapping flanking sequences on new genome with bwa\n"
-./01_scripts/03_map_reads.sh "$NEW_GENOME" "$LIFTOVER_FOLDER"/positions.fasta > "$LIFTOVER_FOLDER"/positions.sam
+./01_scripts/03_map_reads.sh "$NEW_GENOME" "$LIFTOVER_FOLDER"/positions.fasta "$NCPUS" > "$LIFTOVER_FOLDER"/positions.sam
 
 ## Extract features from alignments
 echo -e "\nSNPLift: Extract features from alignments"
@@ -76,7 +76,7 @@ R -q -e 'source("./01_scripts/05_explore_features.R")' 2>/dev/null
 
 ## Score markers based on alignments (detail them)
 echo -e "\nSNPLift: Score markers based on extracted features"
-./01_scripts/06_score_markers.py "$LIFTOVER_FOLDER"/positions.features "$LIFTOVER_FOLDER"/positions.scores
+./01_scripts/06_score_markers.py "$LIFTOVER_FOLDER"/positions.features "$LIFTOVER_FOLDER"/positions.scores "$WINDOW_LENGTH"
 
 ## Keep good loci and try to recuperate bad alignments if locally collinear
 echo -e "\nSNPLift: Getting coordinates of transferable SNPs"