diff --git a/00_archive/TODO.md b/00_archive/TODO.md index 976e572..7d9b93d 100644 --- a/00_archive/TODO.md +++ b/00_archive/TODO.md @@ -1,16 +1,20 @@ # Things to do before publication ## Needs fixing -- Confirm all position from .scores files are in .corr (ie: not missing 1-2 at the interface) -- Keep only one SNP per position -- For VCFs +* Corrected VCF + - Keep only one SNP per position - If hit reverse strand, modify alleles (A/G -> T/C) - Produce new SNP id in column 3 -- Try harder to extract accurate SNP positions from sam file +- Bump version + +## Position accuracy +- Extract more accurate SNP positions - At ends of chromosomes (put info in the sequence name?) - Already have something for this? Only for chromosome starts? - When there are Ns - Other non-perfect matches + - Use only center portion around SNP and find best position + - Alternatively, do this on the left, then on the right of the SNP ## Validation script (Crash explicitly) - Input files can be found diff --git a/01_scripts/06_score_markers.py b/01_scripts/06_score_markers.py index 9ad9270..e37614f 100755 --- a/01_scripts/06_score_markers.py +++ b/01_scripts/06_score_markers.py @@ -26,7 +26,7 @@ l = line.strip().split("\t") if line.startswith("QueryName"): - header = ["QueryScaffold", "QueryName", "QueryPos", "TargetChrom", "TargetPos"] + header = ["QueryScaffold", "QueryName", "QueryPos", "TargetChrom", "TargetPos", "Reversed"] header.insert(0, "Penalties") header.insert(0, "Score") outfile.write("\t".join(header) + "\n") @@ -38,6 +38,8 @@ score = 1.0 penalties = [] query_scaffold = QueryName.split(";")[0] + if MappingFlag == "16": + MappingFlag = "1" # Alignment too short if len(Sequence) < (expected_length / 2): @@ -103,4 +105,4 @@ penalties.append(".") outfile.write("\t".join([str(round(score, 2)), "".join(penalties)] + - [query_scaffold, QueryName, QueryPos, TargetChrom, TargetPos]) + "\n") + [query_scaffold, QueryName, QueryPos, TargetChrom, TargetPos, MappingFlag]) + "\n") diff 
--git a/01_scripts/07_correspondence.py b/01_scripts/07_correspondence.py index 0ce3160..b24fdb2 100755 --- a/01_scripts/07_correspondence.py +++ b/01_scripts/07_correspondence.py @@ -87,7 +87,7 @@ def keep_snp(past, now, future): outfile.write(line) continue - #Score, Penalties, QueryScaffold, QueryName, QueryPos, TargetChrom, TargetPos + #Score, Penalties, QueryScaffold, QueryName, QueryPos, TargetChrom, TargetPos, Reversed l = line.strip().split("\t") # Get first info line diff --git a/02_infos/snplift_config.sh b/02_infos/snplift_config.sh index 9460c52..97303f9 100644 --- a/02_infos/snplift_config.sh +++ b/02_infos/snplift_config.sh @@ -9,15 +9,20 @@ export INPUT_FILE="04_input_vcf/old.vcf" export OUTPUT_FILE="new.vcf" # Skipping genome indexing -export SKIP_INDEXING=1 # Save time if genome already indexed with 'bwa index' [0, 1] +export SKIP_INDEXING=1 # Save time if genome already indexed with 'bwa index' [0, 1]. # Skip exploring features export SKIP_VISUALIZATION=1 # Avoid creating a plot to explore features. These are used - # for debugging + # for debugging [0, 1]. # Checking for collinearity between both genome versions export CHECK_COLLINEARITY=0 # Increases runtime by 5+ times. Align genomes and produce a - # collinearity comparison figure [0, 1] + # collinearity comparison figure [0, 1]. + +# Do final corrections to VCF file +export CORRECT_VCF=1 # If output file is a VCF, recompute ID column 3 from columns 1 and 2, + # reverse complement alleles of loci that map in reverse in the new + # genome, and permit only one locus per position. # Parameters export NCPUS=10 # Number of cores to use (around 10 and maximum 20 is recommended) diff --git a/README.md b/README.md index 3a4ab1d..5dfc0ea 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ and then call and filter the loci. In the process, a proportion of the loci are inevitably lost. 
However, the transferred proportion is very high for genomes with low duplication content and -when both genome versions are similar. Our test run on real data gives a 99.87% +when both genome versions are similar. Our test run on real data gives a 99.82% transfers rate. **NOTE**: Although SNPLift was designed primarily for VCFs containing SNP data, @@ -22,8 +22,10 @@ type or even bed file, as long as the two first columns contain chromosome and position information and that there are other columns with informations to transfer. -**WARNING** Ultimately, the only way to guaranty that all the positions on the -new genome are correct is to re-align the reads and call the genotypes again. +**WARNING**: In regions that differ between the two assemblies, a small +proportion of SNPs will end up with an approximate position. Ultimately, the +only way to guarantee that all the positions on the new genome are correct is to +re-align the reads and call the genotypes again. See licence information at the end of this file. @@ -85,8 +87,7 @@ assemblies from *Medicago truncatula* and a VCF with SNPs found in the first chromosome of the reference genome. The VCF contains the genotypes of 10 samples for 190,443 SNPs. The test takes about 1m20s on 10 Xeon processors from 2020. About 1m is used to index the old genome for alignment with bwa. The rest -of the steps take about 18s. For this test run, based on real data, 99.87% of -the SNPs are transferred. +of the steps take about 18s. You can run the full SNPLift test with: @@ -140,8 +141,13 @@ The output of SNPLift is a file (eg: VCF) in which the positions for which a good alignment was found are transferred to the coordinates of a new reference genome. -Optionally, if `CHECK_COLLINEARITY` is set to `1` a collinearity figure in .eps -and .pdf formats is produced. 
+Optionally, if `CORRECT_VCF` is set to `1`, column 3 of the VCF containing locus +IDs will be recomputed from columns 1 and 2, alleles for loci that map in +the reverse orientation in the new genome will be reverse complemented, and +only one locus will be retained if multiple loci map to the same position. + +Optionally, if `CHECK_COLLINEARITY` is set to `1`, a collinearity figure in +.eps and .pdf formats is produced. Optionally, if `SKIP_VISUALIZATION` is set to `0`, a figure showing some of the features used for filtering the alignments is produced. This is used mainly for