From 18a0dedf9e044e8ea0061296a11808bfe21b8710 Mon Sep 17 00:00:00 2001 From: JokingHero Date: Fri, 19 Apr 2024 14:01:32 +0200 Subject: [PATCH] new options for the alghoritm, maybe this will make things possible --- src/CHOPOFF.jl | 11 ++++++- src/db_vcf.jl | 55 ++++++++++++++++++++++------------- test/src/db.jl | 5 ++-- test/src/db_extends5_false.jl | 2 +- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/src/CHOPOFF.jl b/src/CHOPOFF.jl index 5452865e..9c7eae2c 100644 --- a/src/CHOPOFF.jl +++ b/src/CHOPOFF.jl @@ -298,6 +298,13 @@ function parse_commandline(args::Array{String}) help = "Defines length of the hash. " arg_type = Int required = false + "--reuse_saved_not" + help = "Whether to reuse paths that were saved for Cas9 distance 4 and prefix 16." + action = :store_true + "--variant_overlaps" + help = "Whether to check for all potential combinations of alternate alleles for nearby variants. " * + "Only use with small VCF files! Preferably only run for specific variants." + action = :store_true end @add_arg_table! s["build"]["pamDB"] begin @@ -524,7 +531,9 @@ function main(args::Array{String}) if hash_len === nothing hash_len = min(length_noPAM(motif) - (motif.distance), 16) end - build_vcfDB(args["name"], args["genome"], args["vcfDB"]["vcf"], motif, args["output"], hash_len) + build_vcfDB(args["name"], args["genome"], args["vcfDB"]["vcf"], motif, args["output"], hash_len; + reuse_saved = !args["vcfDB"]["reuse_saved_not"], + variant_overlaps = args["vcfDB"]["variant_overlaps"]) elseif args["%COMMAND%"] == "fmi" build_fmiDB(args["genome"], args["output"]) elseif args["%COMMAND%"] == "pamDB" diff --git a/src/db_vcf.jl b/src/db_vcf.jl index 68795ba8..2b74276b 100644 --- a/src/db_vcf.jl +++ b/src/db_vcf.jl @@ -35,7 +35,8 @@ function parse_vcf(vcf_filepath::String) recordNum += 1 end close(reader) - return rs_ids, rs_chroms, rs_ref, rs_ranges, rs_alt + order = sortperm(collect(zip(rs_chroms, rs_ranges))) + return rs_ids[order], rs_chroms[order], rs_ref[order], rs_ranges[order], rs_alt[order] end @@ -48,12 +49,15 @@ build_vcfDB( motif::Motif, storage_path::String, hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); - reuse_saved::Bool = true) + reuse_saved::Bool = true, + variant_overlaps = false) ``` Builds a database of all potential off-targets that overlap any of the variants in the VCF file. It supports combinations of variants that are close to each other, will report all possible combinations of variants. This database uses simialr principles to `prefixHashDB`, also utilizes hashed prefix of specific length. +In case of troubles with loading of VCF files, the only fields that we use are ID, CHROM, POS, REF, ALT, so its +often possible to remove INFO field and other unnecesary fields which may cause troubles. # Arguments @@ -72,6 +76,8 @@ variants. This database uses simialr principles to `prefixHashDB`, also utilizes `reuse_saved` - Whether to reuse paths that were saved for Cas9 distance 4 and prefix 16. +`variant_overlaps` - Whether to check for all potential combinations of alternate alleles for nearby variants. + Only use with small VCF files! Preferably only run for specific variants. # Examples ```julia @@ -84,19 +90,21 @@ function build_vcfDB( vcfpath::String, motif::Motif, storage_path::String, - hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); reuse_saved = true) + hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); + reuse_saved = true, + variant_overlaps = false) dbi = DBInfo(genomepath, name, motif; vcf_filepath = vcfpath) hash_type = CHOPOFF.smallestutype(parse(UInt, repeat("1", hash_len * 2); base = 2)) suffix_type = CHOPOFF.smallestutype(parse(UInt, repeat("1", (CHOPOFF.length_noPAM(motif) - hash_len + motif.distance) * 2); base = 2)) ot_type = CHOPOFF.smallestutype(parse(UInt, repeat("1", (CHOPOFF.length_noPAM(motif) + motif.distance) * 2); base = 2)) - + + @info "Step 1: Reading VCF file." rs_ids, rs_chroms, rs_ref, rs_ranges, rs_alt = CHOPOFF.parse_vcf(vcfpath) l = length_noPAM(motif) + motif.distance lp = length(motif) + motif.distance # with PAM - @info "Step 1: Parsing the genomic relation to the VCF file." - # For each chromosome parallelized we build database + @info "Step 2: Parsing the genomic relation to the VCF file." ref = open(dbi.gi.filepath, "r") reader = dbi.gi.is_fa ? FASTA.Reader(ref, index = dbi.gi.filepath * ".fai") : TwoBit.Reader(ref) @@ -107,27 +115,34 @@ function build_vcfDB( ambig_annot = Vector{String}() # change to InlineStrings for ch in unique(rs_chroms) - #ch = first(unique(rs_chroms)) # REMOVE - ch_ = convert(dbi.gi.chrom_type, findfirst(isequal(ch), dbi.gi.chrom)) + ch_numeric = findfirst(isequal(ch), dbi.gi.chrom) + if isnothing(ch_numeric) + @warn("Chromosome " * string(ch) * " is not indexed for specified genome, skipping it.") + continue + end + ch_ = convert(dbi.gi.chrom_type, ch_numeric) + @info "Working on " * string(ch) chrom_seq = CHOPOFF.getchromseq(dbi.gi.is_fa, reader[ch]) chrom_idx = findall(isequal(ch), rs_chroms) - # now for every snp (we assume they are sorted) + # group snps by proximity - as we have to enumerate all permutations of rs_alt for them grouping = 1 - grouping_idx = ones(Int, length(chrom_idx)) - for i in 1:(length(chrom_idx) - 1) - x = (rs_ranges[chrom_idx[i]].start - l):(rs_ranges[chrom_idx[i]].stop + l) - if length(intersect(x, rs_ranges[chrom_idx[i+1]])) > 0 # rs overlaps - grouping_idx[i + 1] = grouping - else - grouping += 1 - grouping_idx[i + 1] = grouping + grouping_idx = collect(1:length(chrom_idx)) + if variant_overlaps # we need to correct grouping, otherwise each variant is in its own group + for i in 1:(length(chrom_idx) - 1) + x = (rs_ranges[chrom_idx[i]].start - l):(rs_ranges[chrom_idx[i]].stop + l) + if length(intersect(x, rs_ranges[chrom_idx[i+1]])) > 0 # rs overlaps + grouping_idx[i + 1] = grouping + else + grouping += 1 + grouping_idx[i + 1] = grouping + end end end # for each group we analyze these snps together - for group in unique(grouping_idx) + @showprogress dt=60 for group in unique(grouping_idx) # group = first(unique(grouping_idx)) idxs = chrom_idx[grouping_idx .== group] if length(idxs) == 1 # simple case - singular snp - potentially many alternate alleles @@ -237,7 +252,7 @@ function build_vcfDB( end close(ref) - @info "Step 2: Constructing Paths for hashes" + @info "Step 3: Constructing Paths for hashes" paths = nothing if (reuse_saved && (motif.distance <= 4) && (hash_len <= 16)) m2 = Motif("Cas9") @@ -276,7 +291,7 @@ function build_vcfDB( end if length(ambig_guides) > 0 - @info "Step 3: Constructing DB for ambigous gRNAs." + @info "Step 4: Constructing DB for ambigous gRNAs." order = sortperm(ambig_guides) ambig_guides = ambig_guides[order] ambig_chrom = ambig_chrom[order] diff --git a/test/src/db.jl b/test/src/db.jl index 3a91ea1f..9cd708a0 100644 --- a/test/src/db.jl +++ b/test/src/db.jl @@ -6,7 +6,7 @@ using CSV using DataFrames ## SET WD when debugging -# cd("test") + cd("test") ## CRISPRitz compare functions - we test with up to 4 distance function asguide(x::String) @@ -94,7 +94,8 @@ end vcf_storage_path = joinpath(vcf_path, "vcfDB.bin") build_vcfDB( "samirandom", genome, vcf, - Motif("Cas9"; distance = 2, ambig_max = 3), vcf_storage_path) + Motif("Cas9"; distance = 2, ambig_max = 3), vcf_storage_path; + variant_overlaps = true) detail_path_vcf = joinpath(vcf_path, "output.csv") search_vcfDB(vcf_storage_path, guides, detail_path_vcf; distance = 2, diff --git a/test/src/db_extends5_false.jl b/test/src/db_extends5_false.jl index 844e07fb..e91ea232 100644 --- a/test/src/db_extends5_false.jl +++ b/test/src/db_extends5_false.jl @@ -57,7 +57,7 @@ end vcf_storage_path = joinpath(vcf_path, "vcfDB.bin") build_vcfDB( "samirandom", genome, vcf, - Motif("Cas12a"; distance = 1, ambig_max = 3), vcf_storage_path) + Motif("Cas12a"; distance = 1, ambig_max = 3), vcf_storage_path; variant_overlaps = true) detail_path_vcf = joinpath(vcf_path, "output.csv") search_vcfDB(vcf_storage_path, guides, detail_path_vcf; distance = 1,