diff --git a/src/db_prefix_hash.jl b/src/db_prefix_hash.jl
index 6b99e567..c02b223c 100644
--- a/src/db_prefix_hash.jl
+++ b/src/db_prefix_hash.jl
@@ -224,7 +224,7 @@ function build_prefixHashDB(
     suffixes = convert.(suffix_type, mask .& guides)
     guides = nothing
 
-    order = sortperm(prefixes)
+    order = sortperm(collect(zip(prefixes, suffixes)))
     prefixes = prefixes[order]
     suffixes = suffixes[order]
     chrom = chrom[order]
@@ -385,7 +385,7 @@ function search_prefixHashDB(
     paths = db.mpt.paths[db.mpt.paths_distances .<= distance, :]
     mkpath(dirname(output_file))
 
-    Base.map(guides_) do g # maybe a function would be faster than lambda here?
+    ThreadsX.map(guides_) do g
         guides_formated = CHOPOFF.guide_to_template_format(g; alphabet = CHOPOFF.ALPHABET_TWOBIT)
         sa = guides_formated[paths]
         sa = Base.map(x -> CHOPOFF.asUInt(eltype(db.prefix), x), eachrow(sa))
@@ -407,7 +407,16 @@ function search_prefixHashDB(
             return
         end
 
-        @inbounds for i in sa # each sa is range of indices of prefixes where all ots are the same
+        if length(sa) != 0
+            sa = Base.mapreduce(vcat, sa) do x # split sa based on suffixes
+                finds = findall(diff(db.suffix[x]) .!= 0)
+                stops = vcat(finds, length(x)) .+ x.start .- 1
+                starts = vcat(0, finds) .+ x.start
+                Base.map(x -> UnitRange(x[1], x[2]), zip(starts, stops))
+            end
+        end
+
+        @inbounds for i in sa # each sa is range of indices of prefixes where all prefixes are the same
             ot = LongDNA{4}((convert(ot_type, db.prefix[i.start]) << (2 * s_len)) |
                 convert(ot_type, db.suffix[i.start]), ot_len)
             aln = CHOPOFF.align(g, ot, distance, iscompatible)
diff --git a/src/db_vcf.jl b/src/db_vcf.jl
index 96528a76..0c71a34d 100644
--- a/src/db_vcf.jl
+++ b/src/db_vcf.jl
@@ -302,28 +302,28 @@ function build_vcfDB(
         if (sum(grouping_idx_ones) > 0)
             chrom_idx_ones = chrom_idx[grouping_idx_ones]
             append!(ambig_vots,
-            ThreadsX.mapreduce(x -> find_ots_one(lp, chrom_seq, x[1], x[2], x[3], x[4], ch_, dbi), vcat,
-                zip(rs_ranges[chrom_idx_ones],
-                rs_ref[chrom_idx_ones],
-                rs_ids[chrom_idx_ones],
-                rs_alt[chrom_idx_ones]);
-                init = Vector{VarOT}()))
+                ThreadsX.mapreduce(x -> find_ots_one(lp, chrom_seq, x[1], x[2], x[3], x[4], ch_, dbi), vcat,
+                    zip(rs_ranges[chrom_idx_ones],
+                    rs_ref[chrom_idx_ones],
+                    rs_ids[chrom_idx_ones],
+                    rs_alt[chrom_idx_ones]);
+                    init = Vector{VarOT}()))
         end
 
         if (length(grouping_idx) > 0)
            chrom_idx_not_ones = chrom_idx[.!grouping_idx_ones]
            append!(ambig_vots,
-            ThreadsX.mapreduce(vcat, unique(grouping_idx); init = Vector{VarOT}()) do x
-                first = searchsortedfirst(grouping_idx, x)
-                last = searchsortedlast(grouping_idx, x)
-                first_last = chrom_idx_not_ones[first:last]
-                return find_ots_many(
-                    lp, chrom_seq,
-                    rs_ranges[first_last],
-                    rs_ref[first_last],
-                    rs_ids[first_last],
-                    rs_alt[first_last], ch_, dbi)
-            end)
+                ThreadsX.mapreduce(vcat, unique(grouping_idx); init = Vector{VarOT}()) do x
+                    first = searchsortedfirst(grouping_idx, x)
+                    last = searchsortedlast(grouping_idx, x)
+                    first_last = chrom_idx_not_ones[first:last]
+                    return find_ots_many(
+                        lp, chrom_seq,
+                        rs_ranges[first_last],
+                        rs_ref[first_last],
+                        rs_ids[first_last],
+                        rs_alt[first_last], ch_, dbi)
+                end)
         end
     end
     close(ref)
diff --git a/src/find_offtargets.jl b/src/find_offtargets.jl
index 7b772e6b..de84e78b 100644
--- a/src/find_offtargets.jl
+++ b/src/find_offtargets.jl
@@ -200,9 +200,12 @@ end
 
 """
 ```
-gatherofftargets!(
+function gatherofftargets!(
     output::T,
-    dbi::DBInfo) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
+    dbi::DBInfo;
+    remove_pam::Bool = true,
+    normalize::Bool = true,
+    restrict_to_len::Union{Nothing, Int64} = nothing) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
 ```
 
 Gathers all off-targets that conform to the given `dbi` Motif.
@@ -211,6 +214,11 @@ This function appends to the `output` during the run, however it will also retur
 guides in return object. We can use UInt64 and UInt128 to compress space that the gRNAs use.
 When using large genomes or non-specific PAMs you might run out of memory when using this function.
 
+remove_pam - whether the PAM sequence should be removed
+normalize - whether all guides should be flipped into the PAMseqEXT orientation, e.g. GGn-20N-3bp
+restrict_to_len - restricts guides to a specific length, smaller than the initial motif;
+    this includes/excludes the PAM depending on remove_pam, as remove_pam is applied before the length restriction
+
 # Examples
 ```julia
 # use CHOPOFF example genome
@@ -235,7 +243,10 @@ guides2 = String.(LongDNA{4}.(guides2, guide_with_extension_len))
 """
 function gatherofftargets!(
     output::T,
-    dbi::DBInfo) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
+    dbi::DBInfo;
+    remove_pam::Bool = true,
+    normalize::Bool = true,
+    restrict_to_len::Union{Nothing, Int64} = nothing) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
     ref = open(dbi.gi.filepath, "r")
     reader = dbi.gi.is_fa ? FASTA.Reader(ref, index = dbi.gi.filepath * ".fai") : TwoBit.Reader(ref)
 
diff --git a/test/src/db.jl b/test/src/db.jl
index c970d0f0..5bf4a58a 100644
--- a/test/src/db.jl
+++ b/test/src/db.jl
@@ -342,6 +342,39 @@ end
         @test nrow(ldbes) >= 2
     end
 
+    #=
+    @testset "linearDB vs prefixHashDB on distance 3 and semirandom1"
+        motif = Motif("Cas9"; distance = 1)
+        # take all possible guides on our semirandom genome and query them
+        dbi = DBInfo(genome, "Cas9_semirandom_noVCF", motif)
+        # finally gather all off-targets
+        guides = Vector{String}()
+        # guides are GGN...20bp+ext
+        ambig = gatherofftargets!(guides, dbi; remove_pam = true, normalize = true)
+        guides = LongDNA{4}.(guides)
+        guides = Base.map(x -> x[1:20], guides)
+        reverse!.(guides)
+
+        # lets randomize here some
+        phdb_path = joinpath(tdir, "prefixHashDBes")
+        mkpath(phdb_path)
+        build_prefixHashDB("samirandom", genome, setdist(Motif("Cas9"), 3), phdb_path)
+        detail_path = joinpath(ldb_path, "detail2.csv")
+        detail_path_es = joinpath(phdb_path, "detail_es.csv")
+        for g in guides
+            @info g
+            search_linearDB(ldb_path, [g], detail_path; distance = 3)
+            ldb = DataFrame(CSV.File(detail_path))
+            search_prefixHashDB(phdb_path, [g], detail_path_es;
+                distance = 3,
+                early_stopping = [100000, 100000, 100000, 100000])
+            pdbes = DataFrame(CSV.File(detail_path_es))
+            failed = antijoin(ldb, pdbes, on = [:guide, :distance, :chromosome, :start, :strand])
+            @test nrow(failed) == 0
+        end
+    end
+    =#
+
     @testset "linearDB vs prefixHashDB early stopped" begin
         # remember that this early stopping can find overlaps
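
Note on the search_prefixHashDB hunk above: because build_prefixHashDB now sorts by (prefix, suffix) pairs, suffixes are already sorted inside every run of identical prefixes, so each run can be split into sub-ranges whose full off-target sequence is identical and aligned only once. A minimal, self-contained sketch of that splitting step; the `suffix` and `sa` values below are made-up stand-ins for `db.suffix` and the real prefix ranges, not part of the patch:

```julia
# toy suffix column, already sorted within each equal-prefix range
suffix = [3, 3, 3, 7, 7, 9]
# toy ranges of indices sharing the same prefix
sa = [1:3, 4:6]

split_by_suffix = mapreduce(vcat, sa) do x
    finds = findall(diff(suffix[x]) .!= 0)        # positions where the suffix changes
    stops = vcat(finds, length(x)) .+ x.start .- 1
    starts = vcat(0, finds) .+ x.start
    map(r -> UnitRange(r[1], r[2]), zip(starts, stops))
end

@assert split_by_suffix == [1:3, 4:5, 6:6]
```

Here vcat(0, finds) and vcat(finds, length(x)) turn the change points into start/stop offsets relative to x.start, which is the same computation the added mapreduce block performs on db.suffix.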
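For the new gatherofftargets! keywords, a hedged usage sketch under assumptions: the genome path and the restrict_to_len value are placeholders and are not taken from this patch; the DBInfo/Motif calls mirror the commented-out test above.

```julia
using CHOPOFF

# placeholder genome path; any FASTA (with .fai index) or 2bit genome works
genome_path = "semirandom.fa"
dbi = DBInfo(genome_path, "Cas9_semirandom_noVCF", Motif("Cas9"))

guides = Vector{String}()
# remove_pam strips the PAM, normalize flips guides into the PAMseqEXT orientation;
# restrict_to_len (value assumed here) trims guides after remove_pam is applied
ambig = gatherofftargets!(guides, dbi;
    remove_pam = true, normalize = true, restrict_to_len = 20)
```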