Commit 473f56c

fixing rather big bug inside prefixHashDB where suffixes were not correctly iterated, and also multithreading was not on
JokingHero committed Apr 24, 2024
1 parent 762d8d4 commit 473f56c
Showing 4 changed files with 76 additions and 23 deletions.
15 changes: 12 additions & 3 deletions src/db_prefix_hash.jl
@@ -224,7 +224,7 @@ function build_prefixHashDB(
suffixes = convert.(suffix_type, mask .& guides)
guides = nothing

order = sortperm(prefixes)
order = sortperm(collect(zip(prefixes, suffixes)))
prefixes = prefixes[order]
suffixes = suffixes[order]
chrom = chrom[order]
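Note on the hunk above: `sortperm(prefixes)` alone orders only by prefix, so suffixes inside a block of equal prefixes stay in arbitrary order and the later suffix-based splitting cannot work. Sorting the zipped `(prefix, suffix)` pairs gives a full lexicographic order. A minimal, self-contained sketch with made-up values:

```julia
# Toy illustration (hypothetical values): sorting zipped pairs is lexicographic,
# so equal prefixes end up adjacent with their suffixes also sorted.
prefixes = UInt8[2, 1, 2, 1]
suffixes = UInt8[7, 9, 3, 4]

order = sortperm(collect(zip(prefixes, suffixes)))  # by prefix first, then suffix
@assert prefixes[order] == UInt8[1, 1, 2, 2]
@assert suffixes[order] == UInt8[4, 9, 3, 7]
```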
@@ -385,7 +385,7 @@ function search_prefixHashDB(
paths = db.mpt.paths[db.mpt.paths_distances .<= distance, :]
mkpath(dirname(output_file))

Base.map(guides_) do g # maybe a function would be faster than lambda here?
ThreadsX.map(guides_) do g
guides_formated = CHOPOFF.guide_to_template_format(g; alphabet = CHOPOFF.ALPHABET_TWOBIT)
sa = guides_formated[paths]
sa = Base.map(x -> CHOPOFF.asUInt(eltype(db.prefix), x), eachrow(sa))
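The change from `Base.map` to `ThreadsX.map` above is what turns multithreading back on: `ThreadsX.map` is a drop-in, order-preserving replacement that spreads the per-guide work over the available Julia threads. A small sketch (the `slow_square` helper is purely illustrative):

```julia
# Illustrative only: same call shape as Base.map, but the work runs on multiple threads
# (start Julia with e.g. `julia -t 4` for this to matter).
using ThreadsX

slow_square(x) = (sleep(0.01); x^2)   # stand-in for per-guide alignment work
serial   = Base.map(slow_square, 1:8)
parallel = ThreadsX.map(slow_square, 1:8)
@assert serial == parallel            # results come back in the original order
```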
@@ -407,7 +407,16 @@
return
end

@inbounds for i in sa # each sa is range of indices of prefixes where all ots are the same
if length(sa) != 0
sa = Base.mapreduce(vcat, sa) do x # split sa based on suffixes
finds = findall(diff(db.suffix[x]) .!= 0)
stops = vcat(finds, length(x)) .+ x.start .- 1
starts = vcat(0, finds) .+ x.start
Base.map(x -> UnitRange(x[1], x[2]), zip(starts, stops))
end
end

@inbounds for i in sa # each sa is range of indices of prefixes where all prefixes are the same
ot = LongDNA{4}((convert(ot_type, db.prefix[i.start]) << (2 * s_len)) |
convert(ot_type, db.suffix[i.start]), ot_len)
aln = CHOPOFF.align(g, ot, distance, iscompatible)
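For context, the hunk above is the core of the suffix-iteration fix: each range in `sa` covers indices that share a prefix, and it is now split further wherever consecutive (sorted) suffixes differ, so every sub-range holds exactly one unique off-target sequence. A standalone sketch of that splitting step on a toy suffix vector (values are hypothetical):

```julia
# Toy version of the "split sa based on suffixes" step from the diff above.
suffix = [5, 5, 7, 7, 7, 9]   # suffixes inside one equal-prefix block, already sorted
r = 1:6                       # indices of that block

finds  = findall(diff(suffix[r]) .!= 0)           # positions where the suffix changes
stops  = vcat(finds, length(r)) .+ r.start .- 1
starts = vcat(0, finds) .+ r.start
ranges = Base.map(x -> UnitRange(x[1], x[2]), zip(starts, stops))
@assert ranges == [1:2, 3:5, 6:6]                 # one sub-range per unique suffix
```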
34 changes: 17 additions & 17 deletions src/db_vcf.jl
@@ -302,28 +302,28 @@ function build_vcfDB(
if (sum(grouping_idx_ones) > 0)
chrom_idx_ones = chrom_idx[grouping_idx_ones]
append!(ambig_vots,
ThreadsX.mapreduce(x -> find_ots_one(lp, chrom_seq, x[1], x[2], x[3], x[4], ch_, dbi), vcat,
zip(rs_ranges[chrom_idx_ones],
rs_ref[chrom_idx_ones],
rs_ids[chrom_idx_ones],
rs_alt[chrom_idx_ones]);
init = Vector{VarOT}()))
ThreadsX.mapreduce(x -> find_ots_one(lp, chrom_seq, x[1], x[2], x[3], x[4], ch_, dbi), vcat,
zip(rs_ranges[chrom_idx_ones],
rs_ref[chrom_idx_ones],
rs_ids[chrom_idx_ones],
rs_alt[chrom_idx_ones]);
init = Vector{VarOT}()))
end

if (length(grouping_idx) > 0)
chrom_idx_not_ones = chrom_idx[.!grouping_idx_ones]
append!(ambig_vots,
ThreadsX.mapreduce(vcat, unique(grouping_idx); init = Vector{VarOT}()) do x
first = searchsortedfirst(grouping_idx, x)
last = searchsortedlast(grouping_idx, x)
first_last = chrom_idx_not_ones[first:last]
return find_ots_many(
lp, chrom_seq,
rs_ranges[first_last],
rs_ref[first_last],
rs_ids[first_last],
rs_alt[first_last], ch_, dbi)
end)
ThreadsX.mapreduce(vcat, unique(grouping_idx); init = Vector{VarOT}()) do x
first = searchsortedfirst(grouping_idx, x)
last = searchsortedlast(grouping_idx, x)
first_last = chrom_idx_not_ones[first:last]
return find_ots_many(
lp, chrom_seq,
rs_ranges[first_last],
rs_ref[first_last],
rs_ids[first_last],
rs_alt[first_last], ch_, dbi)
end)
end
end
close(ref)
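The `db_vcf.jl` hunks above only re-indent the two `ThreadsX.mapreduce` calls; the logic is unchanged. For readers unfamiliar with the pattern, here is a toy sketch of how a sorted `grouping_idx` vector is cut into per-group index ranges with `searchsortedfirst`/`searchsortedlast` (the data and the summing step are made up for illustration):

```julia
using ThreadsX

# grouping_idx is sorted, so binary search recovers each group's index range.
grouping_idx = [1, 1, 2, 2, 2, 5]
values       = [10, 20, 30, 40, 50, 60]

per_group = ThreadsX.mapreduce(vcat, unique(grouping_idx); init = Vector{Int}()) do g
    first = searchsortedfirst(grouping_idx, g)
    last  = searchsortedlast(grouping_idx, g)
    [sum(values[first:last])]          # one aggregated result per group
end
@assert per_group == [30, 120, 60]
```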
17 changes: 14 additions & 3 deletions src/find_offtargets.jl
@@ -200,9 +200,12 @@ end

"""
```
gatherofftargets!(
function gatherofftargets!(
output::T,
dbi::DBInfo) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
dbi::DBInfo;
remove_pam::Bool = true,
normalize::Bool = true,
restrict_to_len::Union{Nothing, Int64} = nothing) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
```
Gathers all off-targets that conform to the given `dbi` Motif.
@@ -211,6 +214,11 @@ This function appends to the `output` during the run, however it will also retur
guides in return object. We can use UInt64 and UInt128 to compress space that the gRNAs use. When using
large genomes or non-specific PAMs you might run out of memory when using this function.
remove_pam - whether PAM sequence should be removed
normalize - whether all guides should be flipped into PAMseqEXT e.g. GGn-20N-3bp
restrict_to_len - will restrict the guides to be of specific lengths, smaller than the initial motif
this includes/excludes PAM based on remove_pam as remove_pam is applied before the length restriction
# Examples
```julia
# use CHOPOFF example genome
@@ -235,7 +243,10 @@ guides2 = String.(LongDNA{4}.(guides2, guide_with_extension_len))
"""
function gatherofftargets!(
output::T,
dbi::DBInfo) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
dbi::DBInfo;
remove_pam::Bool = true,
normalize::Bool = true,
restrict_to_len::Union{Nothing, Int64} = nothing) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}

ref = open(dbi.gi.filepath, "r")
reader = dbi.gi.is_fa ? FASTA.Reader(ref, index = dbi.gi.filepath * ".fai") : TwoBit.Reader(ref)
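Since the diff above only shows the updated signature and docstring of `gatherofftargets!`, here is a hedged usage sketch of the new keyword arguments; the genome path is a placeholder and the chosen values are illustrative, but the call forms follow this diff and the test below:

```julia
using CHOPOFF

genome = "path/to/genome.fa"   # placeholder - point this at a real FASTA/2bit genome
dbi = DBInfo(genome, "Cas9_example", Motif("Cas9"))

guides = Vector{String}()
# remove the PAM, normalize guide orientation, and trim each gathered off-target to 20 bp
ambig = gatherofftargets!(guides, dbi;
    remove_pam = true, normalize = true, restrict_to_len = 20)
```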
33 changes: 33 additions & 0 deletions test/src/db.jl
@@ -342,6 +342,39 @@ end
@test nrow(ldbes) >= 2
end

#=
@testset "linearDB vs prefixHashDB on distance 3 and semirandom1"
motif = Motif("Cas9"; distance = 1)
# take all possible guides on our semirandom genome and query them
dbi = DBInfo(genome, "Cas9_semirandom_noVCF", motif)
# finally gather all off-targets
guides = Vector{String}()
# guides are GGN...20bp+ext
ambig = gatherofftargets!(guides, dbi; remove_pam = true, normalize = true)
guides = LongDNA{4}.(guides)
guides = Base.map(x -> x[1:20], guides)
reverse!.(guides)
# lets randomize here some
phdb_path = joinpath(tdir, "prefixHashDBes")
mkpath(phdb_path)
build_prefixHashDB("samirandom", genome, setdist(Motif("Cas9"), 3), phdb_path)
detail_path = joinpath(ldb_path, "detail2.csv")
detail_path_es = joinpath(phdb_path, "detail_es.csv")
for g in guides
@info g
search_linearDB(ldb_path, [g], detail_path; distance = 3)
ldb = DataFrame(CSV.File(detail_path))
search_prefixHashDB(phdb_path, [g], detail_path_es;
distance = 3,
early_stopping = [100000, 100000, 100000, 100000])
pdbes = DataFrame(CSV.File(detail_path_es))
failed = antijoin(ldb, pdbes, on = [:guide, :distance, :chromosome, :start, :strand])
@test nrow(failed) == 0
end
end
=#
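The commented-out test above relies on DataFrames' `antijoin` for the comparison; a tiny self-contained sketch of that check (toy rows and fewer join columns than the real test):

```julia
using DataFrames

# antijoin keeps rows of the first table with no match in the second,
# so an empty result means every linearDB hit was also found by prefixHashDB.
ldb   = DataFrame(guide = ["ACG", "ACG"], start = [10, 42])
pdbes = DataFrame(guide = ["ACG", "ACG", "TTT"], start = [10, 42, 7])
failed = antijoin(ldb, pdbes, on = [:guide, :start])
@assert nrow(failed) == 0
```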


@testset "linearDB vs prefixHashDB early stopped" begin
# remember that this early stopping can find overlaps
