Commit 473f56c

fixing rather big bug inside prefixHashDB where suffixes were not correctly iterated, and also multithreading was not on
JokingHero committed Apr 24, 2024
1 parent 762d8d4 commit 473f56c
Showing 4 changed files with 76 additions and 23 deletions.
15 changes: 12 additions & 3 deletions src/db_prefix_hash.jl
@@ -224,7 +224,7 @@ function build_prefixHashDB(
suffixes = convert.(suffix_type, mask .& guides)
guides = nothing

order = sortperm(prefixes)
order = sortperm(collect(zip(prefixes, suffixes)))
prefixes = prefixes[order]
suffixes = suffixes[order]
chrom = chrom[order]
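Note on the hunk above: `sortperm(prefixes)` alone orders only by prefix, so suffixes inside a block of equal prefixes stay in arbitrary order and the later suffix-based splitting cannot work. Sorting the zipped `(prefix, suffix)` pairs gives a full lexicographic order. A minimal, self-contained sketch with made-up values:

```julia
# Toy illustration (hypothetical values): sorting zipped pairs is lexicographic,
# so equal prefixes end up adjacent with their suffixes also sorted.
prefixes = UInt8[2, 1, 2, 1]
suffixes = UInt8[7, 9, 3, 4]

order = sortperm(collect(zip(prefixes, suffixes)))  # by prefix first, then suffix
@assert prefixes[order] == UInt8[1, 1, 2, 2]
@assert suffixes[order] == UInt8[4, 9, 3, 7]
```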
@@ -385,7 +385,7 @@ function search_prefixHashDB(
paths = db.mpt.paths[db.mpt.paths_distances .<= distance, :]
mkpath(dirname(output_file))

Base.map(guides_) do g # maybe a function would be faster than lambda here?
ThreadsX.map(guides_) do g
guides_formated = CHOPOFF.guide_to_template_format(g; alphabet = CHOPOFF.ALPHABET_TWOBIT)
sa = guides_formated[paths]
sa = Base.map(x -> CHOPOFF.asUInt(eltype(db.prefix), x), eachrow(sa))
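The change from `Base.map` to `ThreadsX.map` above is what turns multithreading back on: `ThreadsX.map` is a drop-in, order-preserving replacement that spreads the per-guide work over the available Julia threads. A small sketch (the `slow_square` helper is purely illustrative):

```julia
# Illustrative only: same call shape as Base.map, but the work runs on multiple threads
# (start Julia with e.g. `julia -t 4` for this to matter).
using ThreadsX

slow_square(x) = (sleep(0.01); x^2)   # stand-in for per-guide alignment work
serial   = Base.map(slow_square, 1:8)
parallel = ThreadsX.map(slow_square, 1:8)
@assert serial == parallel            # results come back in the original order
```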
@@ -407,7 +407,16 @@
return
end

@inbounds for i in sa # each sa is range of indices of prefixes where all ots are the same
if length(sa) != 0
sa = Base.mapreduce(vcat, sa) do x # split sa based on suffixes
finds = findall(diff(db.suffix[x]) .!= 0)
stops = vcat(finds, length(x)) .+ x.start .- 1
starts = vcat(0, finds) .+ x.start
Base.map(x -> UnitRange(x[1], x[2]), zip(starts, stops))
end
end

@inbounds for i in sa # each sa is range of indices of prefixes where all prefixes are the same
ot = LongDNA{4}((convert(ot_type, db.prefix[i.start]) << (2 * s_len)) |
convert(ot_type, db.suffix[i.start]), ot_len)
aln = CHOPOFF.align(g, ot, distance, iscompatible)
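For context, the hunk above is the core of the suffix-iteration fix: each range in `sa` covers indices that share a prefix, and it is now split further wherever consecutive (sorted) suffixes differ, so every sub-range holds exactly one unique off-target sequence. A standalone sketch of that splitting step on a toy suffix vector (values are hypothetical):

```julia
# Toy version of the "split sa based on suffixes" step from the diff above.
suffix = [5, 5, 7, 7, 7, 9]   # suffixes inside one equal-prefix block, already sorted
r = 1:6                       # indices of that block

finds  = findall(diff(suffix[r]) .!= 0)           # positions where the suffix changes
stops  = vcat(finds, length(r)) .+ r.start .- 1
starts = vcat(0, finds) .+ r.start
ranges = Base.map(x -> UnitRange(x[1], x[2]), zip(starts, stops))
@assert ranges == [1:2, 3:5, 6:6]                 # one sub-range per unique suffix
```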
34 changes: 17 additions & 17 deletions src/db_vcf.jl
@@ -302,28 +302,28 @@ function build_vcfDB(
if (sum(grouping_idx_ones) > 0)
chrom_idx_ones = chrom_idx[grouping_idx_ones]
append!(ambig_vots,
ThreadsX.mapreduce(x -> find_ots_one(lp, chrom_seq, x[1], x[2], x[3], x[4], ch_, dbi), vcat,
zip(rs_ranges[chrom_idx_ones],
rs_ref[chrom_idx_ones],
rs_ids[chrom_idx_ones],
rs_alt[chrom_idx_ones]);
init = Vector{VarOT}()))
ThreadsX.mapreduce(x -> find_ots_one(lp, chrom_seq, x[1], x[2], x[3], x[4], ch_, dbi), vcat,
zip(rs_ranges[chrom_idx_ones],
rs_ref[chrom_idx_ones],
rs_ids[chrom_idx_ones],
rs_alt[chrom_idx_ones]);
init = Vector{VarOT}()))
end

if (length(grouping_idx) > 0)
chrom_idx_not_ones = chrom_idx[.!grouping_idx_ones]
append!(ambig_vots,
ThreadsX.mapreduce(vcat, unique(grouping_idx); init = Vector{VarOT}()) do x
first = searchsortedfirst(grouping_idx, x)
last = searchsortedlast(grouping_idx, x)
first_last = chrom_idx_not_ones[first:last]
return find_ots_many(
lp, chrom_seq,
rs_ranges[first_last],
rs_ref[first_last],
rs_ids[first_last],
rs_alt[first_last], ch_, dbi)
end)
ThreadsX.mapreduce(vcat, unique(grouping_idx); init = Vector{VarOT}()) do x
first = searchsortedfirst(grouping_idx, x)
last = searchsortedlast(grouping_idx, x)
first_last = chrom_idx_not_ones[first:last]
return find_ots_many(
lp, chrom_seq,
rs_ranges[first_last],
rs_ref[first_last],
rs_ids[first_last],
rs_alt[first_last], ch_, dbi)
end)
end
end
close(ref)
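The `db_vcf.jl` hunks above only re-indent the two `ThreadsX.mapreduce` calls; the logic is unchanged. For readers unfamiliar with the pattern, here is a toy sketch of how a sorted `grouping_idx` vector is cut into per-group index ranges with `searchsortedfirst`/`searchsortedlast` (the data and the summing step are made up for illustration):

```julia
using ThreadsX

# grouping_idx is sorted, so binary search recovers each group's index range.
grouping_idx = [1, 1, 2, 2, 2, 5]
values       = [10, 20, 30, 40, 50, 60]

per_group = ThreadsX.mapreduce(vcat, unique(grouping_idx); init = Vector{Int}()) do g
    first = searchsortedfirst(grouping_idx, g)
    last  = searchsortedlast(grouping_idx, g)
    [sum(values[first:last])]          # one aggregated result per group
end
@assert per_group == [30, 120, 60]
```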
17 changes: 14 additions & 3 deletions src/find_offtargets.jl
@@ -200,9 +200,12 @@ end

"""
```
gatherofftargets!(
function gatherofftargets!(
output::T,
dbi::DBInfo) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
dbi::DBInfo;
remove_pam::Bool = true,
normalize::Bool = true,
restrict_to_len::Union{Nothing, Int64} = nothing) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
```
Gathers all off-targets that conform to the given `dbi` Motif.
@@ -211,6 +214,11 @@ This function appends to the `output` during the run, however it will also retur
guides in return object. We can use UInt64 and UInt128 to compress space that the gRNAs use. When using
large genomes or non-specific PAMs you might run out of memory when using this function.
remove_pam - whether PAM sequence should be removed
normalize - whether all guides should be flipped into PAMseqEXT e.g. GGn-20N-3bp
restrict_to_len - will restrict the guides to be of specific lengths, smaller than the initial motif
this includes/excludes PAM based on remove_pam as remove_pam is applied before the length restriction
# Examples
```julia
# use CHOPOFF example genome
@@ -235,7 +243,10 @@ guides2 = String.(LongDNA{4}.(guides2, guide_with_extension_len))
"""
function gatherofftargets!(
output::T,
dbi::DBInfo) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}
dbi::DBInfo;
remove_pam::Bool = true,
normalize::Bool = true,
restrict_to_len::Union{Nothing, Int64} = nothing) where {T<:Union{Vector{String}, Vector{UInt64}, Vector{UInt128}}}

ref = open(dbi.gi.filepath, "r")
reader = dbi.gi.is_fa ? FASTA.Reader(ref, index = dbi.gi.filepath * ".fai") : TwoBit.Reader(ref)
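Since the diff above only shows the updated signature and docstring of `gatherofftargets!`, here is a hedged usage sketch of the new keyword arguments; the genome path is a placeholder and the chosen values are illustrative, but the call forms follow this diff and the test below:

```julia
using CHOPOFF

genome = "path/to/genome.fa"   # placeholder - point this at a real FASTA/2bit genome
dbi = DBInfo(genome, "Cas9_example", Motif("Cas9"))

guides = Vector{String}()
# remove the PAM, normalize guide orientation, and trim each gathered off-target to 20 bp
ambig = gatherofftargets!(guides, dbi;
    remove_pam = true, normalize = true, restrict_to_len = 20)
```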
33 changes: 33 additions & 0 deletions test/src/db.jl
@@ -342,6 +342,39 @@ end
@test nrow(ldbes) >= 2
end

#=
@testset "linearDB vs prefixHashDB on distance 3 and semirandom1"
motif = Motif("Cas9"; distance = 1)
# take all possible guides on our semirandom genome and query them
dbi = DBInfo(genome, "Cas9_semirandom_noVCF", motif)
# finally gather all off-targets
guides = Vector{String}()
# guides are GGN...20bp+ext
ambig = gatherofftargets!(guides, dbi; remove_pam = true, normalize = true)
guides = LongDNA{4}.(guides)
guides = Base.map(x -> x[1:20], guides)
reverse!.(guides)
# lets randomize here some
phdb_path = joinpath(tdir, "prefixHashDBes")
mkpath(phdb_path)
build_prefixHashDB("samirandom", genome, setdist(Motif("Cas9"), 3), phdb_path)
detail_path = joinpath(ldb_path, "detail2.csv")
detail_path_es = joinpath(phdb_path, "detail_es.csv")
for g in guides
@info g
search_linearDB(ldb_path, [g], detail_path; distance = 3)
ldb = DataFrame(CSV.File(detail_path))
search_prefixHashDB(phdb_path, [g], detail_path_es;
distance = 3,
early_stopping = [100000, 100000, 100000, 100000])
pdbes = DataFrame(CSV.File(detail_path_es))
failed = antijoin(ldb, pdbes, on = [:guide, :distance, :chromosome, :start, :strand])
@test nrow(failed) == 0
end
end
=#
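The commented-out test above relies on DataFrames' `antijoin` for the comparison; a tiny self-contained sketch of that check (toy rows and fewer join columns than the real test):

```julia
using DataFrames

# antijoin keeps rows of the first table with no match in the second,
# so an empty result means every linearDB hit was also found by prefixHashDB.
ldb   = DataFrame(guide = ["ACG", "ACG"], start = [10, 42])
pdbes = DataFrame(guide = ["ACG", "ACG", "TTT"], start = [10, 42, 7])
failed = antijoin(ldb, pdbes, on = [:guide, :start])
@assert nrow(failed) == 0
```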


@testset "linearDB vs prefixHashDB early stopped" begin
# remember that this early stopping can find overlaps
