speed improvements
JokingHero committed Mar 15, 2024
1 parent 4b0a460 commit 46a48ea
Showing 3 changed files with 31 additions and 44 deletions.
69 changes: 28 additions & 41 deletions src/db_prefix_hash.jl
@@ -215,6 +215,13 @@ function build_prefixHashDB(
 end
 
 
+# sa - sometimes small, sometimes large
+# prefixes - around 300M
+function potential_ots_idx(sa::Vector{<:Unsigned}, prefixes::Vector{<:Unsigned})
+    idx = searchsorted.(Ref(prefixes), sa)
+    return filter(x -> x.start <= x.stop, idx)
+end
+
 """
 ```
 search_prefixHashDB(
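
The new potential_ots_idx helper carries most of the speedup: instead of the previous in.(db.prefix, Ref(sa)) broadcast, which scanned all ~300M stored prefixes once per guide, each candidate prefix is now located with a binary search (searchsorted assumes db.prefix is sorted; the "ots are sorted" comment in the removed code below indicates it is). A minimal sketch with made-up toy values, not part of the commit:

# Toy illustration of potential_ots_idx; prefixes and sa are invented.
prefixes = UInt64[0x01, 0x03, 0x03, 0x03, 0x07]  # sorted, duplicates allowed
sa = UInt64[0x03, 0x05]                          # candidate guide prefixes

# searchsorted returns the index range of matching values in a sorted
# vector; an empty range (start > stop) means the prefix is absent.
idx = searchsorted.(Ref(prefixes), sa)           # [2:4, 5:4]
hits = filter(x -> x.start <= x.stop, idx)       # [2:4] -- 0x05 never occurs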
@@ -291,49 +298,27 @@ function search_prefixHashDB(
     paths = db.mpt.paths[db.mpt.paths_distances .<= distance, :]
     mkpath(dirname(output_file))
 
-    ThreadsX.map(guides_) do g
+    ThreadsX.map(guides_) do g # maybe a function would be faster than lambda here?
         guides_formated = ARTEMIS.guide_to_template_format(g; alphabet = ARTEMIS.ALPHABET_TWOBIT)
         sa = guides_formated[paths]
         sa = Base.map(x -> ARTEMIS.asUInt(eltype(db.prefix), x), eachrow(sa))
-        sa = Set(sa)
-        sa = in.(db.prefix, Ref(sa))
-
-        es_acc = zeros(Int64, length(early_stopping))
-        is_es = false
-        ots = LongDNA{4}.((convert.(ot_type, db.prefix[sa]) .<< (2 * s_len)) .| convert.(ot_type, db.suffix[sa]), ot_len)
+        sa = unique(sa)
+        sa = potential_ots_idx(sa, db.prefix)
+
+        es_acc = zeros(Int64, length(early_stopping))
         detail_path = joinpath(dirname(output_file), "detail_" * string(g) * ".csv")
         detail_file = open(detail_path, "w")
         guide_stranded = db.mpt.dbi.motif.extends5 ? reverse(g) : g
         guide_stranded = string(guide_stranded)
-        if length(ots) == 0
+        if length(sa) == 0
             close(detail_file)
             return
         end
 
-        aln = ARTEMIS.align(g, ots[1], distance, iscompatible)
-        if aln.dist <= distance
-            es_acc[aln.dist + 1] += 1
-            if db.mpt.dbi.motif.extends5
-                aln_guide = reverse(aln.guide)
-                aln_ref = reverse(aln.ref)
-            else
-                aln_guide = aln.guide
-                aln_ref = aln.ref
-            end
-            strand = db.isplus[sa][1] ? "+" : "-"
-            ot = guide_stranded * "," * aln_guide * "," *
-                aln_ref * "," * string(aln.dist) * "," *
-                db.mpt.dbi.gi.chrom[db.chrom[sa][1]] * "," *
-                string(db.pos[sa][1]) * "," * strand * "\n"
-            write(detail_file, ot)
-        end
-        for i in 2:length(ots)
-            ot = ots[i]
-            if ot != ots[i - 1] # we can recycle alignments because ots are sorted
-                aln = ARTEMIS.align(g, ot, distance, iscompatible)
-            end
-
+        @inbounds for i in sa # each sa is range of indices of prefixes where all ots are the same
+            ot = LongDNA{4}((convert(ot_type, db.prefix[i.start]) .<< (2 * s_len)) |
+                convert(ot_type, db.suffix[i.start]), ot_len)
+            aln = ARTEMIS.align(g, ot, distance, iscompatible)
             if aln.dist <= distance
                 if db.mpt.dbi.motif.extends5
                     aln_guide = reverse(aln.guide)
@@ -342,16 +327,18 @@
                     aln_guide = aln.guide
                     aln_ref = aln.ref
                 end
-                strand = db.isplus[sa][i] ? "+" : "-"
-                ot = guide_stranded * "," * aln_guide * "," *
-                    aln_ref * "," * string(aln.dist) * "," *
-                    db.mpt.dbi.gi.chrom[db.chrom[sa][i]] * "," *
-                    string(db.pos[sa][i]) * "," * strand * "\n"
-                write(detail_file, ot)
-                es_acc[aln.dist + 1] += 1
-                if es_acc[aln.dist + 1] >= early_stopping[aln.dist + 1]
-                    is_es = true
-                    break
+                @inbounds for idx in i
+                    strand = db.isplus[idx] ? "+" : "-"
+                    ot = guide_stranded * "," * aln_guide * "," *
+                        aln_ref * "," * string(aln.dist) * "," *
+                        db.mpt.dbi.gi.chrom[db.chrom[idx]] * "," *
+                        string(db.pos[idx]) * "," * strand * "\n"
+                    write(detail_file, ot)
+                    es_acc[aln.dist + 1] += 1
+                    if es_acc[aln.dist + 1] >= early_stopping[aln.dist + 1]
+                        close(detail_file)
+                        return
+                    end
+                end
             end
         end
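
One note on the rewritten inner loop: each element of sa is now a range of rows in db.prefix/db.suffix that share one packed off-target, so the sequence is decoded and aligned once per range, and only the per-row metadata (strand, chromosome, position) is read inside the range. A hedged sketch of the packing arithmetic with invented widths; the real s_len, ot_len and ot_type come from the opened database:

# Invented example: a 16-nt off-target stored as a 9-nt prefix plus a
# 7-nt suffix, 2 bits per base, packed into unsigned integers.
s_len  = 7                 # suffix length in bases (assumed value)
ot_len = 16                # full off-target length (assumed value)
prefix = UInt64(0x12345)   # 18 bits = 9 bases (made-up content)
suffix = UInt64(0x0abc)    # 14 bits = 7 bases (made-up content)

# Shift the prefix past the suffix bits and OR in the suffix -- the same
# expression the loop applies to db.prefix[i.start] and db.suffix[i.start];
# LongDNA{4}(packed, ot_len) then decodes the word back into a DNA sequence
# via the constructor this codebase already uses above.
packed = (prefix << (2 * s_len)) | suffix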
2 changes: 1 addition & 1 deletion test/src/db.jl
@@ -428,6 +428,6 @@ end
         detail_path_es; distance = 2, early_stopping = repeat([0], 3))
     pdbes = DataFrame(CSV.File(detail_path_es))
     pdbes_res = summarize_offtargets(pdbes)
-    @test nrow(pdbes) == 36 # I checked these results
+    @test nrow(pdbes) == 20 # I checked these results
     end
 end
4 changes: 2 additions & 2 deletions test/src/db_extends5_false.jl
@@ -285,7 +285,7 @@ end
     detail_path = joinpath(phdb_path, "detail.csv")
 
     for d in 1:3
-        search_prefixHashDB(phdb_path, guides, detail_path; distance = d)
+        search_prefixHashDB(phdb_path, guides, detail_path; distance = d, early_stopping = repeat([300], d + 1))
         phdb = DataFrame(CSV.File(detail_path))
 
         search_linearDB(ldb_path, guides, detail_path; distance = d)
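
The added early_stopping argument has to be sized to the searched distance: as the rewritten loop in src/db_prefix_hash.jl shows, entry k of the vector caps how many off-targets at edit distance k - 1 are written before the search for that guide returns early. A short sketch of matching the two keywords (the budget value is arbitrary):

# For a search up to distance d, supply one budget per distance 0..d.
d = 3
early_stopping = repeat([300], d + 1)  # at most 300 reported hits per distance

# Inside search_prefixHashDB (simplified view of the check above):
#   es_acc[aln.dist + 1] += 1
#   es_acc[aln.dist + 1] >= early_stopping[aln.dist + 1] && return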
@@ -316,6 +316,6 @@ end
         detail_path_es; distance = 3, early_stopping = repeat([0], 4))
     ldbes = DataFrame(CSV.File(detail_path_es))
     ldbes_res = summarize_offtargets(ldbes)
-    @test nrow(ldbes) == 6 # I checked these results
+    @test nrow(ldbes) == 3 # I checked these results
     end
 end
