Skip to content

Commit

Permalink
move from UInt64 to UInt32, probably decrease in memory will mean inc…
Browse files Browse the repository at this point in the history
…rease in speed
  • Loading branch information
JokingHero committed Feb 21, 2024
1 parent 16f5582 commit 9bd3cae
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 12 deletions.
22 changes: 11 additions & 11 deletions src/db_linear_hash.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ struct SuffixHashDB
suffix::Vector{LongDNA{4}}
suffix_loci_idx::Vector{LociRange}
loci::Vector{Loc}
hash::Vector{UInt64}
hash::Vector{UInt32}
end


Expand Down Expand Up @@ -55,7 +55,7 @@ if interested with searches within distance 4, preferably use prefix length of
`prefix_len` - Size of the prefix by which off-targets are indexed. Prefix of 8 or larger will be the fastest,
however it will also result in large number of files.
`hash_len` - Length of the hash in bp. Should be below 20bp - maximum alignment distance.
`hash_len` - Length of the hash in bp. Must be at most 16, because the hash is packed into a UInt32 using 2 bits per base.
# Examples
```julia-repl
Expand All @@ -68,14 +68,14 @@ function build_linearHashDB(
motif::Motif,
storage_dir::String,
prefix_len::Int = 7,
hash_len::Int = (length_noPAM(motif) - motif.distance))
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16))

if prefix_len <= motif.distance
throw("prefix_len $prefix_len is <= " * string(motif.distance))
end

if hash_len > (length_noPAM(motif) - motif.distance)
throw("prefix_len $hash_len is > " * string((length_noPAM(motif) - motif.distance)))
if hash_len > 16
throw("hash_len $hash_len is more than 16")
end

dbi = DBInfo(genomepath, name, motif)
Expand Down Expand Up @@ -113,7 +113,7 @@ function build_linearHashDB(
(guides, loci_range, loci) = unique_guides(guides, loci)
# hash part guides = prefix + guides
hashes = ThreadsX.map(guides) do guide
convert(UInt64, (prefix * guide)[1:hash_len])
convert(UInt32, (prefix * guide)[1:hash_len])
end
sdb = SuffixHashDB(prefix, guides, loci_range, loci, hashes)
save(sdb, joinpath(storage_dir, string(prefix) * ".bin"))
Expand All @@ -132,7 +132,7 @@ end

function search_prefix_hash(
prefix::LongDNA{4},
paths_set::Vector{Set{UInt64}},
paths_set::Vector{Set{UInt32}},
dist::Int,
dbi::DBInfo,
detail::String,
Expand Down Expand Up @@ -245,11 +245,11 @@ function search_linearHashDB(

paths = ldb.paths[ldb.paths_distances .<= distance, :]
paths_set = ThreadsX.map(copy(guides_)) do g
guides_uint64 = guide_to_template_format(g; alphabet = ALPHABET_TWOBIT)
ot_uint64 = guides_uint64[paths]
ot_uint64 = map(ARTEMIS.asUInt64, eachrow(ot_uint64))
guides_formated = guide_to_template_format(g; alphabet = ALPHABET_TWOBIT)
ot_uint32 = guides_formated[paths]
ot_uint32 = map(ARTEMIS.asUInt32, eachrow(ot_uint32))
# BinaryFuseFilter{UInt32}(unique(ot_uint32)) # very space efficient!!!
return Set(ot_uint64)
return Set(ot_uint32)
end

mkpath(dirname(output_file))
Expand Down
21 changes: 21 additions & 0 deletions src/motif_path_templates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,27 @@ function asUInt64(x::AbstractVecOrMat)
end


"""
    asUInt32(x::AbstractVecOrMat)

Pack the elements of `x` into a single `UInt32`, two bits per element, in iteration
order: the first element ends up in the most significant occupied bits and the last
element in the two lowest bits.

The length of `x` is not validated. Pass at most 16 elements (16 bp); with more, the
leading elements are shifted out of the 32-bit result.
Assumes every element is a 2-bit code (0-3); this is not checked here.

# Examples
```julia
matp = guide_in_template_format[pathTemplate]
map(asUInt32, eachrow(matp))
```
"""
function asUInt32(x::AbstractVecOrMat)
    packed = foldl(x; init = zero(UInt32)) do acc, code
        return (acc << 2) | UInt32(code)
    end
    # Keep only the low 2 * length(x) bits. For 16 or more elements the shift yields
    # zero and the subtraction wraps around to an all-ones mask.
    nbits = 2 * length(x)
    keep = (one(UInt32) << nbits) - UInt32(1)
    return packed & keep
end


"""
```
duplicated(x::Vector{UInt64})
Expand Down
24 changes: 23 additions & 1 deletion src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,28 @@ function Base.convert(::Type{UInt64}, x::LongDNA{4})
end


"""
    convert(::Type{UInt32}, x::LongDNA{4})

Pack a DNA sequence of at most 16 bases into a `UInt32`, two bits per base. The first
base ends up in the most significant occupied bits and the last base in the two lowest
bits, so `LongDNA{4}(packed, length(x))` reverses the conversion.

# Throws
- Whatever `LongDNA{2}(x)` raises when `x` cannot be re-encoded into the 2-bit
  alphabet (e.g. `BioSequences.EncodeError` for a gap).
- A `String` when `x` is longer than 16 bases.
"""
function Base.convert(::Type{UInt32}, x::LongDNA{4})
    # Re-encoding into the 2-bit alphabet is what rejects gaps and ambiguous bases, so
    # every symbol reaching the loop below is already one of A, C, G, T and needs no
    # further per-base validation.
    twobit = LongDNA{2}(x)
    if length(twobit) > 16
        # Thrown as a plain String on purpose: existing callers and tests match on it.
        throw("Sequence too long to save as UInt32.")
    end

    y = zero(UInt32)
    for c in twobit
        nt = convert(DNA, c)
        y = (y << 2) | UInt32(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
    end

    # With at most 16 two-bit codes, y never exceeds 2 * length(x) bits, so no masking
    # is needed before returning.
    return y
end


# this is needed inside saca.jl
# TODO make sure the queried sequences are from the same type
# or conform to the same encoding as the reference
Expand All @@ -181,7 +203,7 @@ end
end


@inline function BioSequences.LongDNA{4}(x::UInt64, len::Int)
@inline function BioSequences.LongDNA{4}(x::Union{UInt64, UInt32}, len::Int)
y = []
for i in 1:len
push!(y, reinterpret(DNA, 0x01 << ((x >> 2(len - i)) & 0b11)))
Expand Down
15 changes: 15 additions & 0 deletions test/src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,21 @@ using Combinatorics
@test isempty(findall(dna"AAANN", dna"ACTGAAANACTG"; ambig_max = 0))
end


@testset "UInt32 conversion" begin
    # Round trip of a full-width (16 bp) sequence.
    full = dna"AAAAAATGCTACTGCG"
    @test LongDNA{4}(convert(UInt32, full), length(full)) == full

    # Gaps cannot be packed, and sequences longer than 16 bp are rejected with a String.
    @test_throws BioSequences.EncodeError convert(UInt32, dna"A-A")
    @test_throws String convert(UInt32, dna"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")

    # Randomised round trips; the requested length is bounded above by `maxlen`.
    for _ in 1:10000, maxlen in 1:16
        seq = getseq(ceil(Int, rand() * maxlen), ['A', 'C', 'G', 'T'])
        @test String(LongDNA{4}(convert(UInt32, seq), length(seq))) == String(copy(seq))
    end
end

@testset "UInt64 conversion" begin
x = dna"AAAAAATGCTACTG"
@test LongDNA{4}(convert(UInt64, x), length(x)) == x
Expand Down

0 comments on commit 9bd3cae

Please sign in to comment.