Move from UInt64 to UInt32; the decrease in memory will probably mean an increase in speed
JokingHero · Feb 21, 2024 · 9bd3cae · 9bd3cae
1 parent 16f5582
commit 9bd3cae
Show file tree

Hide file tree

Showing 4 changed files with 70 additions and 12 deletions.
diff --git a/src/db_linear_hash.jl b/src/db_linear_hash.jl
@@ -8,7 +8,7 @@ struct SuffixHashDB
     suffix::Vector{LongDNA{4}}
     suffix_loci_idx::Vector{LociRange}
     loci::Vector{Loc}
-    hash::Vector{UInt64}
+    hash::Vector{UInt32}
 end
 
 
@@ -55,7 +55,7 @@ if interested with searches within distance 4, preferably use prefix length of
 
 `prefix_len`  - Size of the prefix by which off-targets are indexed. Prefix of 8 or larger will be the fastest,
                 however it will also result in large number of files.
-`hash_len` - Length of the hash in bp. Should be below 20bp - maximum alignment distance.
+`hash_len` - Length of the hash in bp. At most 16, because the hash is stored
+             as a UInt32 (2 bits per bp).
 
 # Examples
 ```julia-repl
@@ -68,14 +68,14 @@ function build_linearHashDB(
     motif::Motif,
     storage_dir::String,
     prefix_len::Int = 7,
-    hash_len::Int = (length_noPAM(motif) - motif.distance))
+    hash_len::Int = min(length_noPAM(motif) - motif.distance, 16))
 
     if prefix_len <= motif.distance
         throw("prefix_len $prefix_len is <= " * string(motif.distance))
     end
 
-    if hash_len > (length_noPAM(motif) - motif.distance)
-        throw("prefix_len $hash_len is > " * string((length_noPAM(motif) - motif.distance)))
+    if hash_len > 16
+        throw("hash_len $hash_len is more than 16")
     end
 
     dbi = DBInfo(genomepath, name, motif)
@@ -113,7 +113,7 @@ function build_linearHashDB(
         (guides, loci_range, loci) = unique_guides(guides, loci)
         # hash part guides = prefix + guides
         hashes = ThreadsX.map(guides) do guide
-            convert(UInt64, (prefix * guide)[1:hash_len])
+            convert(UInt32, (prefix * guide)[1:hash_len])
         end
         sdb = SuffixHashDB(prefix, guides, loci_range, loci, hashes)
         save(sdb, joinpath(storage_dir, string(prefix) * ".bin"))
@@ -132,7 +132,7 @@ end
 
 function search_prefix_hash(
     prefix::LongDNA{4},
-    paths_set::Vector{Set{UInt64}},
+    paths_set::Vector{Set{UInt32}},
     dist::Int,
     dbi::DBInfo,
     detail::String,
@@ -245,11 +245,11 @@ function search_linearHashDB(
 
     paths = ldb.paths[ldb.paths_distances .<= distance, :]
     paths_set = ThreadsX.map(copy(guides_)) do g
-        guides_uint64 = guide_to_template_format(g; alphabet = ALPHABET_TWOBIT)
-        ot_uint64 = guides_uint64[paths]
-        ot_uint64 = map(ARTEMIS.asUInt64, eachrow(ot_uint64))
+        guides_formated = guide_to_template_format(g; alphabet = ALPHABET_TWOBIT)
+        ot_uint32 = guides_formated[paths]
+        ot_uint32 = map(ARTEMIS.asUInt32, eachrow(ot_uint32))
         # BinaryFuseFilter{UInt32}(unique(ot_uint64)) # very space efficient!!!
-        return Set(ot_uint64)
+        return Set(ot_uint32)
     end
 
     mkpath(dirname(output_file))

diff --git a/src/motif_path_templates.jl b/src/motif_path_templates.jl
@@ -504,6 +504,27 @@ function asUInt64(x::AbstractVecOrMat)
 end
 
 
+"""
+```
+asUInt32(x::AbstractVecOrMat)
+```
+
+Helper that packs a DNA string of at most 16bp into a single UInt32.
+The size of the vector is not checked, so make sure it is <= 16bp.
+
+    matp = guide_in_template_format[pathTemplate]
+    map(asUInt32, eachrow(matp))
+"""
+function asUInt32(x::AbstractVecOrMat)
+    # Shift every element in from the right, two bits at a time, so the first
+    # element ends up in the most significant occupied bits.
+    packed = foldl((acc, code) -> (acc << 2) | UInt32(code), x; init = zero(UInt32))
+    # Keep only the lowest 2 * length(x) bits; for 16 or more elements the shift
+    # yields zero and the subtraction wraps around to an all-ones mask.
+    keep = (one(UInt32) << (2 * length(x))) - one(UInt32)
+    return packed & keep
+end
+
+
 """
 ```
 duplicated(x::Vector{UInt64})

diff --git a/src/utils.jl b/src/utils.jl
@@ -162,6 +162,28 @@ function Base.convert(::Type{UInt64}, x::LongDNA{4})
 end
 
 
+"""
+```
+Base.convert(::Type{UInt32}, x::LongDNA{4})
+```
+
+Pack `x` into one `UInt32` using two bits per nucleotide, with the first
+nucleotide ending up in the most significant occupied bits.
+
+`x` is re-encoded as `LongDNA{2}` first. Gaps and ambiguous nucleotides are
+rejected by that re-encoding (the test-suite expects `BioSequences.EncodeError`
+for them), so no separate per-nucleotide validation follows.
+
+Throws a `String` when `x` is longer than 16bp, as it would not fit in 32 bits.
+"""
+function Base.convert(::Type{UInt32}, x::LongDNA{4})
+    # Use a separate binding instead of rebinding `x`, so that no variable
+    # changes its type halfway through the function.
+    twobit = LongDNA{2}(x)
+    if length(twobit) > 16
+        throw("Sequence too long to save as UInt32.")
+    end
+
+    y = zero(UInt32)
+    for c in twobit
+        nt = convert(DNA, c)
+        # `twobitnucs` is indexed by the 4-bit encoding of the nucleotide (+1
+        # for 1-based indexing) and its entry is OR-ed into the low two bits.
+        y = (y << 2) | UInt32(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
+    end
+
+    # Keep only the lowest 2 * length bits (all 32 bits when the length is 16).
+    mask = (one(UInt32) << (2 * length(twobit))) - one(UInt32)
+    return y & mask
+end
+
+
 # this is needed inside saca.jl
 # TODO make sure the queried sequences are from the same type
 # or conform to the same encoding as the reference
@@ -181,7 +203,7 @@ end
 end
 
 
-@inline function BioSequences.LongDNA{4}(x::UInt64, len::Int)
+@inline function BioSequences.LongDNA{4}(x::Union{UInt64, UInt32}, len::Int)
     y = []
     for i in 1:len
         push!(y, reinterpret(DNA, 0x01 << ((x >> 2(len - i)) & 0b11)))

diff --git a/test/src/utils.jl b/test/src/utils.jl
@@ -73,6 +73,21 @@ using Combinatorics
         @test isempty(findall(dna"AAANN", dna"ACTGAAANACTG"; ambig_max = 0))
     end
 
+
+    @testset "UInt32 conversion" begin
+        # Round trip of a full-width (16bp) sequence.
+        full = dna"AAAAAATGCTACTGCG"
+        @test LongDNA{4}(convert(UInt32, full), length(full)) == full
+        # A gap cannot be packed.
+        @test_throws BioSequences.EncodeError convert(UInt32, dna"A-A")
+        # More than 16bp cannot be packed.
+        @test_throws String convert(UInt32, dna"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")
+
+        # Randomised round trips; the requested length never exceeds 16.
+        for _ in 1:10000, j in 1:16
+            len = ceil(Int, rand() * j)
+            s = getseq(len, ['A', 'C', 'G', 'T'])
+            packed = convert(UInt32, s)
+            @test String(LongDNA{4}(packed, length(s))) == String(copy(s))
+        end
+    end
+
     @testset "UInt64 conversion" begin
         x = dna"AAAAAATGCTACTG"
         @test LongDNA{4}(convert(UInt64, x), length(x)) == x