Skip to content

Commit

Permalink
Removed murmur3 (#33)
Browse files: browse the repository at this point in the history
  • Loading branch information
Koeng101 authored Dec 20, 2023
1 parent 9851184 commit 8e6f32d
Show file tree
Hide file tree
Showing 10 changed files with 26 additions and 801 deletions.
5 changes: 3 additions & 2 deletions lib/mash/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@ package mash_test

import (
"fmt"
"hash/crc32"

"github.com/koeng101/dnadesign/lib/mash"
)

func ExampleMash() {
fingerprint1 := mash.New(17, 10)
fingerprint1 := mash.New(17, 10, crc32.NewIEEE())
fingerprint1.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")

fingerprint2 := mash.New(17, 9)
fingerprint2 := mash.New(17, 9, crc32.NewIEEE())
fingerprint2.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")

distance := fingerprint1.Distance(fingerprint2)
Expand Down
19 changes: 12 additions & 7 deletions lib/mash/mash.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,24 +43,28 @@ Tim
package mash

import (
"encoding/binary"
"hash"
"sort"
)

"github.com/koeng101/dnadesign/lib/mash/murmur3"
) // murmur3 is a fast non-cryptographic hash algorithm that was also used in the original papers-> https://github.com/shenwei356/go-hashing-kmer-bench
// murmur3 is a fast non-cryptographic hash algorithm that was also used in the original papers. See https://github.com/shenwei356/go-hashing-kmer-bench

// Mash is a collection of hashes of kmers from a given sequence.
type Mash struct {
KmerSize int // The kmer size is the size of the sliding window that is used to generate the hashes.
SketchSize int // The sketch size is the number of hashes to store.
Sketches []uint32 // The sketches are the hashes of the kmers that we can compare to other sketches.
KmerSize int // The kmer size is the size of the sliding window that is used to generate the hashes.
SketchSize int // The sketch size is the number of hashes to store.
Sketches []uint32 // The sketches are the hashes of the kmers that we can compare to other sketches.
Hash hash.Hash // Hash is the go standard library hashing interface. Can be used to switch algorithms.
}

// New initializes a new mash sketch.
func New(kmerSize int, sketchSize int) *Mash {
func New(kmerSize int, sketchSize int, hashInterface hash.Hash) *Mash {
return &Mash{
KmerSize: kmerSize,
SketchSize: sketchSize,
Sketches: make([]uint32, sketchSize),
Hash: hashInterface,
}
}

Expand All @@ -73,7 +77,8 @@ func (mash *Mash) Sketch(sequence string) {
for kmerStart := 0; kmerStart < len(sequence)-mash.KmerSize; kmerStart++ {
kmer := sequence[kmerStart : kmerStart+mash.KmerSize]
// hash the kmer to a 32 bit number
hash := murmur3.Sum32([]byte(kmer))
hashBytes := mash.Hash.Sum([]byte(kmer))
hash := binary.BigEndian.Uint32(hashBytes[:4]) // need to convert []byte to uint32
// keep the minimum hash value of all the kmers in the window up to a given sketch size
// the sketch is a vector of the minimum hash values

Expand Down
21 changes: 11 additions & 10 deletions lib/mash/mash_test.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
package mash_test

import (
"hash/crc32"
"testing"

"github.com/koeng101/dnadesign/lib/mash"
)

func TestMash(t *testing.T) {
fingerprint1 := mash.New(17, 10)
fingerprint1 := mash.New(17, 10, crc32.NewIEEE())
fingerprint1.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")

fingerprint2 := mash.New(17, 9)
fingerprint2 := mash.New(17, 9, crc32.NewIEEE())
fingerprint2.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")

distance := fingerprint1.Distance(fingerprint2)
Expand All @@ -23,36 +24,36 @@ func TestMash(t *testing.T) {
t.Errorf("Expected distance to be 0, got %f", distance)
}

spoofedFingerprint := mash.New(17, 10)
spoofedFingerprint := mash.New(17, 10, crc32.NewIEEE())
spoofedFingerprint.Sketches[0] = 0

distance = fingerprint1.Distance(spoofedFingerprint)
if distance != 1 {
t.Errorf("Expected distance to be 1, got %f", distance)
}

spoofedFingerprint = mash.New(17, 9)
spoofedFingerprint = mash.New(17, 9, crc32.NewIEEE())

distance = fingerprint1.Distance(spoofedFingerprint)
if distance != 1 {
t.Errorf("Expected distance to be 1, got %f", distance)
}

fingerprint1 = mash.New(17, 10)
fingerprint1 = mash.New(17, 10, crc32.NewIEEE())
fingerprint1.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")

fingerprint2 = mash.New(17, 5)
fingerprint2 = mash.New(17, 5, crc32.NewIEEE())
fingerprint2.Sketch("ATCGATCGATCGATCGATCGATCGATCGATCGATCGAATGCGATCGATCGATCGATCGATCG")

distance = fingerprint1.Distance(fingerprint2)
if !(distance > 0.19 && distance < 0.21) {
t.Errorf("Expected distance to be 0.19999999999999996, got %f", distance)
}

fingerprint1 = mash.New(17, 10)
fingerprint1 = mash.New(17, 10, crc32.NewIEEE())
fingerprint1.Sketch("ATCGATCGATCGATCGATCGATCGATCGATCGATCGAATGCGATCGATCGATCGATCGATCG")

fingerprint2 = mash.New(17, 5)
fingerprint2 = mash.New(17, 5, crc32.NewIEEE())
fingerprint2.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")

distance = fingerprint1.Distance(fingerprint2)
Expand All @@ -62,10 +63,10 @@ func TestMash(t *testing.T) {
}

func BenchmarkMashDistancee(b *testing.B) {
fingerprint1 := mash.New(17, 10)
fingerprint1 := mash.New(17, 10, crc32.NewIEEE())
fingerprint1.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")

fingerprint2 := mash.New(17, 9)
fingerprint2 := mash.New(17, 9, crc32.NewIEEE())
fingerprint2.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA")

for i := 0; i < b.N; i++ {
Expand Down
24 changes: 0 additions & 24 deletions lib/mash/murmur3/LICENSE

This file was deleted.

86 changes: 0 additions & 86 deletions lib/mash/murmur3/README.md

This file was deleted.

64 changes: 0 additions & 64 deletions lib/mash/murmur3/murmur.go

This file was deleted.

Loading

0 comments on commit 8e6f32d

Please sign in to comment.