diff --git a/dedeup.go b/dedeup.go
new file mode 100644
index 0000000..d6385db
--- /dev/null
+++ b/dedeup.go
@@ -0,0 +1,84 @@
+package rgen
+
+import (
+	"crypto/md5"
+	"math/big"
+)
+
+// Deduper is an object that can be used to deduplicate strings.
+type Deduper interface {
+	// Unique records st and reports whether it is seen for the first time.
+	// If Unique returns true, the string is guaranteed to be seen for the first time.
+	// If Unique returns false, the string was most probably already seen.
+	Unique(st string) bool
+}
+
+// -------------------------------------
+
+// NewDedupMap creates a map-based Deduper.
+// Unique always provides an exact response, but the memory footprint keeps
+// growing with every new string.
+// Avoid in production.
+func NewDedupMap() Deduper {
+	return &dedupmap{m: make(map[string]struct{}, 10)}
+}
+
+// dedupmap is an exact Deduper that remembers every string it has seen.
+type dedupmap struct {
+	m map[string]struct{} // set of the strings seen so far
+}
+
+// Unique records st and reports whether it is seen for the first time.
+func (d *dedupmap) Unique(st string) bool {
+	if _, seen := d.m[st]; seen {
+		return false
+	}
+	d.m[st] = struct{}{}
+	return true
+}
+
+// ----------------------------------------
+
+// NewDedupBloom creates a bloom-filter based Deduper.
+// The memory footprint is bounded, driven by the bloomsize setting in bits.
+// A non-positive bloomsize falls back to DefaultBloomSize.
+// A true result from Unique is always right, but as the filter fills up, a
+// false result could be wrong (a string appears as non unique, despite really
+// being unique).
+// The larger the bloomsize, the lower the error rate.
+func NewDedupBloom(bloomsize int) Deduper {
+	if bloomsize <= 0 {
+		// A zero size would make Unique divide by zero.
+		bloomsize = DefaultBloomSize
+	}
+	return &dedupbloom{bs: bloomsize, z: big.NewInt(0)}
+}
+
+// dedupbloom is a Deduper backed by a bit field of bs bits.
+type dedupbloom struct {
+	bs int      // bloomsize in bits
+	z  *big.Int // we use this big.Int as a bit field of bs bits
+}
+
+// DefaultBloomSize is a reasonable default filter size, in bits, for up to a
+// few hundred thousand different strings (approx. 33M bytes memory footprint).
+const DefaultBloomSize = 256 * 256 * 256 * 16 // total filter size
+
+// Unique sets the bits derived from the MD5 sum of st and reports whether at
+// least one of them was not set before.
+func (d *dedupbloom) Unique(st string) bool {
+	hh := md5.Sum([]byte(st)) // hh is 16 bytes; each byte selects one bit.
+	size := uint64(d.bs)
+	changed := false
+	var bb uint64
+	for _, b := range hh {
+		// Chain the indexes in unsigned arithmetic: the result can never be
+		// negative, even where int is only 32 bits wide.
+		bb = (bb*17 + 13*uint64(b) + 11) % size // which bit to set ?
+		if d.z.Bit(int(bb)) == 0 {
+			d.z.SetBit(d.z, int(bb), 1)
+			changed = true
+		}
+	}
+	return changed
+}
diff --git a/dedup_test.go b/dedup_test.go
new file mode 100644
index 0000000..fd19a73
--- /dev/null
+++ b/dedup_test.go
@@ -0,0 +1,35 @@
+package rgen
+
+import (
+	"fmt"
+	"math/rand"
+	"testing"
+)
+
+func TestUnique(t *testing.T) {
+	// create map and bloom dedup
+	b1, b2 := NewDedupMap(), NewDedupBloom(DefaultBloomSize)
+
+	// generate random numbers, and ensure uniqueness responses are the same.
+	rd := rand.New(rand.NewSource(42))
+	for i := 0; i < 10_000; i++ {
+		tt := fmt.Sprint(rd.Intn(1_000))
+		u1, u2 := b1.Unique(tt), b2.Unique(tt)
+		if u1 != u2 {
+			t.Errorf("Unique(%q): map says %v, bloom says %v", tt, u1, u2)
+		}
+	}
+	bb := b2.(*dedupbloom)
+	t.Log("Actual bloom size in Bytes:", len(bb.z.Bytes()))
+}
+
+// TestDedupBloomZeroSize checks that a zero bloomsize does not make Unique panic.
+func TestDedupBloomZeroSize(t *testing.T) {
+	d := NewDedupBloom(0)
+	if !d.Unique("a") {
+		t.Error("first call should report the string as unique")
+	}
+	if d.Unique("a") {
+		t.Error("second call should report the string as already seen")
+	}
+}
diff --git a/version.go b/version.go
index a4cafe9..f3329a6 100644
--- a/version.go
+++ b/version.go
@@ -1,6 +1,7 @@
 package rgen
 
+// v0.4.2 dedup added (bloom filter)
 // v0.4.1 complete redesign from 0.3.x
 
-const VERSION = "0.4.1"
+const VERSION = "0.4.2"
 const COPYRIGHT = "(c) xavier gandillot 2022-2024"