-
Notifications
You must be signed in to change notification settings - Fork 0
/
CountingBloomFilter.hpp
99 lines (88 loc) · 3.33 KB
/
CountingBloomFilter.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#pragma once
#include <vector>
#include <cstdint>
#include <string>
#include <iostream>
#include <stdexcept>
#include "hasher/HashGen32.hpp"
#include "Utils.hpp"
class CountingBloomFilter
{
public:
// Constructor that automatically calculates optimal size and number of hash functions
CountingBloomFilter(int n, double p, const std::string &algorithm = "sha256", const std::string &scheme = Hash32::SCHEME_ENHANCED_DOUBLE_HASHING)
: algorithm_(algorithm), scheme_(scheme)
{
// Calculate the optimal number of bits (m) and hash functions (k) based on n (expected elements) and p (false positive rate)
size_ = BloomFilterUtils::CalculateOptimalM(n, p);
k_ = BloomFilterUtils::CalculateOptimalK(size_, n);
maxRange_ = 1000; // Default, can be adjusted or calculated if necessary
filter_.resize(size_, 0); // Initialize filter with 0
}
// Constructor that allows creating with arbitrary size and k (number of hash functions)
CountingBloomFilter(int k, int size, const std::string &algorithm = "sha256", const std::string &scheme = Hash32::SCHEME_ENHANCED_DOUBLE_HASHING)
: k_(k), size_(size), algorithm_(algorithm), scheme_(scheme)
{
if (size <= 0)
{
throw std::invalid_argument("Bloom filter size must be greater than 0.");
}
filter_.resize(size_, 0); // Initialize filter with 0
}
// Existing method to add data to the filter
void Add(const std::vector<uint8_t> &data)
{
auto indices = hashGen_.Execute(data, algorithm_, scheme_);
for (auto index : *indices)
{
// Increment the count for the bucket at 'index'
filter_[index % size_] += 1;
}
}
// Check if data exists in the filter
bool Contains(const std::vector<uint8_t> &data)
{
auto indices = hashGen_.Execute(data, algorithm_, scheme_);
for (auto index : *indices)
{
// Check if any of the corresponding buckets have a count of 0
if (filter_[index % size_] == 0)
{
return false;
}
}
return true;
}
// Remove data from the filter
void Remove(const std::vector<uint8_t> &data)
{
auto indices = hashGen_.Execute(data, algorithm_, scheme_);
for (auto index : *indices)
{
// Decrement the count for the bucket at 'index'
if (filter_[index % size_] > 0)
{
filter_[index % size_] -= 1;
}
}
}
// Print the internal state of the Bloom filter
void Print() const
{
for (size_t i = 0; i < filter_.size(); ++i)
{
std::cout << "Bucket " << i << ": " << filter_[i] << std::endl;
}
}
int K() { return k_; }
int MaxRange() { return maxRange_; }
int Size() { return size_; };
private:
int k_; // Number of hash functions
int maxRange_; // Maximum range for the hashes
int size_; // Number of buckets in the Bloom filter
std::string algorithm_; // Hash algorithm to use
std::string scheme_; // Hashing scheme to use (default: EDH)
Hash32::HashGen hashGen_; // Hash generator for creating hash indices
std::vector<uint8_t> filter_; // Bloom filter (counter array)
};