Implementation
#include <bitset>
#include <string>
#include <iostream>
#include <random>
#include <vector>
#include <cassert>
#include <iomanip>
#include <fstream>
using namespace std;
// MurmurHash2, 64-bit versions, by Austin Appleby
typedef unsigned __int64 uint64_t;
using cull = const unsigned long long;
using ull = unsigned long long;
uint64_t MurmurHash64A(const void * key, int len, unsigned int seed)
{
const uint64_t m = 0xc6a4a7935bd1e995;
const int r = 47;
uint64_t h = seed ^ (len * m);
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len / 8);
while (data != end)
{
uint64_t k = *data++;
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
}
const unsigned char * data2 = (const unsigned char*)data;
switch (len & 7)
{
case 7: h ^= uint64_t(data2[6]) << 48;
case 6: h ^= uint64_t(data2[5]) << 40;
case 5: h ^= uint64_t(data2[4]) << 32;
case 4: h ^= uint64_t(data2[3]) << 24;
case 3: h ^= uint64_t(data2[2]) << 16;
case 2: h ^= uint64_t(data2[1]) << 8;
case 1: h ^= uint64_t(data2[0]);
h *= m;
};
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
class BloomFilter
{
public:
BloomFilter(cull planned_item_count, float acceptable_fpt) {
configure_settings(planned_item_count, acceptable_fpt);
}
~BloomFilter() {
if (bit_table_) delete bit_table_;
}
// seralize bloom filter to file
void dump(const char* out_path) {
std::ofstream out(out_path);
out << _to_string();
out.close();
}
// add content
void add(const std::string key) {
uint64_t seed = 0;
for (size_t i = 0; i < num_hash_functions_; i++)
{
uint64_t hash = MurmurHash64A(key.c_str(), key.length(), seed);
seed = hash; // next hash is effectively random for next seed
hash %= table_size_;
(*bit_table_)[hash] = true;
}
++inserted_element_count_;
}
inline void clear() const {
fill(bit_table_->begin(), bit_table_->end(), false);
}
// check if content exists
inline bool contains(const string &key) const {
unsigned int seed = 0;
for (size_t i = 0; i < num_hash_functions_; i++)
{
uint64_t hash = MurmurHash64A(key.c_str(), key.length(), seed);
seed = hash; // proceed to next hash function
hash %= table_size_;
if (!(*bit_table_)[hash])
return false;
}
return true;
}
inline unsigned long long int element_count() const {
return inserted_element_count_;
}
/*
* The effective false positive probability is calculated in
* conjunction with the current number of inserted elements
*/
inline double effective_fpp() const {
return std::pow(1.0 - std::exp(-1.0 * num_hash_functions_ *
inserted_element_count_ / table_size_), double(num_hash_functions_));
}
private:
vector<bool> *bit_table_;
ull table_size_;
ull projected_element_count_;
ull inserted_element_count_;
double desired_false_positive_probability_;
int num_hash_functions_;
string _to_string() {
string *s = new string();
s->reserve(table_size_);
for (bool bit : *bit_table_) {
if (bit) s->push_back('1');
else s->push_back('0');
}
return *s;
}
void configure_settings(cull planned_item_count, float acceptable_fpt) {
bit_table_ = nullptr;
table_size_ =
compute_optimal_bit_table_size(planned_item_count, acceptable_fpt);
num_hash_functions_ =
compute_optimal_hash_count(table_size_, planned_item_count);
projected_element_count_ = planned_item_count;
inserted_element_count_ = 0;
desired_false_positive_probability_ = acceptable_fpt;
bit_table_ = new vector<bool>(table_size_);
}
const int compute_optimal_bit_table_size(int planned_elements, double acceptable_false_probability) {
return abs(planned_elements * log(acceptable_false_probability) / pow(log(2), 2));
}
const unsigned int compute_optimal_hash_count(cull opt_bit_table_size, cull planned_elements) {
return abs(opt_bit_table_size / planned_elements * log(2));
}
};
void testBloomFilter(double false_prob_threshold)
{
const string infile = "../infile/in.txt";
BloomFilter bf(1693, false_prob_threshold); // 1/10000 error
string line;
ifstream in(infile);
// Insert some strings (real words)
while (getline(in, line)) {
bf.add(line);
cout << "Added: " << setw(15) << line;
cout << "\tEffective FP: " << bf.effective_fpp() << endl;
}
cout << "\n\n";
// Query the existence of strings
ifstream in2(infile);
while (getline(in2, line))
cout << "Contains: " << line << " -> " << boolalpha << bf.contains(line) << endl;
cout << "\n\n";
// Query the existence of invalid strings (gibberish)
vector<string> gib = {"sqowow", "asdb", "asdagg", "qoweok", "eobs"};
for (string s : gib)
cout << "Contains: " << s << " -> " << boolalpha << bf.contains(s) << endl;
cout << "\n\n";
}
int main()
{
double fpt = .01;
while (fpt < 1) {
testBloomFilter(fpt);
fpt *= 10;
}
}
Last updated