Implementation

#include <bitset>
#include <string>
#include <iostream>
#include <random>
#include <vector>
#include <cassert>
#include <iomanip>
#include <fstream>

using namespace std;


// MurmurHash2, 64-bit versions, by Austin Appleby
typedef unsigned __int64 uint64_t;
using cull = const unsigned long long;
using ull = unsigned long long;

uint64_t MurmurHash64A(const void * key, int len, unsigned int seed)
{
    const uint64_t m = 0xc6a4a7935bd1e995;
    const int r = 47;

    uint64_t h = seed ^ (len * m);

    const uint64_t * data = (const uint64_t *)key;
    const uint64_t * end = data + (len / 8);

    while (data != end)
    {
        uint64_t k = *data++;

        k *= m;
        k ^= k >> r;
        k *= m;

        h ^= k;
        h *= m;
    }

    const unsigned char * data2 = (const unsigned char*)data;

    switch (len & 7)
    {
    case 7: h ^= uint64_t(data2[6]) << 48;
    case 6: h ^= uint64_t(data2[5]) << 40;
    case 5: h ^= uint64_t(data2[4]) << 32;
    case 4: h ^= uint64_t(data2[3]) << 24;
    case 3: h ^= uint64_t(data2[2]) << 16;
    case 2: h ^= uint64_t(data2[1]) << 8;
    case 1: h ^= uint64_t(data2[0]);
        h *= m;
    };

    h ^= h >> r;
    h *= m;
    h ^= h >> r;

    return h;
}


class BloomFilter
{
public:

    BloomFilter(cull planned_item_count, float acceptable_fpt) {
        configure_settings(planned_item_count, acceptable_fpt);
    }

    ~BloomFilter() {
        if (bit_table_) delete bit_table_;
    }

    // seralize bloom filter to file
    void dump(const char* out_path) {
        std::ofstream out(out_path);
        out << _to_string();
        out.close();
    }

    // add content
    void add(const std::string key) {
        uint64_t seed = 0;
        for (size_t i = 0; i < num_hash_functions_; i++)
        {
            uint64_t hash = MurmurHash64A(key.c_str(), key.length(), seed);
            seed = hash; // next hash is effectively random for next seed
            hash %= table_size_;
            (*bit_table_)[hash] = true;
        }
        ++inserted_element_count_;
    }

    inline void clear() const {
        fill(bit_table_->begin(), bit_table_->end(), false);
    }

    // check if content exists 
    inline bool contains(const string &key) const {
        unsigned int seed = 0;
        for (size_t i = 0; i < num_hash_functions_; i++)
        {
            uint64_t hash = MurmurHash64A(key.c_str(), key.length(), seed);
            seed = hash; // proceed to next hash function
            hash %= table_size_;
            if (!(*bit_table_)[hash])
                return false;
        }
        return true;
    }

    inline unsigned long long int element_count() const {
        return inserted_element_count_;
    }

    /*
     * The effective false positive probability is calculated in
     * conjunction with the current number of inserted elements
     */
    inline double effective_fpp() const {
        return std::pow(1.0 - std::exp(-1.0 * num_hash_functions_ * 
                              inserted_element_count_ / table_size_), double(num_hash_functions_));
    }

private:

    vector<bool>  *bit_table_;
    ull              table_size_;
    ull              projected_element_count_;
    ull              inserted_element_count_;
    double        desired_false_positive_probability_;
    int              num_hash_functions_;

    string _to_string() {
        string *s = new string();
        s->reserve(table_size_);
        for (bool bit : *bit_table_) {
            if (bit) s->push_back('1');
            else s->push_back('0');
        }
        return *s;
    }

    void configure_settings(cull planned_item_count, float acceptable_fpt) {
        bit_table_ = nullptr;
        table_size_ = 
            compute_optimal_bit_table_size(planned_item_count, acceptable_fpt);
        num_hash_functions_ =
            compute_optimal_hash_count(table_size_, planned_item_count);
        projected_element_count_ = planned_item_count;
        inserted_element_count_ = 0;
        desired_false_positive_probability_ = acceptable_fpt;
        bit_table_ = new vector<bool>(table_size_);
    }

    const int compute_optimal_bit_table_size(int planned_elements, double acceptable_false_probability) {
        return abs(planned_elements * log(acceptable_false_probability) / pow(log(2), 2));
    }

    const unsigned int compute_optimal_hash_count(cull opt_bit_table_size, cull planned_elements) {
        return abs(opt_bit_table_size / planned_elements * log(2));
    }
};


void testBloomFilter(double false_prob_threshold)
{
    const string infile = "../infile/in.txt";
    BloomFilter bf(1693, false_prob_threshold); // 1/10000 error
    string line;
    ifstream in(infile);

    // Insert some strings (real words)
    while (getline(in, line)) {
        bf.add(line);
        cout << "Added: " << setw(15) << line;
        cout << "\tEffective FP: " << bf.effective_fpp() << endl;
    }
    cout << "\n\n";

    // Query the existence of strings 
    ifstream in2(infile);

    while (getline(in2, line))
        cout << "Contains: " << line << " -> " << boolalpha << bf.contains(line) << endl;
    cout << "\n\n";

    // Query the existence of invalid strings (gibberish)
    vector<string> gib = {"sqowow", "asdb", "asdagg", "qoweok", "eobs"};
    for (string s : gib) 
        cout << "Contains: " << s << " -> " << boolalpha << bf.contains(s) << endl;
    cout << "\n\n";
}

int main()
{
    double fpt = .01;
    while (fpt < 1) {
        testBloomFilter(fpt);
        fpt *= 10;
    }
}

Last updated