Implementation

Data structures:

  • Priority queue

  • Prefix tree

English

  • Place a node for every character into a priority queue, ordered by ascending frequency (a min-heap).

  • Select the two minimum frequencies (delete-min twice), sum them together, and insert the combined frequency f_{w+1} back into the queue.

  • Use these merges to build the internal nodes of a binary tree, bottom-up; each merge adds one additional bit to the codes of all leaves in the two subtrees involved. The leaf nodes at the bottom hold the original characters.

  • Stop once the priority queue holds a single element (the root node has been reached). A minimal sketch of this merge loop follows the list.
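
A minimal sketch of just this merge loop, assuming hypothetical frequencies {5, 2, 1, 1}; unlike the full implementation further down, it never builds the tree and only totals the encoded size:

#include <functional>
#include <iostream>
#include <queue>
#include <vector>
using namespace std;

int main()
{
    vector<unsigned> freqs = { 5, 2, 1, 1 };   // hypothetical character frequencies
    priority_queue<unsigned, vector<unsigned>, greater<unsigned>> pq(freqs.begin(), freqs.end());

    unsigned total_bits = 0;
    while (pq.size() > 1) {
        unsigned a = pq.top(); pq.pop();   // delete-min x2
        unsigned b = pq.top(); pq.pop();
        total_bits += a + b;               // every leaf under this merge gains one bit
        pq.push(a + b);                    // insert the combined frequency f_{w+1}
    }
    cout << "total encoded bits: " << total_bits << endl;   // prints 15 for {5, 2, 1, 1}
}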

Self Notes

  • Node represents both an internal node within the tree and a base (leaf) node for an original character. I used two different constructors to distinguish them.

  • Analyzing the file just meant finding the frequency of each character and then doing some math to find the total number of bits used. I used the following calculations to analyze the compression (a small worked example follows the formulas).

// original (fixed-width encoding)
bit_length = ceil(log2(#unique_chars))
bits_per_char = bit_length * freq
total_bits = sum(bits_per_char)

// compressed (Huffman encoding)
count_bits = # of bits on the path from a base (leaf) node up to the root
bits_per_char = count_bits * freq
total_bits = sum(bits_per_char)
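
As a hypothetical worked example, for a file whose only characters appear with frequencies a:5, b:2, c:1, d:1, these formulas give:

// original: 4 unique chars -> bit_length = ceil(log2(4)) = 2
total_bits = 2*5 + 2*2 + 2*1 + 2*1 = 18

// compressed: the Huffman code lengths come out to a:1, b:2, c:3, d:3
total_bits = 1*5 + 2*2 + 3*1 + 3*1 = 15
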
  • Initially I stored the base characters, the leaves of the tree, in the base_nodes hash set.

  • Inserting a node meant having two existing nodes point to a newly combined parent node.

  • With id_char_bit_seq, I decided not to go the extra mile of hashing the user id; a linear scan over base_nodes is fine since the number of unique characters will not exceed 256 (extended ASCII). A sketch of the hashed alternative follows this note.
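
For reference, the hashed alternative would only amount to an unordered_map lookup keyed by the character. The snippet below is a standalone, hypothetical illustration (the code_table contents are made up); in the class below the mapped value would be a Node* rather than a ready-made code string:

#include <iostream>
#include <string>
#include <unordered_map>
using namespace std;

int main()
{
    // hypothetical code table: character -> bit sequence as a string
    unordered_map<char, string> code_table = { {'a', "0"}, {'b', "10"}, {'c', "11"} };

    auto it = code_table.find('b');            // O(1) average lookup instead of a linear scan
    if (it != code_table.end())
        cout << "b: " << it->second << endl;
    else
        cout << "b: not in table" << endl;
}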

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <queue>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>
using namespace std;

// (character, frequency, total bits under the original fixed-width encoding)
using Data = tuple<char, unsigned, unsigned>;

class Node {
public:
    Node(const int _id, const int _freq, const char _usr_id) : // base (leaf) nodes
        hash_id(_id), freq(_freq), user_id(_usr_id), right(false), parent(nullptr) {}

    Node(const int _id, const int _freq) : // internal nodes (no character of their own)
        hash_id(_id), freq(_freq), user_id(0), right(false), parent(nullptr) {}

    int hash_id, freq;
    char user_id;
    bool right;      // true if this node is the right child of its parent
    Node *parent;
};

struct pq_Cmptor
{
    bool operator()(const Node *lhs, const Node *rhs) const
    {
        return lhs->freq > rhs->freq;
    }
};

class HuffmanTree
{
public:
    HuffmanTree(const string &filepath) {
        base_file_info = analyze_file(filepath);
        base_nodes.reserve(base_file_info.size());
    }

    void huffman_compression()
    {
        // add base elements
        for (auto i : base_file_info) {
            // hash id = char + frequency
            int identifier = (int)get<0>(i) + (int)get<1>(i);
            base_nodes.insert(new Node(identifier, get<1>(i), get<0>(i)));
        }

        // add pq base elements
        priority_queue<Node*, vector<Node*>, pq_Cmptor> pq;
        for (auto i : base_nodes) {
            pq.push(i);
        }

        // begin main algorithm
        while (pq.size() > 1) {
            Node *min1 = pq.top(); pq.pop();
            Node *min2 = pq.top(); pq.pop();

            // new hash id = (char1 + freq1) + (char2 + freq2)
            // (internal nodes contribute user_id == 0)
            int identifier = (int)min1->user_id + min1->freq
                + (int)min2->user_id + min2->freq;

            // constructor for internal node
            Node *newNode = new Node(identifier, min1->freq + min2->freq);
            pq.push(newNode);

            insert(min1, min2, newNode);
        }
    }

    void analyze_compression()
    {
        cout << "Original File Statistics" << endl;
        print_base_fileinfo(); cout << endl;

        cout << "All Data Statistics Compressed" << endl;
        print_cmprsd_fileinfo(); cout << endl;

        cout << "Compression File Reduction" << endl;
        float cmp_r = 1 - ceilf(((float)total_bits_new / (float)total_bits_ori) * 100) / 100;
        cout << "1 - " << total_bits_new << "/" << total_bits_ori << " = " << cmp_r << endl;
    }

    vector<bool> id_char_bit_seq(char user_id)
    {
        // find the id
        Node *baseNode = nullptr;
        for (auto &i : base_nodes) {
            if (user_id == i->user_id) {
                baseNode = i;
                break;
            }
        }
        return get_bit_seq(baseNode);
    }

    string bit_seq_to_string(const vector<bool> &seq)
    {
        string str_seq = "";
        for (bool i : seq) {
            if (i) str_seq += "1";
            else str_seq += "0";
        }
        return str_seq;
    }

    void print_all_bit_seq()
    {
        for (auto &i : base_nodes)
            cout << i->user_id << ": " << setw(15) << 
                bit_seq_to_string(get_bit_seq(i)) << endl;
    }

private:
    vector<Data> base_file_info;
    unordered_set<Node*> base_nodes;
    int total_bits_new = 0, total_bits_ori = 0;

    // link two nodes under a new parent; left child encodes 0, right child encodes 1
    void insert(Node *left, Node *right, Node *newNode)
    {
        left->right = false;
        right->right = true;
        left->parent = newNode;
        right->parent = newNode;
    }

    void print_base_fileinfo()
    {
        cout << setw(10) << "char" << setw(10) << "freq" << 
            setw(15) << "total_bits" << endl;
        for (auto i : base_file_info) {
            total_bits_ori += get<2>(i);
            cout << setw(10) << get<0>(i) << setw(10) << get<1>(i) << 
                setw(15) << get<2>(i) << endl;
        }
        cout << endl;
    }


    void print_cmprsd_fileinfo()
    {
        cout << setw(10) << "user_id" << setw(10) << "hash_id"
            << setw(10) << "freq" << setw(15) << "total_bits" << endl;

        for (auto &i : base_nodes) {
            int total_bits = count_bits(i) * i->freq;
            total_bits_new += total_bits;
            cout << setw(10) << i->user_id << setw(10) << i->hash_id
                << setw(10) << i->freq << setw(15) << total_bits << endl;
        }
        cout << endl;
    }

    // number of edges from a base (leaf) node up to the root == code length in bits
    int count_bits(Node *baseNode) {
        Node *temp = baseNode;
        int count = 0;
        while (temp->parent != nullptr)
        {
            count++;
            temp = temp->parent;
        }
        return count;
    }

    vector<bool> get_bit_seq(Node *baseNode)
    {
        if (baseNode == nullptr) {
            cout << "Invalid Node" << endl;
            exit(1);
        }

        // walk leaf -> root recording whether each node is a right child,
        // then reverse so the code reads root -> leaf
        vector<bool> bit_seq;
        while (baseNode->parent != nullptr)
        {
            bit_seq.push_back(baseNode->right);
            baseNode = baseNode->parent;
        }
        reverse(bit_seq.begin(), bit_seq.end());

        return bit_seq;
    }

    vector<Data> analyze_file(const string &filepath)
    {
        ifstream infile(filepath, ifstream::in);
        if (!infile.is_open()) {
            cout << "Could not open " << filepath << endl;
            exit(1);
        }

        // count how often each character appears
        unordered_map<char, int> char_freq;
        char cur_char;
        while (infile.get(cur_char))
            char_freq[cur_char]++;

        // a fixed-width encoding needs ceil(log2(#unique chars)) bits per character
        vector<Data> info;
        int bit_length = (int)ceil(log2(char_freq.size()));
        for (auto &i : char_freq) {
            info.push_back(make_tuple(i.first, i.second, bit_length * i.second));
        }

        return info;
    }
};

int main()
{
    HuffmanTree compr("../InFile/random.txt");
    compr.huffman_compression();
    compr.analyze_compression();
    compr.print_all_bit_seq();
}
