Z Algorithm

Input: some string s

Output: An array Z, where every Z[i] = length of the longest substring beginning at position i that matches with a prefix of the string.

Example:

T= A B A A B
Z= 0 0 1 2 0

Naive

// without dp
vector<int> get_z_array(const string &text)
{
    if (text.empty()) return vector<int>();

    vector<int> z_array;
    z_array.reserve(text.length());

    int cur_box_left, cur_box_right;
    int start = 0, z_count = 0;
    char start_char, end_char;

    z_array.push_back(z_count);
    for (int i = 1; i < text.length(); i++) {

        int i_temp = i;
        start_char = text[start];
        end_char = text[i_temp];
        while (start_char == end_char
               && i_temp < text.length()) {
            z_count++;
            start++;
            i_temp++;
            start_char = text[start];
            end_char = text[i_temp];
        }

        z_array.push_back(z_count);
        z_count = start = 0;
    }

    return z_array;
}

In the naive approach, we iterate through the string, and at every position of the original string i, we count the number of times that the current character, matches with the start of the string (++1 as we go). $O(n^2)$

Improved

int longest_sub(const string &text, int start_i, int end_i)
{
    int temp_itr = end_i;
    char start_char = text[start_i];
    char end_char = text[temp_itr];
    int z_count = 0;

    while (start_char == end_char
        && temp_itr < text.length()) {

        z_count++;
        start_i++;
        temp_itr++;

        start_char = text[start_i];
        end_char = text[temp_itr];
    }

    return z_count;
}

// with dp
vector<int> get_z_array_wdp(const string &text)
{
    vector<int> z_array;    
    z_array.reserve(text.length());

    bool dp_next_turn = false;
    z_array.push_back(0);

    for (int i = 1; i < text.length(); i++) {
        if (dp_next_turn) {
            int prev_z_i = *(z_array.end() - 1);

            int z_box_left = i - 1;
            int z_box_right = z_box_left + prev_z_i - 1;

            int z_box_left_ref = 1;
            int z_box_right_ref = prev_z_i - 1;

            int amount_overlap = z_box_left - z_box_right_ref;
            if (amount_overlap < 0) { // there is overlap
                z_box_right -= abs(amount_overlap);
            }

            int j = i;
            for (; j < z_box_right && j < text.length(); j++) {

                int left_z_score_cpy = z_array[z_box_left_ref++];
                if (left_z_score_cpy + j > z_box_right) {
                    // start new comparisons after bound
                    int z_count = longest_sub(text, left_z_score_cpy, j + left_z_score_cpy);
                    z_array.push_back(z_count + left_z_score_cpy);
                }
                else {
                    z_array.push_back(left_z_score_cpy);
                }
            }

            i = j - 1;
            dp_next_turn = false;
        }

        else {
            int z_count = longest_sub(text, 0, i);

            if (z_count > 1) dp_next_turn = true;
            else dp_next_turn = false;

            z_array.push_back(z_count);
        }
    }

    return z_array;
}

int main(int argc, char** argv)
{
    const string test = "aabxaabxcaabxaabxay";
    print(get_z_array_wdp(test));

    const string test2 = "abababbabababc";
    print(get_z_array_wdp(test2));
}

With dynamic programming, the key idea to reduce the algorithm to linear time is to reference previously calculated z-values. The algorithm begins the same way as before. There is a single iteration through the main string, and at every i the z score becomes calculated. If the z score is 1, we can immediately move on to the next character. Otherwise, if the count is greater than 1, we initiate dp then on the next run by initiating a Boolean flag.

With dp, we reference the previous z scores to determine the one that follows next (rather than recalculating it). Even though there is a second loop, the progress is still linear because i is set back to where j made progress (j - 1). The caveat we have to check for is if z score we are referencing plus the current index exceeds the right bound of the z box, then we'd have to perform additional calculations to find the final z score. This means that the z score we are referencing is guaranteed to be the lower bound for this z score we want, but the moment it exceeds the right bound of the z box, then it could potentially mean that the first character after the bound can additionally match, so we have to check for that. We begin, however, at the distance from the string that is the minimum z score (lower bound). So again, the progress is still linear. Otherwise, if there is no complication, we can simply use the corresponding the z score.

Alternative version that tells us what is going on

int longest_sub(const string &text, int start_i, int end_i)
{
    int temp_itr = end_i;
    char start_char = text[start_i];
    char end_char = text[temp_itr];
    int z_count = 0;

    while (start_char == end_char
        && temp_itr < text.length()) {

        z_count++;
        start_i++;
        temp_itr++;

        start_char = text[start_i];
        end_char = text[temp_itr];
    }

    return z_count;
}

// with dp
vector<int> get_z_array_wdp(const string &text)
{
    cout << "Z-algorithm Start" << endl;
    vector<int> z_array;    
    z_array.reserve(text.length());

    bool dp_next_turn = false;
    cout << "Default z value 0 to first element" << endl;
    z_array.push_back(0);

    for (int i = 1; i < text.length(); i++) {

        if (dp_next_turn) {
            cout << "Attempting to copy previous z_values" << endl;
            int prev_z_i = *(z_array.end() - 1);

            int z_box_left = i - 1;
            int z_box_right = z_box_left + prev_z_i - 1;
            cout << "Z-box left: " << z_box_left << endl;
            cout << "Z-box-right: " << z_box_right << endl;

            int z_box_left_ref = 1;
            int z_box_right_ref = prev_z_i - 1;

            int amount_overlap = z_box_left - z_box_right_ref;
            if (amount_overlap < 0) { // there is overlap
                z_box_right -= abs(amount_overlap);
                cout << "Overlap detected, limiting z_box_right to: " << z_box_right << endl;
            }

            int j = i;
            for (; j < z_box_right && j < text.length(); j++) {
                int left_z_score_cpy = z_array[z_box_left_ref++];
                cout << "Referencing previous z_score: " << left_z_score_cpy << endl;
                if (left_z_score_cpy + j > z_box_right) {
                    // start new comparisons after bound
                    cout << "Reference z_score " << left_z_score_cpy << " + index " << j << " exceed z_box_right " << z_box_right << endl;
                    cout << "Finding longest_sub referencing start at " << left_z_score_cpy << " and referencing end at " << j + left_z_score_cpy << endl;
                    int z_count = longest_sub(text, left_z_score_cpy, j + left_z_score_cpy);
                    z_array.push_back(z_count + left_z_score_cpy);
                    cout << "Next longest sub at " << j << ": " << z_count + left_z_score_cpy << endl;
                }
                else {
                    cout << "Copying value at index " << z_box_left_ref << " : " << left_z_score_cpy << endl;
                    z_array.push_back(left_z_score_cpy);
                }
                print(z_array);
            }

            i = j - 1;
            dp_next_turn = false;
            cout << "DP next phase finished" << endl;
        }

        else {
            int z_count = longest_sub(text, 0, i);
            cout << "Next longest sub at index " << i << ": " << z_count << endl;

            if (z_count > 1) dp_next_turn = true;
            else dp_next_turn = false;

            z_array.push_back(z_count);
        }

        cout << "Updated Z-array" << endl;
        print(z_array);
        cout << endl << endl;
    }

    return z_array;
}

int main(int argc, char** argv)
{
    const string test2 = "abababbabababc";
    print(get_z_array_wdp(test2));

    getchar();
}

PreviousString Algorithms NextBoyer-Moore

Last updated 4 years ago