diff options
Diffstat (limited to 'gl/str-two-way.h')
-rw-r--r-- | gl/str-two-way.h | 59 |
1 files changed, 40 insertions, 19 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h index c08f60e..4d555f9 100644 --- a/gl/str-two-way.h +++ b/gl/str-two-way.h | |||
@@ -95,26 +95,37 @@ | |||
95 | A critical factorization has the property that the local period | 95 | A critical factorization has the property that the local period |
96 | equals the global period. All strings have at least one critical | 96 | equals the global period. All strings have at least one critical |
97 | factorization with the left half smaller than the global period. | 97 | factorization with the left half smaller than the global period. |
98 | And while some strings have more than one critical factorization, | ||
99 | it is provable that with an ordered alphabet, at least one of the | ||
100 | critical factorizations corresponds to a maximal suffix. | ||
98 | 101 | ||
99 | Given an ordered alphabet, a critical factorization can be computed | 102 | Given an ordered alphabet, a critical factorization can be computed |
100 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the | 103 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
101 | larger of two ordered maximal suffixes. The ordered maximal | 104 | shorter of two ordered maximal suffixes. The ordered maximal |
102 | suffixes are determined by lexicographic comparison of | 105 | suffixes are determined by lexicographic comparison while tracking |
103 | periodicity. */ | 106 | periodicity. */ |
104 | static size_t | 107 | static size_t |
105 | critical_factorization (const unsigned char *needle, size_t needle_len, | 108 | critical_factorization (const unsigned char *needle, size_t needle_len, |
106 | size_t *period) | 109 | size_t *period) |
107 | { | 110 | { |
108 | /* Index of last byte of left half, or SIZE_MAX. */ | 111 | /* Index of last byte of left half. */ |
109 | size_t max_suffix, max_suffix_rev; | 112 | size_t max_suffix, max_suffix_rev; |
110 | size_t j; /* Index into NEEDLE for current candidate suffix. */ | 113 | size_t j; /* Index into NEEDLE for current candidate suffix. */ |
111 | size_t k; /* Offset into current period. */ | 114 | size_t k; /* Offset into current period. */ |
112 | size_t p; /* Intermediate period. */ | 115 | size_t p; /* Intermediate period. */ |
113 | unsigned char a, b; /* Current comparison bytes. */ | 116 | unsigned char a, b; /* Current comparison bytes. */ |
114 | 117 | ||
118 | /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered | ||
119 | out 0-length needles. */ | ||
120 | if (needle_len < 3) | ||
121 | { | ||
122 | *period = 1; | ||
123 | return needle_len - 1; | ||
124 | } | ||
125 | |||
115 | /* Invariants: | 126 | /* Invariants: |
116 | 0 <= j < NEEDLE_LEN - 1 | 127 | 1 <= j < NEEDLE_LEN - 1 |
117 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) | 128 | 0 <= max_suffix{,_rev} < j |
118 | min(max_suffix, max_suffix_rev) < global period of NEEDLE | 129 | min(max_suffix, max_suffix_rev) < global period of NEEDLE |
119 | 1 <= p <= global period of NEEDLE | 130 | 1 <= p <= global period of NEEDLE |
120 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] | 131 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
@@ -122,9 +133,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
122 | */ | 133 | */ |
123 | 134 | ||
124 | /* Perform lexicographic search. */ | 135 | /* Perform lexicographic search. */ |
125 | max_suffix = SIZE_MAX; | 136 | max_suffix = 0; |
126 | j = 0; | 137 | j = k = p = 1; |
127 | k = p = 1; | ||
128 | while (j + k < needle_len) | 138 | while (j + k < needle_len) |
129 | { | 139 | { |
130 | a = CANON_ELEMENT (needle[j + k]); | 140 | a = CANON_ELEMENT (needle[j + k]); |
@@ -157,9 +167,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
157 | *period = p; | 167 | *period = p; |
158 | 168 | ||
159 | /* Perform reverse lexicographic search. */ | 169 | /* Perform reverse lexicographic search. */ |
160 | max_suffix_rev = SIZE_MAX; | 170 | max_suffix_rev = 0; |
161 | j = 0; | 171 | j = k = p = 1; |
162 | k = p = 1; | ||
163 | while (j + k < needle_len) | 172 | while (j + k < needle_len) |
164 | { | 173 | { |
165 | a = CANON_ELEMENT (needle[j + k]); | 174 | a = CANON_ELEMENT (needle[j + k]); |
@@ -190,8 +199,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
190 | } | 199 | } |
191 | } | 200 | } |
192 | 201 | ||
193 | /* Choose the longer suffix. Return the first byte of the right | 202 | /* Choose the shorter suffix. Return the index of the first byte of |
194 | half, rather than the last byte of the left half. */ | 203 | the right half, rather than the last byte of the left half. |
204 | |||
205 | For some examples, 'banana' has two critical factorizations, both | ||
206 | exposed by the two lexicographic extreme suffixes of 'anana' and | ||
207 | 'nana', where both suffixes have a period of 2. On the other | ||
208 | hand, with 'aab' and 'bba', both strings have a single critical | ||
209 | factorization of the last byte, with the suffix having a period | ||
210 | of 1. While the maximal lexicographic suffix of 'aab' is 'b', | ||
211 | the maximal lexicographic suffix of 'bba' is 'ba', which is not a | ||
212 | critical factorization. Conversely, the maximal reverse | ||
213 | lexicographic suffix of 'a' works for 'bba', but not 'ab' for | ||
214 | 'aab'. The shorter suffix of the two will always be a critical | ||
215 | factorization. */ | ||
195 | if (max_suffix_rev + 1 < max_suffix + 1) | 216 | if (max_suffix_rev + 1 < max_suffix + 1) |
196 | return max_suffix + 1; | 217 | return max_suffix + 1; |
197 | *period = p; | 218 | *period = p; |
@@ -226,9 +247,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | |||
226 | first. */ | 247 | first. */ |
227 | if (CMP_FUNC (needle, needle + period, suffix) == 0) | 248 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
228 | { | 249 | { |
229 | /* Entire needle is periodic; a mismatch can only advance by the | 250 | /* Entire needle is periodic; a mismatch in the left half can |
230 | period, so use memory to avoid rescanning known occurrences | 251 | only advance by the period, so use memory to avoid rescanning |
231 | of the period. */ | 252 | known occurrences of the period in the right half. */ |
232 | size_t memory = 0; | 253 | size_t memory = 0; |
233 | j = 0; | 254 | j = 0; |
234 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 255 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
@@ -330,9 +351,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
330 | first. */ | 351 | first. */ |
331 | if (CMP_FUNC (needle, needle + period, suffix) == 0) | 352 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
332 | { | 353 | { |
333 | /* Entire needle is periodic; a mismatch can only advance by the | 354 | /* Entire needle is periodic; a mismatch in the left half can |
334 | period, so use memory to avoid rescanning known occurrences | 355 | only advance by the period, so use memory to avoid rescanning |
335 | of the period. */ | 356 | known occurrences of the period in the right half. */ |
336 | size_t memory = 0; | 357 | size_t memory = 0; |
337 | size_t shift; | 358 | size_t shift; |
338 | j = 0; | 359 | j = 0; |