summaryrefslogtreecommitdiffstats
path: root/gl/str-two-way.h
diff options
context:
space:
mode:
Diffstat (limited to 'gl/str-two-way.h')
-rw-r--r--gl/str-two-way.h59
1 files changed, 40 insertions, 19 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h
index c08f60e..4d555f9 100644
--- a/gl/str-two-way.h
+++ b/gl/str-two-way.h
@@ -95,26 +95,37 @@
95 A critical factorization has the property that the local period 95 A critical factorization has the property that the local period
96 equals the global period. All strings have at least one critical 96 equals the global period. All strings have at least one critical
97 factorization with the left half smaller than the global period. 97 factorization with the left half smaller than the global period.
98 And while some strings have more than one critical factorization,
99 it is provable that with an ordered alphabet, at least one of the
100 critical factorizations corresponds to a maximal suffix.
98 101
99 Given an ordered alphabet, a critical factorization can be computed 102 Given an ordered alphabet, a critical factorization can be computed
100 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the 103 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
101 larger of two ordered maximal suffixes. The ordered maximal 104 shorter of two ordered maximal suffixes. The ordered maximal
102 suffixes are determined by lexicographic comparison of 105 suffixes are determined by lexicographic comparison while tracking
103 periodicity. */ 106 periodicity. */
104static size_t 107static size_t
105critical_factorization (const unsigned char *needle, size_t needle_len, 108critical_factorization (const unsigned char *needle, size_t needle_len,
106 size_t *period) 109 size_t *period)
107{ 110{
108 /* Index of last byte of left half, or SIZE_MAX. */ 111 /* Index of last byte of left half. */
109 size_t max_suffix, max_suffix_rev; 112 size_t max_suffix, max_suffix_rev;
110 size_t j; /* Index into NEEDLE for current candidate suffix. */ 113 size_t j; /* Index into NEEDLE for current candidate suffix. */
111 size_t k; /* Offset into current period. */ 114 size_t k; /* Offset into current period. */
112 size_t p; /* Intermediate period. */ 115 size_t p; /* Intermediate period. */
113 unsigned char a, b; /* Current comparison bytes. */ 116 unsigned char a, b; /* Current comparison bytes. */
114 117
118 /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
119 out 0-length needles. */
120 if (needle_len < 3)
121 {
122 *period = 1;
123 return needle_len - 1;
124 }
125
115 /* Invariants: 126 /* Invariants:
116 0 <= j < NEEDLE_LEN - 1 127 1 <= j < NEEDLE_LEN - 1
117 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) 128 0 <= max_suffix{,_rev} < j
118 min(max_suffix, max_suffix_rev) < global period of NEEDLE 129 min(max_suffix, max_suffix_rev) < global period of NEEDLE
119 1 <= p <= global period of NEEDLE 130 1 <= p <= global period of NEEDLE
120 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] 131 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
@@ -122,9 +133,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
122 */ 133 */
123 134
124 /* Perform lexicographic search. */ 135 /* Perform lexicographic search. */
125 max_suffix = SIZE_MAX; 136 max_suffix = 0;
126 j = 0; 137 j = k = p = 1;
127 k = p = 1;
128 while (j + k < needle_len) 138 while (j + k < needle_len)
129 { 139 {
130 a = CANON_ELEMENT (needle[j + k]); 140 a = CANON_ELEMENT (needle[j + k]);
@@ -157,9 +167,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
157 *period = p; 167 *period = p;
158 168
159 /* Perform reverse lexicographic search. */ 169 /* Perform reverse lexicographic search. */
160 max_suffix_rev = SIZE_MAX; 170 max_suffix_rev = 0;
161 j = 0; 171 j = k = p = 1;
162 k = p = 1;
163 while (j + k < needle_len) 172 while (j + k < needle_len)
164 { 173 {
165 a = CANON_ELEMENT (needle[j + k]); 174 a = CANON_ELEMENT (needle[j + k]);
@@ -190,8 +199,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
190 } 199 }
191 } 200 }
192 201
193 /* Choose the longer suffix. Return the first byte of the right 202 /* Choose the shorter suffix. Return the index of the first byte of
194 half, rather than the last byte of the left half. */ 203 the right half, rather than the last byte of the left half.
204
205 For some examples, 'banana' has two critical factorizations, both
206 exposed by the two lexicographic extreme suffixes of 'anana' and
207 'nana', where both suffixes have a period of 2. On the other
208 hand, with 'aab' and 'bba', both strings have a single critical
209 factorization of the last byte, with the suffix having a period
210 of 1. While the maximal lexicographic suffix of 'aab' is 'b',
211 the maximal lexicographic suffix of 'bba' is 'ba', which is not a
212 critical factorization. Conversely, the maximal reverse
213 lexicographic suffix of 'a' works for 'bba', but not 'ab' for
214 'aab'. The shorter suffix of the two will always be a critical
215 factorization. */
195 if (max_suffix_rev + 1 < max_suffix + 1) 216 if (max_suffix_rev + 1 < max_suffix + 1)
196 return max_suffix + 1; 217 return max_suffix + 1;
197 *period = p; 218 *period = p;
@@ -226,9 +247,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
226 first. */ 247 first. */
227 if (CMP_FUNC (needle, needle + period, suffix) == 0) 248 if (CMP_FUNC (needle, needle + period, suffix) == 0)
228 { 249 {
229 /* Entire needle is periodic; a mismatch can only advance by the 250 /* Entire needle is periodic; a mismatch in the left half can
230 period, so use memory to avoid rescanning known occurrences 251 only advance by the period, so use memory to avoid rescanning
231 of the period. */ 252 known occurrences of the period in the right half. */
232 size_t memory = 0; 253 size_t memory = 0;
233 j = 0; 254 j = 0;
234 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 255 while (AVAILABLE (haystack, haystack_len, j, needle_len))
@@ -330,9 +351,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
330 first. */ 351 first. */
331 if (CMP_FUNC (needle, needle + period, suffix) == 0) 352 if (CMP_FUNC (needle, needle + period, suffix) == 0)
332 { 353 {
333 /* Entire needle is periodic; a mismatch can only advance by the 354 /* Entire needle is periodic; a mismatch in the left half can
334 period, so use memory to avoid rescanning known occurrences 355 only advance by the period, so use memory to avoid rescanning
335 of the period. */ 356 known occurrences of the period in the right half. */
336 size_t memory = 0; 357 size_t memory = 0;
337 size_t shift; 358 size_t shift;
338 j = 0; 359 j = 0;