diff options
Diffstat (limited to 'gl/str-two-way.h')
-rw-r--r-- | gl/str-two-way.h | 65 |
1 files changed, 44 insertions, 21 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h index c08f60e..707145d 100644 --- a/gl/str-two-way.h +++ b/gl/str-two-way.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Byte-wise substring search, using the Two-Way algorithm. | 1 | /* Byte-wise substring search, using the Two-Way algorithm. |
2 | Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. | 2 | Copyright (C) 2008-2013 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. | 3 | This file is part of the GNU C Library. |
4 | Written by Eric Blake <ebb9@byu.net>, 2008. | 4 | Written by Eric Blake <ebb9@byu.net>, 2008. |
5 | 5 | ||
@@ -14,8 +14,7 @@ | |||
14 | GNU General Public License for more details. | 14 | GNU General Public License for more details. |
15 | 15 | ||
16 | You should have received a copy of the GNU General Public License along | 16 | You should have received a copy of the GNU General Public License along |
17 | with this program; if not, write to the Free Software Foundation, | 17 | with this program; if not, see <http://www.gnu.org/licenses/>. */ |
18 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | ||
19 | 18 | ||
20 | /* Before including this file, you need to include <config.h> and | 19 | /* Before including this file, you need to include <config.h> and |
21 | <string.h>, and define: | 20 | <string.h>, and define: |
@@ -44,14 +43,15 @@ | |||
44 | #include <limits.h> | 43 | #include <limits.h> |
45 | #include <stdint.h> | 44 | #include <stdint.h> |
46 | 45 | ||
47 | /* We use the Two-Way string matching algorithm, which guarantees | 46 | /* We use the Two-Way string matching algorithm (also known as |
48 | linear complexity with constant space. Additionally, for long | 47 | Chrochemore-Perrin), which guarantees linear complexity with |
49 | needles, we also use a bad character shift table similar to the | 48 | constant space. Additionally, for long needles, we also use a bad |
50 | Boyer-Moore algorithm to achieve improved (potentially sub-linear) | 49 | character shift table similar to the Boyer-Moore algorithm to |
51 | performance. | 50 | achieve improved (potentially sub-linear) performance. |
52 | 51 | ||
53 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 | 52 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, |
54 | and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm | 53 | http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, |
54 | http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf | ||
55 | */ | 55 | */ |
56 | 56 | ||
57 | /* Point at which computing a bad-byte shift table is likely to be | 57 | /* Point at which computing a bad-byte shift table is likely to be |
@@ -95,11 +95,14 @@ | |||
95 | A critical factorization has the property that the local period | 95 | A critical factorization has the property that the local period |
96 | equals the global period. All strings have at least one critical | 96 | equals the global period. All strings have at least one critical |
97 | factorization with the left half smaller than the global period. | 97 | factorization with the left half smaller than the global period. |
98 | And while some strings have more than one critical factorization, | ||
99 | it is provable that with an ordered alphabet, at least one of the | ||
100 | critical factorizations corresponds to a maximal suffix. | ||
98 | 101 | ||
99 | Given an ordered alphabet, a critical factorization can be computed | 102 | Given an ordered alphabet, a critical factorization can be computed |
100 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the | 103 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
101 | larger of two ordered maximal suffixes. The ordered maximal | 104 | shorter of two ordered maximal suffixes. The ordered maximal |
102 | suffixes are determined by lexicographic comparison of | 105 | suffixes are determined by lexicographic comparison while tracking |
103 | periodicity. */ | 106 | periodicity. */ |
104 | static size_t | 107 | static size_t |
105 | critical_factorization (const unsigned char *needle, size_t needle_len, | 108 | critical_factorization (const unsigned char *needle, size_t needle_len, |
@@ -112,6 +115,14 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
112 | size_t p; /* Intermediate period. */ | 115 | size_t p; /* Intermediate period. */ |
113 | unsigned char a, b; /* Current comparison bytes. */ | 116 | unsigned char a, b; /* Current comparison bytes. */ |
114 | 117 | ||
118 | /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered | ||
119 | out 0-length needles. */ | ||
120 | if (needle_len < 3) | ||
121 | { | ||
122 | *period = 1; | ||
123 | return needle_len - 1; | ||
124 | } | ||
125 | |||
115 | /* Invariants: | 126 | /* Invariants: |
116 | 0 <= j < NEEDLE_LEN - 1 | 127 | 0 <= j < NEEDLE_LEN - 1 |
117 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) | 128 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
@@ -190,8 +201,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
190 | } | 201 | } |
191 | } | 202 | } |
192 | 203 | ||
193 | /* Choose the longer suffix. Return the first byte of the right | 204 | /* Choose the shorter suffix. Return the index of the first byte of |
194 | half, rather than the last byte of the left half. */ | 205 | the right half, rather than the last byte of the left half. |
206 | |||
207 | For some examples, 'banana' has two critical factorizations, both | ||
208 | exposed by the two lexicographic extreme suffixes of 'anana' and | ||
209 | 'nana', where both suffixes have a period of 2. On the other | ||
210 | hand, with 'aab' and 'bba', both strings have a single critical | ||
211 | factorization of the last byte, with the suffix having a period | ||
212 | of 1. While the maximal lexicographic suffix of 'aab' is 'b', | ||
213 | the maximal lexicographic suffix of 'bba' is 'ba', which is not a | ||
214 | critical factorization. Conversely, the maximal reverse | ||
215 | lexicographic suffix of 'a' works for 'bba', but not 'ab' for | ||
216 | 'aab'. The shorter suffix of the two will always be a critical | ||
217 | factorization. */ | ||
195 | if (max_suffix_rev + 1 < max_suffix + 1) | 218 | if (max_suffix_rev + 1 < max_suffix + 1) |
196 | return max_suffix + 1; | 219 | return max_suffix + 1; |
197 | *period = p; | 220 | *period = p; |
@@ -226,9 +249,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | |||
226 | first. */ | 249 | first. */ |
227 | if (CMP_FUNC (needle, needle + period, suffix) == 0) | 250 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
228 | { | 251 | { |
229 | /* Entire needle is periodic; a mismatch can only advance by the | 252 | /* Entire needle is periodic; a mismatch in the left half can |
230 | period, so use memory to avoid rescanning known occurrences | 253 | only advance by the period, so use memory to avoid rescanning |
231 | of the period. */ | 254 | known occurrences of the period in the right half. */ |
232 | size_t memory = 0; | 255 | size_t memory = 0; |
233 | j = 0; | 256 | j = 0; |
234 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 257 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
@@ -330,9 +353,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
330 | first. */ | 353 | first. */ |
331 | if (CMP_FUNC (needle, needle + period, suffix) == 0) | 354 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
332 | { | 355 | { |
333 | /* Entire needle is periodic; a mismatch can only advance by the | 356 | /* Entire needle is periodic; a mismatch in the left half can |
334 | period, so use memory to avoid rescanning known occurrences | 357 | only advance by the period, so use memory to avoid rescanning |
335 | of the period. */ | 358 | known occurrences of the period in the right half. */ |
336 | size_t memory = 0; | 359 | size_t memory = 0; |
337 | size_t shift; | 360 | size_t shift; |
338 | j = 0; | 361 | j = 0; |
@@ -349,8 +372,8 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
349 | a byte out of place, there can be no match until | 372 | a byte out of place, there can be no match until |
350 | after the mismatch. */ | 373 | after the mismatch. */ |
351 | shift = needle_len - period; | 374 | shift = needle_len - period; |
352 | memory = 0; | ||
353 | } | 375 | } |
376 | memory = 0; | ||
354 | j += shift; | 377 | j += shift; |
355 | continue; | 378 | continue; |
356 | } | 379 | } |