diff options
Diffstat (limited to 'gl/str-two-way.h')
-rw-r--r-- | gl/str-two-way.h | 38 |
1 files changed, 20 insertions, 18 deletions
diff --git a/gl/str-two-way.h b/gl/str-two-way.h index 4d555f9..707145d 100644 --- a/gl/str-two-way.h +++ b/gl/str-two-way.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Byte-wise substring search, using the Two-Way algorithm. | 1 | /* Byte-wise substring search, using the Two-Way algorithm. |
2 | Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. | 2 | Copyright (C) 2008-2013 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. | 3 | This file is part of the GNU C Library. |
4 | Written by Eric Blake <ebb9@byu.net>, 2008. | 4 | Written by Eric Blake <ebb9@byu.net>, 2008. |
5 | 5 | ||
@@ -14,8 +14,7 @@ | |||
14 | GNU General Public License for more details. | 14 | GNU General Public License for more details. |
15 | 15 | ||
16 | You should have received a copy of the GNU General Public License along | 16 | You should have received a copy of the GNU General Public License along |
17 | with this program; if not, write to the Free Software Foundation, | 17 | with this program; if not, see <http://www.gnu.org/licenses/>. */ |
18 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | ||
19 | 18 | ||
20 | /* Before including this file, you need to include <config.h> and | 19 | /* Before including this file, you need to include <config.h> and |
21 | <string.h>, and define: | 20 | <string.h>, and define: |
@@ -44,14 +43,15 @@ | |||
44 | #include <limits.h> | 43 | #include <limits.h> |
45 | #include <stdint.h> | 44 | #include <stdint.h> |
46 | 45 | ||
47 | /* We use the Two-Way string matching algorithm, which guarantees | 46 | /* We use the Two-Way string matching algorithm (also known as |
48 | linear complexity with constant space. Additionally, for long | 47 | Crochemore-Perrin), which guarantees linear complexity with |
49 | needles, we also use a bad character shift table similar to the | 48 | constant space. Additionally, for long needles, we also use a bad |
50 | Boyer-Moore algorithm to achieve improved (potentially sub-linear) | 49 | character shift table similar to the Boyer-Moore algorithm to |
51 | performance. | 50 | achieve improved (potentially sub-linear) performance. |
52 | 51 | ||
53 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 | 52 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, |
54 | and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm | 53 | http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, |
54 | http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf | ||
55 | */ | 55 | */ |
56 | 56 | ||
57 | /* Point at which computing a bad-byte shift table is likely to be | 57 | /* Point at which computing a bad-byte shift table is likely to be |
@@ -108,7 +108,7 @@ static size_t | |||
108 | critical_factorization (const unsigned char *needle, size_t needle_len, | 108 | critical_factorization (const unsigned char *needle, size_t needle_len, |
109 | size_t *period) | 109 | size_t *period) |
110 | { | 110 | { |
111 | /* Index of last byte of left half. */ | 111 | /* Index of last byte of left half, or SIZE_MAX. */ |
112 | size_t max_suffix, max_suffix_rev; | 112 | size_t max_suffix, max_suffix_rev; |
113 | size_t j; /* Index into NEEDLE for current candidate suffix. */ | 113 | size_t j; /* Index into NEEDLE for current candidate suffix. */ |
114 | size_t k; /* Offset into current period. */ | 114 | size_t k; /* Offset into current period. */ |
@@ -124,8 +124,8 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
124 | } | 124 | } |
125 | 125 | ||
126 | /* Invariants: | 126 | /* Invariants: |
127 | 1 <= j < NEEDLE_LEN - 1 | 127 | 0 <= j < NEEDLE_LEN - 1 |
128 | 0 <= max_suffix{,_rev} < j | 128 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
129 | min(max_suffix, max_suffix_rev) < global period of NEEDLE | 129 | min(max_suffix, max_suffix_rev) < global period of NEEDLE |
130 | 1 <= p <= global period of NEEDLE | 130 | 1 <= p <= global period of NEEDLE |
131 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] | 131 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
@@ -133,8 +133,9 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
133 | */ | 133 | */ |
134 | 134 | ||
135 | /* Perform lexicographic search. */ | 135 | /* Perform lexicographic search. */ |
136 | max_suffix = 0; | 136 | max_suffix = SIZE_MAX; |
137 | j = k = p = 1; | 137 | j = 0; |
138 | k = p = 1; | ||
138 | while (j + k < needle_len) | 139 | while (j + k < needle_len) |
139 | { | 140 | { |
140 | a = CANON_ELEMENT (needle[j + k]); | 141 | a = CANON_ELEMENT (needle[j + k]); |
@@ -167,8 +168,9 @@ critical_factorization (const unsigned char *needle, size_t needle_len, | |||
167 | *period = p; | 168 | *period = p; |
168 | 169 | ||
169 | /* Perform reverse lexicographic search. */ | 170 | /* Perform reverse lexicographic search. */ |
170 | max_suffix_rev = 0; | 171 | max_suffix_rev = SIZE_MAX; |
171 | j = k = p = 1; | 172 | j = 0; |
173 | k = p = 1; | ||
172 | while (j + k < needle_len) | 174 | while (j + k < needle_len) |
173 | { | 175 | { |
174 | a = CANON_ELEMENT (needle[j + k]); | 176 | a = CANON_ELEMENT (needle[j + k]); |
@@ -370,8 +372,8 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |||
370 | a byte out of place, there can be no match until | 372 | a byte out of place, there can be no match until |
371 | after the mismatch. */ | 373 | after the mismatch. */ |
372 | shift = needle_len - period; | 374 | shift = needle_len - period; |
373 | memory = 0; | ||
374 | } | 375 | } |
376 | memory = 0; | ||
375 | j += shift; | 377 | j += shift; |
376 | continue; | 378 | continue; |
377 | } | 379 | } |