diff options
Diffstat (limited to 'lib/regex_internal.c')
-rw-r--r-- | lib/regex_internal.c | 1656 |
1 files changed, 1656 insertions, 0 deletions
diff --git a/lib/regex_internal.c b/lib/regex_internal.c new file mode 100644 index 0000000..ad618cf --- /dev/null +++ b/lib/regex_internal.c | |||
@@ -0,0 +1,1656 @@ | |||
1 | /* Extended regular expression matching and search library. | ||
2 | Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. | ||
3 | This file is part of the GNU C Library. | ||
4 | Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. | ||
5 | |||
6 | This program is free software; you can redistribute it and/or modify | ||
7 | it under the terms of the GNU General Public License as published by | ||
8 | the Free Software Foundation; either version 2, or (at your option) | ||
9 | any later version. | ||
10 | |||
11 | This program is distributed in the hope that it will be useful, | ||
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | GNU General Public License for more details. | ||
15 | |||
16 | You should have received a copy of the GNU General Public License along | ||
17 | with this program; if not, write to the Free Software Foundation, | ||
18 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | ||
19 | |||
20 | static void re_string_construct_common (const char *str, Idx len, | ||
21 | re_string_t *pstr, | ||
22 | REG_TRANSLATE_TYPE trans, bool icase, | ||
23 | const re_dfa_t *dfa) internal_function; | ||
24 | static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, | ||
25 | const re_node_set *nodes, | ||
26 | re_hashval_t hash) internal_function; | ||
27 | static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, | ||
28 | const re_node_set *nodes, | ||
29 | unsigned int context, | ||
30 | re_hashval_t hash) internal_function; | ||
31 | |||
32 | /* Functions for string operation. */ | ||
33 | |||
34 | /* This function allocate the buffers. It is necessary to call | ||
35 | re_string_reconstruct before using the object. */ | ||
36 | |||
37 | static reg_errcode_t | ||
38 | internal_function | ||
39 | re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, | ||
40 | REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) | ||
41 | { | ||
42 | reg_errcode_t ret; | ||
43 | Idx init_buf_len; | ||
44 | |||
45 | /* Ensure at least one character fits into the buffers. */ | ||
46 | if (init_len < dfa->mb_cur_max) | ||
47 | init_len = dfa->mb_cur_max; | ||
48 | init_buf_len = (len + 1 < init_len) ? len + 1: init_len; | ||
49 | re_string_construct_common (str, len, pstr, trans, icase, dfa); | ||
50 | |||
51 | ret = re_string_realloc_buffers (pstr, init_buf_len); | ||
52 | if (BE (ret != REG_NOERROR, 0)) | ||
53 | return ret; | ||
54 | |||
55 | pstr->word_char = dfa->word_char; | ||
56 | pstr->word_ops_used = dfa->word_ops_used; | ||
57 | pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; | ||
58 | pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; | ||
59 | pstr->valid_raw_len = pstr->valid_len; | ||
60 | return REG_NOERROR; | ||
61 | } | ||
62 | |||
63 | /* This function allocate the buffers, and initialize them. */ | ||
64 | |||
65 | static reg_errcode_t | ||
66 | internal_function | ||
67 | re_string_construct (re_string_t *pstr, const char *str, Idx len, | ||
68 | REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) | ||
69 | { | ||
70 | reg_errcode_t ret; | ||
71 | memset (pstr, '\0', sizeof (re_string_t)); | ||
72 | re_string_construct_common (str, len, pstr, trans, icase, dfa); | ||
73 | |||
74 | if (len > 0) | ||
75 | { | ||
76 | ret = re_string_realloc_buffers (pstr, len + 1); | ||
77 | if (BE (ret != REG_NOERROR, 0)) | ||
78 | return ret; | ||
79 | } | ||
80 | pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; | ||
81 | |||
82 | if (icase) | ||
83 | { | ||
84 | #ifdef RE_ENABLE_I18N | ||
85 | if (dfa->mb_cur_max > 1) | ||
86 | { | ||
87 | while (1) | ||
88 | { | ||
89 | ret = build_wcs_upper_buffer (pstr); | ||
90 | if (BE (ret != REG_NOERROR, 0)) | ||
91 | return ret; | ||
92 | if (pstr->valid_raw_len >= len) | ||
93 | break; | ||
94 | if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) | ||
95 | break; | ||
96 | ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); | ||
97 | if (BE (ret != REG_NOERROR, 0)) | ||
98 | return ret; | ||
99 | } | ||
100 | } | ||
101 | else | ||
102 | #endif /* RE_ENABLE_I18N */ | ||
103 | build_upper_buffer (pstr); | ||
104 | } | ||
105 | else | ||
106 | { | ||
107 | #ifdef RE_ENABLE_I18N | ||
108 | if (dfa->mb_cur_max > 1) | ||
109 | build_wcs_buffer (pstr); | ||
110 | else | ||
111 | #endif /* RE_ENABLE_I18N */ | ||
112 | { | ||
113 | if (trans != NULL) | ||
114 | re_string_translate_buffer (pstr); | ||
115 | else | ||
116 | { | ||
117 | pstr->valid_len = pstr->bufs_len; | ||
118 | pstr->valid_raw_len = pstr->bufs_len; | ||
119 | } | ||
120 | } | ||
121 | } | ||
122 | |||
123 | return REG_NOERROR; | ||
124 | } | ||
125 | |||
126 | /* Helper functions for re_string_allocate, and re_string_construct. */ | ||
127 | |||
128 | static reg_errcode_t | ||
129 | internal_function | ||
130 | re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) | ||
131 | { | ||
132 | #ifdef RE_ENABLE_I18N | ||
133 | if (pstr->mb_cur_max > 1) | ||
134 | { | ||
135 | wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len); | ||
136 | if (BE (new_wcs == NULL, 0)) | ||
137 | return REG_ESPACE; | ||
138 | pstr->wcs = new_wcs; | ||
139 | if (pstr->offsets != NULL) | ||
140 | { | ||
141 | Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len); | ||
142 | if (BE (new_offsets == NULL, 0)) | ||
143 | return REG_ESPACE; | ||
144 | pstr->offsets = new_offsets; | ||
145 | } | ||
146 | } | ||
147 | #endif /* RE_ENABLE_I18N */ | ||
148 | if (pstr->mbs_allocated) | ||
149 | { | ||
150 | unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, | ||
151 | new_buf_len); | ||
152 | if (BE (new_mbs == NULL, 0)) | ||
153 | return REG_ESPACE; | ||
154 | pstr->mbs = new_mbs; | ||
155 | } | ||
156 | pstr->bufs_len = new_buf_len; | ||
157 | return REG_NOERROR; | ||
158 | } | ||
159 | |||
160 | |||
161 | static void | ||
162 | internal_function | ||
163 | re_string_construct_common (const char *str, Idx len, re_string_t *pstr, | ||
164 | REG_TRANSLATE_TYPE trans, bool icase, | ||
165 | const re_dfa_t *dfa) | ||
166 | { | ||
167 | pstr->raw_mbs = (const unsigned char *) str; | ||
168 | pstr->len = len; | ||
169 | pstr->raw_len = len; | ||
170 | pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans; | ||
171 | pstr->icase = icase; | ||
172 | pstr->mbs_allocated = (trans != NULL || icase); | ||
173 | pstr->mb_cur_max = dfa->mb_cur_max; | ||
174 | pstr->is_utf8 = dfa->is_utf8; | ||
175 | pstr->map_notascii = dfa->map_notascii; | ||
176 | pstr->stop = pstr->len; | ||
177 | pstr->raw_stop = pstr->stop; | ||
178 | } | ||
179 | |||
180 | #ifdef RE_ENABLE_I18N | ||
181 | |||
182 | /* Build wide character buffer PSTR->WCS. | ||
183 | If the byte sequence of the string are: | ||
184 | <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> | ||
185 | Then wide character buffer will be: | ||
186 | <wc1> , WEOF , <wc2> , WEOF , <wc3> | ||
187 | We use WEOF for padding, they indicate that the position isn't | ||
188 | a first byte of a multibyte character. | ||
189 | |||
190 | Note that this function assumes PSTR->VALID_LEN elements are already | ||
191 | built and starts from PSTR->VALID_LEN. */ | ||
192 | |||
193 | static void | ||
194 | internal_function | ||
195 | build_wcs_buffer (re_string_t *pstr) | ||
196 | { | ||
197 | #ifdef _LIBC | ||
198 | unsigned char buf[MB_LEN_MAX]; | ||
199 | assert (MB_LEN_MAX >= pstr->mb_cur_max); | ||
200 | #else | ||
201 | unsigned char buf[64]; | ||
202 | #endif | ||
203 | mbstate_t prev_st; | ||
204 | Idx byte_idx, end_idx, remain_len; | ||
205 | size_t mbclen; | ||
206 | |||
207 | /* Build the buffers from pstr->valid_len to either pstr->len or | ||
208 | pstr->bufs_len. */ | ||
209 | end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; | ||
210 | for (byte_idx = pstr->valid_len; byte_idx < end_idx;) | ||
211 | { | ||
212 | wchar_t wc; | ||
213 | const char *p; | ||
214 | |||
215 | remain_len = end_idx - byte_idx; | ||
216 | prev_st = pstr->cur_state; | ||
217 | /* Apply the translation if we need. */ | ||
218 | if (BE (pstr->trans != NULL, 0)) | ||
219 | { | ||
220 | int i, ch; | ||
221 | |||
222 | for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) | ||
223 | { | ||
224 | ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; | ||
225 | buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; | ||
226 | } | ||
227 | p = (const char *) buf; | ||
228 | } | ||
229 | else | ||
230 | p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; | ||
231 | mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); | ||
232 | if (BE (mbclen == (size_t) -2, 0)) | ||
233 | { | ||
234 | /* The buffer doesn't have enough space, finish to build. */ | ||
235 | pstr->cur_state = prev_st; | ||
236 | break; | ||
237 | } | ||
238 | else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) | ||
239 | { | ||
240 | /* We treat these cases as a singlebyte character. */ | ||
241 | mbclen = 1; | ||
242 | wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; | ||
243 | if (BE (pstr->trans != NULL, 0)) | ||
244 | wc = pstr->trans[wc]; | ||
245 | pstr->cur_state = prev_st; | ||
246 | } | ||
247 | |||
248 | /* Write wide character and padding. */ | ||
249 | pstr->wcs[byte_idx++] = wc; | ||
250 | /* Write paddings. */ | ||
251 | for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) | ||
252 | pstr->wcs[byte_idx++] = WEOF; | ||
253 | } | ||
254 | pstr->valid_len = byte_idx; | ||
255 | pstr->valid_raw_len = byte_idx; | ||
256 | } | ||
257 | |||
258 | /* Build wide character buffer PSTR->WCS like build_wcs_buffer, | ||
259 | but for REG_ICASE. */ | ||
260 | |||
261 | static reg_errcode_t | ||
262 | internal_function | ||
263 | build_wcs_upper_buffer (re_string_t *pstr) | ||
264 | { | ||
265 | mbstate_t prev_st; | ||
266 | Idx src_idx, byte_idx, end_idx, remain_len; | ||
267 | size_t mbclen; | ||
268 | #ifdef _LIBC | ||
269 | char buf[MB_LEN_MAX]; | ||
270 | assert (MB_LEN_MAX >= pstr->mb_cur_max); | ||
271 | #else | ||
272 | char buf[64]; | ||
273 | #endif | ||
274 | |||
275 | byte_idx = pstr->valid_len; | ||
276 | end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; | ||
277 | |||
278 | /* The following optimization assumes that ASCII characters can be | ||
279 | mapped to wide characters with a simple cast. */ | ||
280 | if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) | ||
281 | { | ||
282 | while (byte_idx < end_idx) | ||
283 | { | ||
284 | wchar_t wc; | ||
285 | |||
286 | if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) | ||
287 | && mbsinit (&pstr->cur_state)) | ||
288 | { | ||
289 | /* In case of a singlebyte character. */ | ||
290 | pstr->mbs[byte_idx] | ||
291 | = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); | ||
292 | /* The next step uses the assumption that wchar_t is encoded | ||
293 | ASCII-safe: all ASCII values can be converted like this. */ | ||
294 | pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; | ||
295 | ++byte_idx; | ||
296 | continue; | ||
297 | } | ||
298 | |||
299 | remain_len = end_idx - byte_idx; | ||
300 | prev_st = pstr->cur_state; | ||
301 | mbclen = mbrtowc (&wc, | ||
302 | ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx | ||
303 | + byte_idx), remain_len, &pstr->cur_state); | ||
304 | if (BE ((size_t) (mbclen + 2) > 2, 1)) | ||
305 | { | ||
306 | wchar_t wcu = wc; | ||
307 | if (iswlower (wc)) | ||
308 | { | ||
309 | size_t mbcdlen; | ||
310 | |||
311 | wcu = towupper (wc); | ||
312 | mbcdlen = wcrtomb (buf, wcu, &prev_st); | ||
313 | if (BE (mbclen == mbcdlen, 1)) | ||
314 | memcpy (pstr->mbs + byte_idx, buf, mbclen); | ||
315 | else | ||
316 | { | ||
317 | src_idx = byte_idx; | ||
318 | goto offsets_needed; | ||
319 | } | ||
320 | } | ||
321 | else | ||
322 | memcpy (pstr->mbs + byte_idx, | ||
323 | pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); | ||
324 | pstr->wcs[byte_idx++] = wcu; | ||
325 | /* Write paddings. */ | ||
326 | for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) | ||
327 | pstr->wcs[byte_idx++] = WEOF; | ||
328 | } | ||
329 | else if (mbclen == (size_t) -1 || mbclen == 0) | ||
330 | { | ||
331 | /* It is an invalid character or '\0'. Just use the byte. */ | ||
332 | int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; | ||
333 | pstr->mbs[byte_idx] = ch; | ||
334 | /* And also cast it to wide char. */ | ||
335 | pstr->wcs[byte_idx++] = (wchar_t) ch; | ||
336 | if (BE (mbclen == (size_t) -1, 0)) | ||
337 | pstr->cur_state = prev_st; | ||
338 | } | ||
339 | else | ||
340 | { | ||
341 | /* The buffer doesn't have enough space, finish to build. */ | ||
342 | pstr->cur_state = prev_st; | ||
343 | break; | ||
344 | } | ||
345 | } | ||
346 | pstr->valid_len = byte_idx; | ||
347 | pstr->valid_raw_len = byte_idx; | ||
348 | return REG_NOERROR; | ||
349 | } | ||
350 | else | ||
351 | for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) | ||
352 | { | ||
353 | wchar_t wc; | ||
354 | const char *p; | ||
355 | offsets_needed: | ||
356 | remain_len = end_idx - byte_idx; | ||
357 | prev_st = pstr->cur_state; | ||
358 | if (BE (pstr->trans != NULL, 0)) | ||
359 | { | ||
360 | int i, ch; | ||
361 | |||
362 | for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) | ||
363 | { | ||
364 | ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; | ||
365 | buf[i] = pstr->trans[ch]; | ||
366 | } | ||
367 | p = (const char *) buf; | ||
368 | } | ||
369 | else | ||
370 | p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; | ||
371 | mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); | ||
372 | if (BE ((size_t) (mbclen + 2) > 2, 1)) | ||
373 | { | ||
374 | wchar_t wcu = wc; | ||
375 | if (iswlower (wc)) | ||
376 | { | ||
377 | size_t mbcdlen; | ||
378 | |||
379 | wcu = towupper (wc); | ||
380 | mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); | ||
381 | if (BE (mbclen == mbcdlen, 1)) | ||
382 | memcpy (pstr->mbs + byte_idx, buf, mbclen); | ||
383 | else if (mbcdlen != (size_t) -1) | ||
384 | { | ||
385 | size_t i; | ||
386 | |||
387 | if (byte_idx + mbcdlen > pstr->bufs_len) | ||
388 | { | ||
389 | pstr->cur_state = prev_st; | ||
390 | break; | ||
391 | } | ||
392 | |||
393 | if (pstr->offsets == NULL) | ||
394 | { | ||
395 | pstr->offsets = re_xmalloc (Idx, pstr->bufs_len); | ||
396 | |||
397 | if (pstr->offsets == NULL) | ||
398 | return REG_ESPACE; | ||
399 | } | ||
400 | if (!pstr->offsets_needed) | ||
401 | { | ||
402 | for (i = 0; i < (size_t) byte_idx; ++i) | ||
403 | pstr->offsets[i] = i; | ||
404 | pstr->offsets_needed = 1; | ||
405 | } | ||
406 | |||
407 | memcpy (pstr->mbs + byte_idx, buf, mbcdlen); | ||
408 | pstr->wcs[byte_idx] = wcu; | ||
409 | pstr->offsets[byte_idx] = src_idx; | ||
410 | for (i = 1; i < mbcdlen; ++i) | ||
411 | { | ||
412 | pstr->offsets[byte_idx + i] | ||
413 | = src_idx + (i < mbclen ? i : mbclen - 1); | ||
414 | pstr->wcs[byte_idx + i] = WEOF; | ||
415 | } | ||
416 | pstr->len += mbcdlen - mbclen; | ||
417 | if (pstr->raw_stop > src_idx) | ||
418 | pstr->stop += mbcdlen - mbclen; | ||
419 | end_idx = (pstr->bufs_len > pstr->len) | ||
420 | ? pstr->len : pstr->bufs_len; | ||
421 | byte_idx += mbcdlen; | ||
422 | src_idx += mbclen; | ||
423 | continue; | ||
424 | } | ||
425 | else | ||
426 | memcpy (pstr->mbs + byte_idx, p, mbclen); | ||
427 | } | ||
428 | else | ||
429 | memcpy (pstr->mbs + byte_idx, p, mbclen); | ||
430 | |||
431 | if (BE (pstr->offsets_needed != 0, 0)) | ||
432 | { | ||
433 | size_t i; | ||
434 | for (i = 0; i < mbclen; ++i) | ||
435 | pstr->offsets[byte_idx + i] = src_idx + i; | ||
436 | } | ||
437 | src_idx += mbclen; | ||
438 | |||
439 | pstr->wcs[byte_idx++] = wcu; | ||
440 | /* Write paddings. */ | ||
441 | for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) | ||
442 | pstr->wcs[byte_idx++] = WEOF; | ||
443 | } | ||
444 | else if (mbclen == (size_t) -1 || mbclen == 0) | ||
445 | { | ||
446 | /* It is an invalid character or '\0'. Just use the byte. */ | ||
447 | int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; | ||
448 | |||
449 | if (BE (pstr->trans != NULL, 0)) | ||
450 | ch = pstr->trans [ch]; | ||
451 | pstr->mbs[byte_idx] = ch; | ||
452 | |||
453 | if (BE (pstr->offsets_needed != 0, 0)) | ||
454 | pstr->offsets[byte_idx] = src_idx; | ||
455 | ++src_idx; | ||
456 | |||
457 | /* And also cast it to wide char. */ | ||
458 | pstr->wcs[byte_idx++] = (wchar_t) ch; | ||
459 | if (BE (mbclen == (size_t) -1, 0)) | ||
460 | pstr->cur_state = prev_st; | ||
461 | } | ||
462 | else | ||
463 | { | ||
464 | /* The buffer doesn't have enough space, finish to build. */ | ||
465 | pstr->cur_state = prev_st; | ||
466 | break; | ||
467 | } | ||
468 | } | ||
469 | pstr->valid_len = byte_idx; | ||
470 | pstr->valid_raw_len = src_idx; | ||
471 | return REG_NOERROR; | ||
472 | } | ||
473 | |||
474 | /* Skip characters until the index becomes greater than NEW_RAW_IDX. | ||
475 | Return the index. */ | ||
476 | |||
477 | static Idx | ||
478 | internal_function | ||
479 | re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) | ||
480 | { | ||
481 | mbstate_t prev_st; | ||
482 | Idx rawbuf_idx; | ||
483 | size_t mbclen; | ||
484 | wchar_t wc = 0; | ||
485 | |||
486 | /* Skip the characters which are not necessary to check. */ | ||
487 | for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; | ||
488 | rawbuf_idx < new_raw_idx;) | ||
489 | { | ||
490 | Idx remain_len; | ||
491 | remain_len = pstr->len - rawbuf_idx; | ||
492 | prev_st = pstr->cur_state; | ||
493 | mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx, | ||
494 | remain_len, &pstr->cur_state); | ||
495 | if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) | ||
496 | { | ||
497 | /* We treat these cases as a singlebyte character. */ | ||
498 | mbclen = 1; | ||
499 | pstr->cur_state = prev_st; | ||
500 | } | ||
501 | /* Then proceed the next character. */ | ||
502 | rawbuf_idx += mbclen; | ||
503 | } | ||
504 | *last_wc = (wint_t) wc; | ||
505 | return rawbuf_idx; | ||
506 | } | ||
507 | #endif /* RE_ENABLE_I18N */ | ||
508 | |||
509 | /* Build the buffer PSTR->MBS, and apply the translation if we need. | ||
510 | This function is used in case of REG_ICASE. */ | ||
511 | |||
512 | static void | ||
513 | internal_function | ||
514 | build_upper_buffer (re_string_t *pstr) | ||
515 | { | ||
516 | Idx char_idx, end_idx; | ||
517 | end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; | ||
518 | |||
519 | for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) | ||
520 | { | ||
521 | int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; | ||
522 | if (BE (pstr->trans != NULL, 0)) | ||
523 | ch = pstr->trans[ch]; | ||
524 | if (islower (ch)) | ||
525 | pstr->mbs[char_idx] = toupper (ch); | ||
526 | else | ||
527 | pstr->mbs[char_idx] = ch; | ||
528 | } | ||
529 | pstr->valid_len = char_idx; | ||
530 | pstr->valid_raw_len = char_idx; | ||
531 | } | ||
532 | |||
533 | /* Apply TRANS to the buffer in PSTR. */ | ||
534 | |||
535 | static void | ||
536 | internal_function | ||
537 | re_string_translate_buffer (re_string_t *pstr) | ||
538 | { | ||
539 | Idx buf_idx, end_idx; | ||
540 | end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; | ||
541 | |||
542 | for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) | ||
543 | { | ||
544 | int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; | ||
545 | pstr->mbs[buf_idx] = pstr->trans[ch]; | ||
546 | } | ||
547 | |||
548 | pstr->valid_len = buf_idx; | ||
549 | pstr->valid_raw_len = buf_idx; | ||
550 | } | ||
551 | |||
552 | /* This function re-construct the buffers. | ||
553 | Concretely, convert to wide character in case of pstr->mb_cur_max > 1, | ||
554 | convert to upper case in case of REG_ICASE, apply translation. */ | ||
555 | |||
556 | static reg_errcode_t | ||
557 | internal_function | ||
558 | re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) | ||
559 | { | ||
560 | Idx offset; | ||
561 | |||
562 | if (BE (pstr->raw_mbs_idx <= idx, 0)) | ||
563 | offset = idx - pstr->raw_mbs_idx; | ||
564 | else | ||
565 | { | ||
566 | /* Reset buffer. */ | ||
567 | #ifdef RE_ENABLE_I18N | ||
568 | if (pstr->mb_cur_max > 1) | ||
569 | memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); | ||
570 | #endif /* RE_ENABLE_I18N */ | ||
571 | pstr->len = pstr->raw_len; | ||
572 | pstr->stop = pstr->raw_stop; | ||
573 | pstr->valid_len = 0; | ||
574 | pstr->raw_mbs_idx = 0; | ||
575 | pstr->valid_raw_len = 0; | ||
576 | pstr->offsets_needed = 0; | ||
577 | pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF | ||
578 | : CONTEXT_NEWLINE | CONTEXT_BEGBUF); | ||
579 | if (!pstr->mbs_allocated) | ||
580 | pstr->mbs = (unsigned char *) pstr->raw_mbs; | ||
581 | offset = idx; | ||
582 | } | ||
583 | |||
584 | if (BE (offset != 0, 1)) | ||
585 | { | ||
586 | /* Are the characters which are already checked remain? */ | ||
587 | if (BE (offset < pstr->valid_raw_len, 1) | ||
588 | #ifdef RE_ENABLE_I18N | ||
589 | /* Handling this would enlarge the code too much. | ||
590 | Accept a slowdown in that case. */ | ||
591 | && pstr->offsets_needed == 0 | ||
592 | #endif | ||
593 | ) | ||
594 | { | ||
595 | /* Yes, move them to the front of the buffer. */ | ||
596 | pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags); | ||
597 | #ifdef RE_ENABLE_I18N | ||
598 | if (pstr->mb_cur_max > 1) | ||
599 | memmove (pstr->wcs, pstr->wcs + offset, | ||
600 | (pstr->valid_len - offset) * sizeof (wint_t)); | ||
601 | #endif /* RE_ENABLE_I18N */ | ||
602 | if (BE (pstr->mbs_allocated, 0)) | ||
603 | memmove (pstr->mbs, pstr->mbs + offset, | ||
604 | pstr->valid_len - offset); | ||
605 | pstr->valid_len -= offset; | ||
606 | pstr->valid_raw_len -= offset; | ||
607 | #if DEBUG | ||
608 | assert (pstr->valid_len > 0); | ||
609 | #endif | ||
610 | } | ||
611 | else | ||
612 | { | ||
613 | /* No, skip all characters until IDX. */ | ||
614 | #ifdef RE_ENABLE_I18N | ||
615 | if (BE (pstr->offsets_needed, 0)) | ||
616 | { | ||
617 | pstr->len = pstr->raw_len - idx + offset; | ||
618 | pstr->stop = pstr->raw_stop - idx + offset; | ||
619 | pstr->offsets_needed = 0; | ||
620 | } | ||
621 | #endif | ||
622 | pstr->valid_len = 0; | ||
623 | pstr->valid_raw_len = 0; | ||
624 | #ifdef RE_ENABLE_I18N | ||
625 | if (pstr->mb_cur_max > 1) | ||
626 | { | ||
627 | Idx wcs_idx; | ||
628 | wint_t wc = WEOF; | ||
629 | |||
630 | if (pstr->is_utf8) | ||
631 | { | ||
632 | const unsigned char *raw, *p, *q, *end; | ||
633 | |||
634 | /* Special case UTF-8. Multi-byte chars start with any | ||
635 | byte other than 0x80 - 0xbf. */ | ||
636 | raw = pstr->raw_mbs + pstr->raw_mbs_idx; | ||
637 | end = raw + (offset - pstr->mb_cur_max); | ||
638 | for (p = raw + offset - 1; p >= end; --p) | ||
639 | if ((*p & 0xc0) != 0x80) | ||
640 | { | ||
641 | mbstate_t cur_state; | ||
642 | wchar_t wc2; | ||
643 | Idx mlen = raw + pstr->len - p; | ||
644 | unsigned char buf[6]; | ||
645 | size_t mbclen; | ||
646 | |||
647 | q = p; | ||
648 | if (BE (pstr->trans != NULL, 0)) | ||
649 | { | ||
650 | int i = mlen < 6 ? mlen : 6; | ||
651 | while (--i >= 0) | ||
652 | buf[i] = pstr->trans[p[i]]; | ||
653 | q = buf; | ||
654 | } | ||
655 | /* XXX Don't use mbrtowc, we know which conversion | ||
656 | to use (UTF-8 -> UCS4). */ | ||
657 | memset (&cur_state, 0, sizeof (cur_state)); | ||
658 | mbclen = mbrtowc (&wc2, (const char *) p, mlen, | ||
659 | &cur_state); | ||
660 | if (raw + offset - p <= mbclen && mbclen < (size_t) -2) | ||
661 | { | ||
662 | memset (&pstr->cur_state, '\0', | ||
663 | sizeof (mbstate_t)); | ||
664 | pstr->valid_len = mbclen - (raw + offset - p); | ||
665 | wc = wc2; | ||
666 | } | ||
667 | break; | ||
668 | } | ||
669 | } | ||
670 | |||
671 | if (wc == WEOF) | ||
672 | pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; | ||
673 | if (BE (pstr->valid_len, 0)) | ||
674 | { | ||
675 | for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) | ||
676 | pstr->wcs[wcs_idx] = WEOF; | ||
677 | if (pstr->mbs_allocated) | ||
678 | memset (pstr->mbs, -1, pstr->valid_len); | ||
679 | } | ||
680 | pstr->valid_raw_len = pstr->valid_len; | ||
681 | pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) | ||
682 | && IS_WIDE_WORD_CHAR (wc)) | ||
683 | ? CONTEXT_WORD | ||
684 | : ((IS_WIDE_NEWLINE (wc) | ||
685 | && pstr->newline_anchor) | ||
686 | ? CONTEXT_NEWLINE : 0)); | ||
687 | } | ||
688 | else | ||
689 | #endif /* RE_ENABLE_I18N */ | ||
690 | { | ||
691 | int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; | ||
692 | if (pstr->trans) | ||
693 | c = pstr->trans[c]; | ||
694 | pstr->tip_context = (bitset_contain (pstr->word_char, c) | ||
695 | ? CONTEXT_WORD | ||
696 | : ((IS_NEWLINE (c) && pstr->newline_anchor) | ||
697 | ? CONTEXT_NEWLINE : 0)); | ||
698 | } | ||
699 | } | ||
700 | if (!BE (pstr->mbs_allocated, 0)) | ||
701 | pstr->mbs += offset; | ||
702 | } | ||
703 | pstr->raw_mbs_idx = idx; | ||
704 | pstr->len -= offset; | ||
705 | pstr->stop -= offset; | ||
706 | |||
707 | /* Then build the buffers. */ | ||
708 | #ifdef RE_ENABLE_I18N | ||
709 | if (pstr->mb_cur_max > 1) | ||
710 | { | ||
711 | if (pstr->icase) | ||
712 | { | ||
713 | reg_errcode_t ret = build_wcs_upper_buffer (pstr); | ||
714 | if (BE (ret != REG_NOERROR, 0)) | ||
715 | return ret; | ||
716 | } | ||
717 | else | ||
718 | build_wcs_buffer (pstr); | ||
719 | } | ||
720 | else | ||
721 | #endif /* RE_ENABLE_I18N */ | ||
722 | if (BE (pstr->mbs_allocated, 0)) | ||
723 | { | ||
724 | if (pstr->icase) | ||
725 | build_upper_buffer (pstr); | ||
726 | else if (pstr->trans != NULL) | ||
727 | re_string_translate_buffer (pstr); | ||
728 | } | ||
729 | else | ||
730 | pstr->valid_len = pstr->len; | ||
731 | |||
732 | pstr->cur_idx = 0; | ||
733 | return REG_NOERROR; | ||
734 | } | ||
735 | |||
736 | static unsigned char | ||
737 | internal_function __attribute ((pure)) | ||
738 | re_string_peek_byte_case (const re_string_t *pstr, Idx idx) | ||
739 | { | ||
740 | int ch; | ||
741 | Idx off; | ||
742 | |||
743 | /* Handle the common (easiest) cases first. */ | ||
744 | if (BE (!pstr->mbs_allocated, 1)) | ||
745 | return re_string_peek_byte (pstr, idx); | ||
746 | |||
747 | #ifdef RE_ENABLE_I18N | ||
748 | if (pstr->mb_cur_max > 1 | ||
749 | && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) | ||
750 | return re_string_peek_byte (pstr, idx); | ||
751 | #endif | ||
752 | |||
753 | off = pstr->cur_idx + idx; | ||
754 | #ifdef RE_ENABLE_I18N | ||
755 | if (pstr->offsets_needed) | ||
756 | off = pstr->offsets[off]; | ||
757 | #endif | ||
758 | |||
759 | ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; | ||
760 | |||
761 | #ifdef RE_ENABLE_I18N | ||
762 | /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I | ||
763 | this function returns CAPITAL LETTER I instead of first byte of | ||
764 | DOTLESS SMALL LETTER I. The latter would confuse the parser, | ||
765 | since peek_byte_case doesn't advance cur_idx in any way. */ | ||
766 | if (pstr->offsets_needed && !isascii (ch)) | ||
767 | return re_string_peek_byte (pstr, idx); | ||
768 | #endif | ||
769 | |||
770 | return ch; | ||
771 | } | ||
772 | |||
773 | static unsigned char | ||
774 | internal_function __attribute ((pure)) | ||
775 | re_string_fetch_byte_case (re_string_t *pstr) | ||
776 | { | ||
777 | if (BE (!pstr->mbs_allocated, 1)) | ||
778 | return re_string_fetch_byte (pstr); | ||
779 | |||
780 | #ifdef RE_ENABLE_I18N | ||
781 | if (pstr->offsets_needed) | ||
782 | { | ||
783 | Idx off; | ||
784 | int ch; | ||
785 | |||
786 | /* For tr_TR.UTF-8 [[:islower:]] there is | ||
787 | [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip | ||
788 | in that case the whole multi-byte character and return | ||
789 | the original letter. On the other side, with | ||
790 | [[: DOTLESS SMALL LETTER I return [[:I, as doing | ||
791 | anything else would complicate things too much. */ | ||
792 | |||
793 | if (!re_string_first_byte (pstr, pstr->cur_idx)) | ||
794 | return re_string_fetch_byte (pstr); | ||
795 | |||
796 | off = pstr->offsets[pstr->cur_idx]; | ||
797 | ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; | ||
798 | |||
799 | if (! isascii (ch)) | ||
800 | return re_string_fetch_byte (pstr); | ||
801 | |||
802 | re_string_skip_bytes (pstr, | ||
803 | re_string_char_size_at (pstr, pstr->cur_idx)); | ||
804 | return ch; | ||
805 | } | ||
806 | #endif | ||
807 | |||
808 | return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; | ||
809 | } | ||
810 | |||
811 | static void | ||
812 | internal_function | ||
813 | re_string_destruct (re_string_t *pstr) | ||
814 | { | ||
815 | #ifdef RE_ENABLE_I18N | ||
816 | re_free (pstr->wcs); | ||
817 | re_free (pstr->offsets); | ||
818 | #endif /* RE_ENABLE_I18N */ | ||
819 | if (pstr->mbs_allocated) | ||
820 | re_free (pstr->mbs); | ||
821 | } | ||
822 | |||
823 | /* Return the context at IDX in INPUT. */ | ||
824 | |||
825 | static unsigned int | ||
826 | internal_function | ||
827 | re_string_context_at (const re_string_t *input, Idx idx, int eflags) | ||
828 | { | ||
829 | int c; | ||
830 | if (BE (! REG_VALID_INDEX (idx), 0)) | ||
831 | /* In this case, we use the value stored in input->tip_context, | ||
832 | since we can't know the character in input->mbs[-1] here. */ | ||
833 | return input->tip_context; | ||
834 | if (BE (idx == input->len, 0)) | ||
835 | return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF | ||
836 | : CONTEXT_NEWLINE | CONTEXT_ENDBUF); | ||
837 | #ifdef RE_ENABLE_I18N | ||
838 | if (input->mb_cur_max > 1) | ||
839 | { | ||
840 | wint_t wc; | ||
841 | Idx wc_idx = idx; | ||
842 | while(input->wcs[wc_idx] == WEOF) | ||
843 | { | ||
844 | #ifdef DEBUG | ||
845 | /* It must not happen. */ | ||
846 | assert (REG_VALID_INDEX (wc_idx)); | ||
847 | #endif | ||
848 | --wc_idx; | ||
849 | if (! REG_VALID_INDEX (wc_idx)) | ||
850 | return input->tip_context; | ||
851 | } | ||
852 | wc = input->wcs[wc_idx]; | ||
853 | if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) | ||
854 | return CONTEXT_WORD; | ||
855 | return (IS_WIDE_NEWLINE (wc) && input->newline_anchor | ||
856 | ? CONTEXT_NEWLINE : 0); | ||
857 | } | ||
858 | else | ||
859 | #endif | ||
860 | { | ||
861 | c = re_string_byte_at (input, idx); | ||
862 | if (bitset_contain (input->word_char, c)) | ||
863 | return CONTEXT_WORD; | ||
864 | return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; | ||
865 | } | ||
866 | } | ||
867 | |||
868 | /* Functions for set operation. */ | ||
869 | |||
870 | static reg_errcode_t | ||
871 | internal_function | ||
872 | re_node_set_alloc (re_node_set *set, Idx size) | ||
873 | { | ||
874 | set->alloc = size; | ||
875 | set->nelem = 0; | ||
876 | set->elems = re_xmalloc (Idx, size); | ||
877 | if (BE (set->elems == NULL, 0)) | ||
878 | return REG_ESPACE; | ||
879 | return REG_NOERROR; | ||
880 | } | ||
881 | |||
882 | static reg_errcode_t | ||
883 | internal_function | ||
884 | re_node_set_init_1 (re_node_set *set, Idx elem) | ||
885 | { | ||
886 | set->alloc = 1; | ||
887 | set->nelem = 1; | ||
888 | set->elems = re_malloc (Idx, 1); | ||
889 | if (BE (set->elems == NULL, 0)) | ||
890 | { | ||
891 | set->alloc = set->nelem = 0; | ||
892 | return REG_ESPACE; | ||
893 | } | ||
894 | set->elems[0] = elem; | ||
895 | return REG_NOERROR; | ||
896 | } | ||
897 | |||
898 | static reg_errcode_t | ||
899 | internal_function | ||
900 | re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) | ||
901 | { | ||
902 | set->alloc = 2; | ||
903 | set->elems = re_malloc (Idx, 2); | ||
904 | if (BE (set->elems == NULL, 0)) | ||
905 | return REG_ESPACE; | ||
906 | if (elem1 == elem2) | ||
907 | { | ||
908 | set->nelem = 1; | ||
909 | set->elems[0] = elem1; | ||
910 | } | ||
911 | else | ||
912 | { | ||
913 | set->nelem = 2; | ||
914 | if (elem1 < elem2) | ||
915 | { | ||
916 | set->elems[0] = elem1; | ||
917 | set->elems[1] = elem2; | ||
918 | } | ||
919 | else | ||
920 | { | ||
921 | set->elems[0] = elem2; | ||
922 | set->elems[1] = elem1; | ||
923 | } | ||
924 | } | ||
925 | return REG_NOERROR; | ||
926 | } | ||
927 | |||
928 | static reg_errcode_t | ||
929 | internal_function | ||
930 | re_node_set_init_copy (re_node_set *dest, const re_node_set *src) | ||
931 | { | ||
932 | dest->nelem = src->nelem; | ||
933 | if (src->nelem > 0) | ||
934 | { | ||
935 | dest->alloc = dest->nelem; | ||
936 | dest->elems = re_malloc (Idx, dest->alloc); | ||
937 | if (BE (dest->elems == NULL, 0)) | ||
938 | { | ||
939 | dest->alloc = dest->nelem = 0; | ||
940 | return REG_ESPACE; | ||
941 | } | ||
942 | memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]); | ||
943 | } | ||
944 | else | ||
945 | re_node_set_init_empty (dest); | ||
946 | return REG_NOERROR; | ||
947 | } | ||
948 | |||
949 | /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to | ||
950 | DEST. Return value indicate the error code or REG_NOERROR if succeeded. | ||
951 | Note: We assume dest->elems is NULL, when dest->alloc is 0. */ | ||
952 | |||
953 | static reg_errcode_t | ||
954 | internal_function | ||
955 | re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, | ||
956 | const re_node_set *src2) | ||
957 | { | ||
958 | Idx i1, i2, is, id, delta, sbase; | ||
959 | if (src1->nelem == 0 || src2->nelem == 0) | ||
960 | return REG_NOERROR; | ||
961 | |||
962 | /* We need dest->nelem + 2 * elems_in_intersection; this is a | ||
963 | conservative estimate. */ | ||
964 | if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) | ||
965 | { | ||
966 | Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; | ||
967 | Idx *new_elems; | ||
968 | if (sizeof (Idx) < 3 | ||
969 | && (new_alloc < dest->alloc | ||
970 | || ((Idx) (src1->nelem + src2->nelem) < src1->nelem))) | ||
971 | return REG_ESPACE; | ||
972 | new_elems = re_xrealloc (dest->elems, Idx, new_alloc); | ||
973 | if (BE (new_elems == NULL, 0)) | ||
974 | return REG_ESPACE; | ||
975 | dest->elems = new_elems; | ||
976 | dest->alloc = new_alloc; | ||
977 | } | ||
978 | |||
979 | /* Find the items in the intersection of SRC1 and SRC2, and copy | ||
980 | into the top of DEST those that are not already in DEST itself. */ | ||
981 | sbase = dest->nelem + src1->nelem + src2->nelem; | ||
982 | i1 = src1->nelem - 1; | ||
983 | i2 = src2->nelem - 1; | ||
984 | id = dest->nelem - 1; | ||
985 | for (;;) | ||
986 | { | ||
987 | if (src1->elems[i1] == src2->elems[i2]) | ||
988 | { | ||
989 | /* Try to find the item in DEST. Maybe we could binary search? */ | ||
990 | while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) | ||
991 | --id; | ||
992 | |||
993 | if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) | ||
994 | dest->elems[--sbase] = src1->elems[i1]; | ||
995 | |||
996 | if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) | ||
997 | break; | ||
998 | } | ||
999 | |||
1000 | /* Lower the highest of the two items. */ | ||
1001 | else if (src1->elems[i1] < src2->elems[i2]) | ||
1002 | { | ||
1003 | if (! REG_VALID_INDEX (--i2)) | ||
1004 | break; | ||
1005 | } | ||
1006 | else | ||
1007 | { | ||
1008 | if (! REG_VALID_INDEX (--i1)) | ||
1009 | break; | ||
1010 | } | ||
1011 | } | ||
1012 | |||
1013 | id = dest->nelem - 1; | ||
1014 | is = dest->nelem + src1->nelem + src2->nelem - 1; | ||
1015 | delta = is - sbase + 1; | ||
1016 | |||
1017 | /* Now copy. When DELTA becomes zero, the remaining | ||
1018 | DEST elements are already in place; this is more or | ||
1019 | less the same loop that is in re_node_set_merge. */ | ||
1020 | dest->nelem += delta; | ||
1021 | if (delta > 0 && REG_VALID_INDEX (id)) | ||
1022 | for (;;) | ||
1023 | { | ||
1024 | if (dest->elems[is] > dest->elems[id]) | ||
1025 | { | ||
1026 | /* Copy from the top. */ | ||
1027 | dest->elems[id + delta--] = dest->elems[is--]; | ||
1028 | if (delta == 0) | ||
1029 | break; | ||
1030 | } | ||
1031 | else | ||
1032 | { | ||
1033 | /* Slide from the bottom. */ | ||
1034 | dest->elems[id + delta] = dest->elems[id]; | ||
1035 | if (! REG_VALID_INDEX (--id)) | ||
1036 | break; | ||
1037 | } | ||
1038 | } | ||
1039 | |||
1040 | /* Copy remaining SRC elements. */ | ||
1041 | memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]); | ||
1042 | |||
1043 | return REG_NOERROR; | ||
1044 | } | ||
1045 | |||
1046 | /* Calculate the union set of the sets SRC1 and SRC2. And store it to | ||
1047 | DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ | ||
1048 | |||
1049 | static reg_errcode_t | ||
1050 | internal_function | ||
1051 | re_node_set_init_union (re_node_set *dest, const re_node_set *src1, | ||
1052 | const re_node_set *src2) | ||
1053 | { | ||
1054 | Idx i1, i2, id; | ||
1055 | if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) | ||
1056 | { | ||
1057 | dest->alloc = src1->nelem + src2->nelem; | ||
1058 | if (sizeof (Idx) < 2 && dest->alloc < src1->nelem) | ||
1059 | return REG_ESPACE; | ||
1060 | dest->elems = re_xmalloc (Idx, dest->alloc); | ||
1061 | if (BE (dest->elems == NULL, 0)) | ||
1062 | return REG_ESPACE; | ||
1063 | } | ||
1064 | else | ||
1065 | { | ||
1066 | if (src1 != NULL && src1->nelem > 0) | ||
1067 | return re_node_set_init_copy (dest, src1); | ||
1068 | else if (src2 != NULL && src2->nelem > 0) | ||
1069 | return re_node_set_init_copy (dest, src2); | ||
1070 | else | ||
1071 | re_node_set_init_empty (dest); | ||
1072 | return REG_NOERROR; | ||
1073 | } | ||
1074 | for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) | ||
1075 | { | ||
1076 | if (src1->elems[i1] > src2->elems[i2]) | ||
1077 | { | ||
1078 | dest->elems[id++] = src2->elems[i2++]; | ||
1079 | continue; | ||
1080 | } | ||
1081 | if (src1->elems[i1] == src2->elems[i2]) | ||
1082 | ++i2; | ||
1083 | dest->elems[id++] = src1->elems[i1++]; | ||
1084 | } | ||
1085 | if (i1 < src1->nelem) | ||
1086 | { | ||
1087 | memcpy (dest->elems + id, src1->elems + i1, | ||
1088 | (src1->nelem - i1) * sizeof dest->elems[0]); | ||
1089 | id += src1->nelem - i1; | ||
1090 | } | ||
1091 | else if (i2 < src2->nelem) | ||
1092 | { | ||
1093 | memcpy (dest->elems + id, src2->elems + i2, | ||
1094 | (src2->nelem - i2) * sizeof dest->elems[0]); | ||
1095 | id += src2->nelem - i2; | ||
1096 | } | ||
1097 | dest->nelem = id; | ||
1098 | return REG_NOERROR; | ||
1099 | } | ||
1100 | |||
1101 | /* Calculate the union set of the sets DEST and SRC. And store it to | ||
1102 | DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ | ||
1103 | |||
1104 | static reg_errcode_t | ||
1105 | internal_function | ||
1106 | re_node_set_merge (re_node_set *dest, const re_node_set *src) | ||
1107 | { | ||
1108 | Idx is, id, sbase, delta; | ||
1109 | if (src == NULL || src->nelem == 0) | ||
1110 | return REG_NOERROR; | ||
1111 | if (sizeof (Idx) < 3 | ||
1112 | && ((Idx) (2 * src->nelem) < src->nelem | ||
1113 | || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem)) | ||
1114 | return REG_ESPACE; | ||
1115 | if (dest->alloc < 2 * src->nelem + dest->nelem) | ||
1116 | { | ||
1117 | Idx new_alloc = src->nelem + dest->alloc; | ||
1118 | Idx *new_buffer; | ||
1119 | if (sizeof (Idx) < 4 && new_alloc < dest->alloc) | ||
1120 | return REG_ESPACE; | ||
1121 | new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc); | ||
1122 | if (BE (new_buffer == NULL, 0)) | ||
1123 | return REG_ESPACE; | ||
1124 | dest->elems = new_buffer; | ||
1125 | dest->alloc = new_alloc; | ||
1126 | } | ||
1127 | |||
1128 | if (BE (dest->nelem == 0, 0)) | ||
1129 | { | ||
1130 | dest->nelem = src->nelem; | ||
1131 | memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]); | ||
1132 | return REG_NOERROR; | ||
1133 | } | ||
1134 | |||
1135 | /* Copy into the top of DEST the items of SRC that are not | ||
1136 | found in DEST. Maybe we could binary search in DEST? */ | ||
1137 | for (sbase = dest->nelem + 2 * src->nelem, | ||
1138 | is = src->nelem - 1, id = dest->nelem - 1; | ||
1139 | REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) | ||
1140 | { | ||
1141 | if (dest->elems[id] == src->elems[is]) | ||
1142 | is--, id--; | ||
1143 | else if (dest->elems[id] < src->elems[is]) | ||
1144 | dest->elems[--sbase] = src->elems[is--]; | ||
1145 | else /* if (dest->elems[id] > src->elems[is]) */ | ||
1146 | --id; | ||
1147 | } | ||
1148 | |||
1149 | if (REG_VALID_INDEX (is)) | ||
1150 | { | ||
1151 | /* If DEST is exhausted, the remaining items of SRC must be unique. */ | ||
1152 | sbase -= is + 1; | ||
1153 | memcpy (dest->elems + sbase, src->elems, | ||
1154 | (is + 1) * sizeof dest->elems[0]); | ||
1155 | } | ||
1156 | |||
1157 | id = dest->nelem - 1; | ||
1158 | is = dest->nelem + 2 * src->nelem - 1; | ||
1159 | delta = is - sbase + 1; | ||
1160 | if (delta == 0) | ||
1161 | return REG_NOERROR; | ||
1162 | |||
1163 | /* Now copy. When DELTA becomes zero, the remaining | ||
1164 | DEST elements are already in place. */ | ||
1165 | dest->nelem += delta; | ||
1166 | for (;;) | ||
1167 | { | ||
1168 | if (dest->elems[is] > dest->elems[id]) | ||
1169 | { | ||
1170 | /* Copy from the top. */ | ||
1171 | dest->elems[id + delta--] = dest->elems[is--]; | ||
1172 | if (delta == 0) | ||
1173 | break; | ||
1174 | } | ||
1175 | else | ||
1176 | { | ||
1177 | /* Slide from the bottom. */ | ||
1178 | dest->elems[id + delta] = dest->elems[id]; | ||
1179 | if (! REG_VALID_INDEX (--id)) | ||
1180 | { | ||
1181 | /* Copy remaining SRC elements. */ | ||
1182 | memcpy (dest->elems, dest->elems + sbase, | ||
1183 | delta * sizeof dest->elems[0]); | ||
1184 | break; | ||
1185 | } | ||
1186 | } | ||
1187 | } | ||
1188 | |||
1189 | return REG_NOERROR; | ||
1190 | } | ||
1191 | |||
1192 | /* Insert the new element ELEM to the re_node_set* SET. | ||
1193 | SET should not already have ELEM. | ||
1194 | Return true if successful. */ | ||
1195 | |||
1196 | static bool | ||
1197 | internal_function | ||
1198 | re_node_set_insert (re_node_set *set, Idx elem) | ||
1199 | { | ||
1200 | Idx idx; | ||
1201 | /* In case the set is empty. */ | ||
1202 | if (set->alloc == 0) | ||
1203 | return re_node_set_init_1 (set, elem) == REG_NOERROR; | ||
1204 | |||
1205 | if (BE (set->nelem, 0) == 0) | ||
1206 | { | ||
1207 | /* We already guaranteed above that set->alloc != 0. */ | ||
1208 | set->elems[0] = elem; | ||
1209 | ++set->nelem; | ||
1210 | return true; | ||
1211 | } | ||
1212 | |||
1213 | /* Realloc if we need. */ | ||
1214 | if (set->alloc == set->nelem) | ||
1215 | { | ||
1216 | Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc); | ||
1217 | if (BE (new_elems == NULL, 0)) | ||
1218 | return false; | ||
1219 | set->elems = new_elems; | ||
1220 | } | ||
1221 | |||
1222 | /* Move the elements which follows the new element. Test the | ||
1223 | first element separately to skip a check in the inner loop. */ | ||
1224 | if (elem < set->elems[0]) | ||
1225 | { | ||
1226 | idx = 0; | ||
1227 | for (idx = set->nelem; idx > 0; idx--) | ||
1228 | set->elems[idx] = set->elems[idx - 1]; | ||
1229 | } | ||
1230 | else | ||
1231 | { | ||
1232 | for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) | ||
1233 | set->elems[idx] = set->elems[idx - 1]; | ||
1234 | } | ||
1235 | |||
1236 | /* Insert the new element. */ | ||
1237 | set->elems[idx] = elem; | ||
1238 | ++set->nelem; | ||
1239 | return true; | ||
1240 | } | ||
1241 | |||
1242 | /* Insert the new element ELEM to the re_node_set* SET. | ||
1243 | SET should not already have any element greater than or equal to ELEM. | ||
1244 | Return true if successful. */ | ||
1245 | |||
1246 | static bool | ||
1247 | internal_function | ||
1248 | re_node_set_insert_last (re_node_set *set, Idx elem) | ||
1249 | { | ||
1250 | /* Realloc if we need. */ | ||
1251 | if (set->alloc == set->nelem) | ||
1252 | { | ||
1253 | Idx *new_elems; | ||
1254 | new_elems = re_x2realloc (set->elems, Idx, &set->alloc); | ||
1255 | if (BE (new_elems == NULL, 0)) | ||
1256 | return false; | ||
1257 | set->elems = new_elems; | ||
1258 | } | ||
1259 | |||
1260 | /* Insert the new element. */ | ||
1261 | set->elems[set->nelem++] = elem; | ||
1262 | return true; | ||
1263 | } | ||
1264 | |||
1265 | /* Compare two node sets SET1 and SET2. | ||
1266 | Return true if SET1 and SET2 are equivalent. */ | ||
1267 | |||
1268 | static bool | ||
1269 | internal_function __attribute ((pure)) | ||
1270 | re_node_set_compare (const re_node_set *set1, const re_node_set *set2) | ||
1271 | { | ||
1272 | Idx i; | ||
1273 | if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) | ||
1274 | return false; | ||
1275 | for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) | ||
1276 | if (set1->elems[i] != set2->elems[i]) | ||
1277 | return false; | ||
1278 | return true; | ||
1279 | } | ||
1280 | |||
1281 | /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ | ||
1282 | |||
1283 | static Idx | ||
1284 | internal_function __attribute ((pure)) | ||
1285 | re_node_set_contains (const re_node_set *set, Idx elem) | ||
1286 | { | ||
1287 | __re_size_t idx, right, mid; | ||
1288 | if (! REG_VALID_NONZERO_INDEX (set->nelem)) | ||
1289 | return 0; | ||
1290 | |||
1291 | /* Binary search the element. */ | ||
1292 | idx = 0; | ||
1293 | right = set->nelem - 1; | ||
1294 | while (idx < right) | ||
1295 | { | ||
1296 | mid = (idx + right) / 2; | ||
1297 | if (set->elems[mid] < elem) | ||
1298 | idx = mid + 1; | ||
1299 | else | ||
1300 | right = mid; | ||
1301 | } | ||
1302 | return set->elems[idx] == elem ? idx + 1 : 0; | ||
1303 | } | ||
1304 | |||
1305 | static void | ||
1306 | internal_function | ||
1307 | re_node_set_remove_at (re_node_set *set, Idx idx) | ||
1308 | { | ||
1309 | if (idx < 0 || idx >= set->nelem) | ||
1310 | return; | ||
1311 | --set->nelem; | ||
1312 | for (; idx < set->nelem; idx++) | ||
1313 | set->elems[idx] = set->elems[idx + 1]; | ||
1314 | } | ||
1315 | |||
1316 | |||
1317 | /* Add the token TOKEN to dfa->nodes, and return the index of the token. | ||
1318 | Or return REG_MISSING if an error occurred. */ | ||
1319 | |||
1320 | static Idx | ||
1321 | internal_function | ||
1322 | re_dfa_add_node (re_dfa_t *dfa, re_token_t token) | ||
1323 | { | ||
1324 | int type = token.type; | ||
1325 | if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) | ||
1326 | { | ||
1327 | Idx new_nodes_alloc = dfa->nodes_alloc; | ||
1328 | Idx *new_nexts, *new_indices; | ||
1329 | re_node_set *new_edests, *new_eclosures; | ||
1330 | |||
1331 | re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t, | ||
1332 | &new_nodes_alloc); | ||
1333 | if (BE (new_nodes == NULL, 0)) | ||
1334 | return REG_MISSING; | ||
1335 | dfa->nodes = new_nodes; | ||
1336 | new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); | ||
1337 | new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); | ||
1338 | new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc); | ||
1339 | new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); | ||
1340 | if (BE (new_nexts == NULL || new_indices == NULL | ||
1341 | || new_edests == NULL || new_eclosures == NULL, 0)) | ||
1342 | return REG_MISSING; | ||
1343 | dfa->nexts = new_nexts; | ||
1344 | dfa->org_indices = new_indices; | ||
1345 | dfa->edests = new_edests; | ||
1346 | dfa->eclosures = new_eclosures; | ||
1347 | dfa->nodes_alloc = new_nodes_alloc; | ||
1348 | } | ||
1349 | dfa->nodes[dfa->nodes_len] = token; | ||
1350 | dfa->nodes[dfa->nodes_len].constraint = 0; | ||
1351 | #ifdef RE_ENABLE_I18N | ||
1352 | dfa->nodes[dfa->nodes_len].accept_mb = | ||
1353 | (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; | ||
1354 | #endif | ||
1355 | dfa->nexts[dfa->nodes_len] = REG_MISSING; | ||
1356 | re_node_set_init_empty (dfa->edests + dfa->nodes_len); | ||
1357 | re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); | ||
1358 | return dfa->nodes_len++; | ||
1359 | } | ||
1360 | |||
1361 | static inline re_hashval_t | ||
1362 | internal_function | ||
1363 | calc_state_hash (const re_node_set *nodes, unsigned int context) | ||
1364 | { | ||
1365 | re_hashval_t hash = nodes->nelem + context; | ||
1366 | Idx i; | ||
1367 | for (i = 0 ; i < nodes->nelem ; i++) | ||
1368 | hash += nodes->elems[i]; | ||
1369 | return hash; | ||
1370 | } | ||
1371 | |||
1372 | /* Search for the state whose node_set is equivalent to NODES. | ||
1373 | Return the pointer to the state, if we found it in the DFA. | ||
1374 | Otherwise create the new one and return it. In case of an error | ||
1375 | return NULL and set the error code in ERR. | ||
1376 | Note: - We assume NULL as the invalid state, then it is possible that | ||
1377 | return value is NULL and ERR is REG_NOERROR. | ||
1378 | - We never return non-NULL value in case of any errors, it is for | ||
1379 | optimization. */ | ||
1380 | |||
1381 | static re_dfastate_t* | ||
1382 | internal_function | ||
1383 | re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes) | ||
1384 | { | ||
1385 | re_hashval_t hash; | ||
1386 | re_dfastate_t *new_state; | ||
1387 | struct re_state_table_entry *spot; | ||
1388 | Idx i; | ||
1389 | #ifdef lint | ||
1390 | /* Suppress bogus uninitialized-variable warnings. */ | ||
1391 | *err = REG_NOERROR; | ||
1392 | #endif | ||
1393 | if (BE (nodes->nelem == 0, 0)) | ||
1394 | { | ||
1395 | *err = REG_NOERROR; | ||
1396 | return NULL; | ||
1397 | } | ||
1398 | hash = calc_state_hash (nodes, 0); | ||
1399 | spot = dfa->state_table + (hash & dfa->state_hash_mask); | ||
1400 | |||
1401 | for (i = 0 ; i < spot->num ; i++) | ||
1402 | { | ||
1403 | re_dfastate_t *state = spot->array[i]; | ||
1404 | if (hash != state->hash) | ||
1405 | continue; | ||
1406 | if (re_node_set_compare (&state->nodes, nodes)) | ||
1407 | return state; | ||
1408 | } | ||
1409 | |||
1410 | /* There are no appropriate state in the dfa, create the new one. */ | ||
1411 | new_state = create_ci_newstate (dfa, nodes, hash); | ||
1412 | if (BE (new_state != NULL, 1)) | ||
1413 | return new_state; | ||
1414 | else | ||
1415 | { | ||
1416 | *err = REG_ESPACE; | ||
1417 | return NULL; | ||
1418 | } | ||
1419 | } | ||
1420 | |||
1421 | /* Search for the state whose node_set is equivalent to NODES and | ||
1422 | whose context is equivalent to CONTEXT. | ||
1423 | Return the pointer to the state, if we found it in the DFA. | ||
1424 | Otherwise create the new one and return it. In case of an error | ||
1425 | return NULL and set the error code in ERR. | ||
1426 | Note: - We assume NULL as the invalid state, then it is possible that | ||
1427 | return value is NULL and ERR is REG_NOERROR. | ||
1428 | - We never return non-NULL value in case of any errors, it is for | ||
1429 | optimization. */ | ||
1430 | |||
1431 | static re_dfastate_t* | ||
1432 | internal_function | ||
1433 | re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa, | ||
1434 | const re_node_set *nodes, unsigned int context) | ||
1435 | { | ||
1436 | re_hashval_t hash; | ||
1437 | re_dfastate_t *new_state; | ||
1438 | struct re_state_table_entry *spot; | ||
1439 | Idx i; | ||
1440 | #ifdef lint | ||
1441 | /* Suppress bogus uninitialized-variable warnings. */ | ||
1442 | *err = REG_NOERROR; | ||
1443 | #endif | ||
1444 | if (nodes->nelem == 0) | ||
1445 | { | ||
1446 | *err = REG_NOERROR; | ||
1447 | return NULL; | ||
1448 | } | ||
1449 | hash = calc_state_hash (nodes, context); | ||
1450 | spot = dfa->state_table + (hash & dfa->state_hash_mask); | ||
1451 | |||
1452 | for (i = 0 ; i < spot->num ; i++) | ||
1453 | { | ||
1454 | re_dfastate_t *state = spot->array[i]; | ||
1455 | if (state->hash == hash | ||
1456 | && state->context == context | ||
1457 | && re_node_set_compare (state->entrance_nodes, nodes)) | ||
1458 | return state; | ||
1459 | } | ||
1460 | /* There are no appropriate state in `dfa', create the new one. */ | ||
1461 | new_state = create_cd_newstate (dfa, nodes, context, hash); | ||
1462 | if (BE (new_state != NULL, 1)) | ||
1463 | return new_state; | ||
1464 | else | ||
1465 | { | ||
1466 | *err = REG_ESPACE; | ||
1467 | return NULL; | ||
1468 | } | ||
1469 | } | ||
1470 | |||
1471 | /* Finish initialization of the new state NEWSTATE, and using its hash value | ||
1472 | HASH put in the appropriate bucket of DFA's state table. Return value | ||
1473 | indicates the error code if failed. */ | ||
1474 | |||
1475 | static reg_errcode_t | ||
1476 | internal_function | ||
1477 | register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash) | ||
1478 | { | ||
1479 | struct re_state_table_entry *spot; | ||
1480 | reg_errcode_t err; | ||
1481 | Idx i; | ||
1482 | |||
1483 | newstate->hash = hash; | ||
1484 | err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); | ||
1485 | if (BE (err != REG_NOERROR, 0)) | ||
1486 | return REG_ESPACE; | ||
1487 | for (i = 0; i < newstate->nodes.nelem; i++) | ||
1488 | { | ||
1489 | Idx elem = newstate->nodes.elems[i]; | ||
1490 | if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) | ||
1491 | { | ||
1492 | bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem); | ||
1493 | if (BE (! ok, 0)) | ||
1494 | return REG_ESPACE; | ||
1495 | } | ||
1496 | } | ||
1497 | |||
1498 | spot = dfa->state_table + (hash & dfa->state_hash_mask); | ||
1499 | if (BE (spot->alloc <= spot->num, 0)) | ||
1500 | { | ||
1501 | Idx new_alloc = spot->num; | ||
1502 | re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *, | ||
1503 | &new_alloc); | ||
1504 | if (BE (new_array == NULL, 0)) | ||
1505 | return REG_ESPACE; | ||
1506 | spot->array = new_array; | ||
1507 | spot->alloc = new_alloc; | ||
1508 | } | ||
1509 | spot->array[spot->num++] = newstate; | ||
1510 | return REG_NOERROR; | ||
1511 | } | ||
1512 | |||
1513 | /* Create the new state which is independ of contexts. | ||
1514 | Return the new state if succeeded, otherwise return NULL. */ | ||
1515 | |||
1516 | static re_dfastate_t * | ||
1517 | internal_function | ||
1518 | create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, | ||
1519 | re_hashval_t hash) | ||
1520 | { | ||
1521 | Idx i; | ||
1522 | reg_errcode_t err; | ||
1523 | re_dfastate_t *newstate; | ||
1524 | |||
1525 | newstate = re_calloc (re_dfastate_t, 1); | ||
1526 | if (BE (newstate == NULL, 0)) | ||
1527 | return NULL; | ||
1528 | err = re_node_set_init_copy (&newstate->nodes, nodes); | ||
1529 | if (BE (err != REG_NOERROR, 0)) | ||
1530 | { | ||
1531 | re_free (newstate); | ||
1532 | return NULL; | ||
1533 | } | ||
1534 | |||
1535 | newstate->entrance_nodes = &newstate->nodes; | ||
1536 | for (i = 0 ; i < nodes->nelem ; i++) | ||
1537 | { | ||
1538 | re_token_t *node = dfa->nodes + nodes->elems[i]; | ||
1539 | re_token_type_t type = node->type; | ||
1540 | if (type == CHARACTER && !node->constraint) | ||
1541 | continue; | ||
1542 | #ifdef RE_ENABLE_I18N | ||
1543 | newstate->accept_mb |= node->accept_mb; | ||
1544 | #endif /* RE_ENABLE_I18N */ | ||
1545 | |||
1546 | /* If the state has the halt node, the state is a halt state. */ | ||
1547 | if (type == END_OF_RE) | ||
1548 | newstate->halt = 1; | ||
1549 | else if (type == OP_BACK_REF) | ||
1550 | newstate->has_backref = 1; | ||
1551 | else if (type == ANCHOR || node->constraint) | ||
1552 | newstate->has_constraint = 1; | ||
1553 | } | ||
1554 | err = register_state (dfa, newstate, hash); | ||
1555 | if (BE (err != REG_NOERROR, 0)) | ||
1556 | { | ||
1557 | free_state (newstate); | ||
1558 | newstate = NULL; | ||
1559 | } | ||
1560 | return newstate; | ||
1561 | } | ||
1562 | |||
1563 | /* Create the new state which is depend on the context CONTEXT. | ||
1564 | Return the new state if succeeded, otherwise return NULL. */ | ||
1565 | |||
1566 | static re_dfastate_t * | ||
1567 | internal_function | ||
1568 | create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, | ||
1569 | unsigned int context, re_hashval_t hash) | ||
1570 | { | ||
1571 | Idx i, nctx_nodes = 0; | ||
1572 | reg_errcode_t err; | ||
1573 | re_dfastate_t *newstate; | ||
1574 | |||
1575 | newstate = re_calloc (re_dfastate_t, 1); | ||
1576 | if (BE (newstate == NULL, 0)) | ||
1577 | return NULL; | ||
1578 | err = re_node_set_init_copy (&newstate->nodes, nodes); | ||
1579 | if (BE (err != REG_NOERROR, 0)) | ||
1580 | { | ||
1581 | re_free (newstate); | ||
1582 | return NULL; | ||
1583 | } | ||
1584 | |||
1585 | newstate->context = context; | ||
1586 | newstate->entrance_nodes = &newstate->nodes; | ||
1587 | |||
1588 | for (i = 0 ; i < nodes->nelem ; i++) | ||
1589 | { | ||
1590 | unsigned int constraint = 0; | ||
1591 | re_token_t *node = dfa->nodes + nodes->elems[i]; | ||
1592 | re_token_type_t type = node->type; | ||
1593 | if (node->constraint) | ||
1594 | constraint = node->constraint; | ||
1595 | |||
1596 | if (type == CHARACTER && !constraint) | ||
1597 | continue; | ||
1598 | #ifdef RE_ENABLE_I18N | ||
1599 | newstate->accept_mb |= node->accept_mb; | ||
1600 | #endif /* RE_ENABLE_I18N */ | ||
1601 | |||
1602 | /* If the state has the halt node, the state is a halt state. */ | ||
1603 | if (type == END_OF_RE) | ||
1604 | newstate->halt = 1; | ||
1605 | else if (type == OP_BACK_REF) | ||
1606 | newstate->has_backref = 1; | ||
1607 | else if (type == ANCHOR) | ||
1608 | constraint = node->opr.ctx_type; | ||
1609 | |||
1610 | if (constraint) | ||
1611 | { | ||
1612 | if (newstate->entrance_nodes == &newstate->nodes) | ||
1613 | { | ||
1614 | newstate->entrance_nodes = re_malloc (re_node_set, 1); | ||
1615 | if (BE (newstate->entrance_nodes == NULL, 0)) | ||
1616 | { | ||
1617 | free_state (newstate); | ||
1618 | return NULL; | ||
1619 | } | ||
1620 | re_node_set_init_copy (newstate->entrance_nodes, nodes); | ||
1621 | nctx_nodes = 0; | ||
1622 | newstate->has_constraint = 1; | ||
1623 | } | ||
1624 | |||
1625 | if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) | ||
1626 | { | ||
1627 | re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); | ||
1628 | ++nctx_nodes; | ||
1629 | } | ||
1630 | } | ||
1631 | } | ||
1632 | err = register_state (dfa, newstate, hash); | ||
1633 | if (BE (err != REG_NOERROR, 0)) | ||
1634 | { | ||
1635 | free_state (newstate); | ||
1636 | newstate = NULL; | ||
1637 | } | ||
1638 | return newstate; | ||
1639 | } | ||
1640 | |||
1641 | static void | ||
1642 | internal_function | ||
1643 | free_state (re_dfastate_t *state) | ||
1644 | { | ||
1645 | re_node_set_free (&state->non_eps_nodes); | ||
1646 | re_node_set_free (&state->inveclosure); | ||
1647 | if (state->entrance_nodes != &state->nodes) | ||
1648 | { | ||
1649 | re_node_set_free (state->entrance_nodes); | ||
1650 | re_free (state->entrance_nodes); | ||
1651 | } | ||
1652 | re_node_set_free (&state->nodes); | ||
1653 | re_free (state->word_trtable); | ||
1654 | re_free (state->trtable); | ||
1655 | re_free (state); | ||
1656 | } | ||