summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/.cvsignore18
-rw-r--r--lib/regcomp.c3779
-rw-r--r--lib/regex.c68
-rw-r--r--lib/regex.h769
-rw-r--r--lib/regex.obin0 -> 172827 bytes
-rw-r--r--lib/regex_internal.c1656
-rw-r--r--lib/regex_internal.h911
-rw-r--r--lib/regexec.c4333
-rw-r--r--lib/strcase.h48
9 files changed, 11565 insertions, 17 deletions
diff --git a/lib/.cvsignore b/lib/.cvsignore
index b057eef..20e66d6 100644
--- a/lib/.cvsignore
+++ b/lib/.cvsignore
@@ -1,20 +1,4 @@
1Makefile 1Makefile
2Makefile.in 2Makefile.in
3.deps 3.deps
4codeset.m4 4getopt.h
5gettext.m4
6glibc21.m4
7iconv.m4
8intdiv0.m4
9inttypes-pri.m4
10inttypes.m4
11inttypes_h.m4
12isc-posix.m4
13lcmessage.m4
14lib-ld.m4
15lib-link.m4
16lib-prefix.m4
17progtest.m4
18stdint_h.m4
19uintmax_t.m4
20ulonglong.m4
diff --git a/lib/regcomp.c b/lib/regcomp.c
new file mode 100644
index 0000000..279b20c
--- /dev/null
+++ b/lib/regcomp.c
@@ -0,0 +1,3779 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 Idx length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax);
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset sbcset,
90 re_charset_t *mbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 bitset sbcset,
95 re_charset_t *mbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99#else /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset sbcset,
101 const unsigned char *name);
102static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 bitset sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123/* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
/* Storage layout: every message literal is followed by an explicit "\0"
   literal, and the adjacent literals are concatenated into one array that
   holds all messages back to back.  Each *_IDX macro is the byte offset of
   its message: the previous offset plus `sizeof' of the previous message
   literal (its characters plus one for the NUL).  The last message has no
   explicit "\0"; it is terminated by the NUL of the concatenated literal.
   NOTE(review): this presumes gettext_noop expands to its bare argument --
   confirm against the header that defines it.  */
128const char __re_error_msgid[] attribute_hidden =
129 {
130#define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
/* Offset table into __re_error_msgid.  Entry N is the byte offset of the
   message for error code N, so the message for code E is found at
   __re_error_msgid + __re_error_msgid_idx[E] (this is how the entry points
   in this file look messages up).  The entries are listed in the same order
   as the messages in __re_error_msgid.  */
182const size_t __re_error_msgid_idx[] attribute_hidden =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202
203/* Entry points for GNU code. */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
208
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
211
212const char *
213re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
215{
216 reg_errcode_t ret;
217
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
222
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
225
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
227
228 if (!ret)
229 return NULL;
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
231}
232#ifdef _LIBC
233weak_alias (__re_compile_pattern, re_compile_pattern)
234#endif
235
236/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239/* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241reg_syntax_t re_syntax_options;
242
243
244/* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
247
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
250
251reg_syntax_t
252re_set_syntax (reg_syntax_t syntax)
253{
254 reg_syntax_t ret = re_syntax_options;
255
256 re_syntax_options = syntax;
257 return ret;
258}
259#ifdef _LIBC
260weak_alias (__re_set_syntax, re_set_syntax)
261#endif
262
263int
264re_compile_fastmap (struct re_pattern_buffer *bufp)
265{
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
268
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
278 return 0;
279}
280#ifdef _LIBC
281weak_alias (__re_compile_fastmap, re_compile_fastmap)
282#endif
283
/* Mark byte CH in FASTMAP.  When ICASE is true the entry for
   tolower (CH) is marked as well.  */
static inline void
__attribute ((always_inline))
re_set_fastmap (char *fastmap, bool icase, int ch)
{
  fastmap[ch] = 1;
  if (!icase)
    return;
  fastmap[tolower (ch)] = 1;
}
292
293/* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
295
/* Walk every node in INIT_STATE's node set and mark in FASTMAP the bytes
   selected by that node's type:
     - CHARACTER: the node's byte (plus a case-folded variant, see below);
     - SIMPLE_BRACKET: every byte whose bit is set in the node's bitset;
     - COMPLEX_BRACKET (i18n only): bytes derived from the multibyte set;
     - OP_PERIOD, OP_UTF8_PERIOD, END_OF_RE: every byte, after which the
       function returns immediately.
   BUFP supplies the DFA (`re_buffer') and the syntax bits; for END_OF_RE
   its `re_can_be_null' flag is set.  */
296static void
297re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
298 char *fastmap)
299{
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
301 Idx node_cnt;
/* Byte-wise case folding through tolower is only requested when the
   DFA's mb_cur_max is 1; the multibyte case is handled separately in the
   branches below.  */
302 bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
304 {
305 Idx node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
307
308 if (type == CHARACTER)
309 {
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311#ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
313 {
314 unsigned char buf[MB_LEN_MAX];
315 unsigned char *p;
316 wchar_t wc;
317 mbstate_t state;
318
/* Gather this node's byte and the bytes of the directly following
   CHARACTER nodes flagged `mb_partial' into BUF.  NODE is a local copy,
   so advancing it does not disturb the outer loop.  */
319 p = buf;
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, 0, sizeof (state));
/* If the gathered bytes decode to exactly one wide character and its
   lowercase form can be encoded again, mark the first byte of that
   encoding too.  */
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
327 &state) == p - buf
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
329 != (size_t) -1))
330 re_set_fastmap (fastmap, false, buf[0]);
331 }
332#endif
333 }
334 else if (type == SIMPLE_BRACKET)
335 {
/* CH counts bit positions across all words of the bitset; every set bit
   marks the byte with that value.  */
336 int i, j, ch;
337 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
339 if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j))
340 re_set_fastmap (fastmap, icase, ch);
341 }
342#ifdef RE_ENABLE_I18N
343 else if (type == COMPLEX_BRACKET)
344 {
345 Idx i;
346 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
/* Sets that are negated or that contain collating symbols, equivalence
   classes, ranges or character classes get extra bytes marked by the
   platform-specific code below.  */
347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 || cset->nranges || cset->nchar_classes)
349 {
350# ifdef _LIBC
351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
352 {
353 /* In this case we want to catch the bytes which are
354 the first byte of any collation elements.
355 e.g. In da_DK, we want to catch 'a' since "aa"
356 is a valid collation element, and don't catch
357 'b' since 'b' is the only collation element
358 which starts from 'b'. */
359 const int32_t *table = (const int32_t *)
360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
361 for (i = 0; i < SBC_MAX; ++i)
362 if (table[i] < 0)
363 re_set_fastmap (fastmap, icase, i);
364 }
365# else
/* Outside libc: when mb_cur_max exceeds 1, mark every byte for which
   __btowc reports WEOF.  */
366 if (dfa->mb_cur_max > 1)
367 for (i = 0; i < SBC_MAX; ++i)
368 if (__btowc (i) == WEOF)
369 re_set_fastmap (fastmap, icase, i);
370# endif /* not _LIBC */
371 }
/* For each explicitly listed wide character, mark the first byte of its
   multibyte encoding and, when ignoring case with mb_cur_max > 1, the
   first byte of the encoding of its lowercase form.  */
372 for (i = 0; i < cset->nmbchars; ++i)
373 {
374 char buf[256];
375 mbstate_t state;
376 memset (&state, '\0', sizeof (state));
377 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
378 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
379 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
380 {
381 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
382 != (size_t) -1)
383 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
384 }
385 }
386 }
387#endif /* RE_ENABLE_I18N */
388 else if (type == OP_PERIOD
389#ifdef RE_ENABLE_I18N
390 || type == OP_UTF8_PERIOD
391#endif /* RE_ENABLE_I18N */
392 || type == END_OF_RE)
393 {
/* Every byte is marked, so the remaining nodes cannot add anything and
   the function returns at once.  END_OF_RE in an initial state also
   records that the pattern can be null.  */
394 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395 if (type == END_OF_RE)
396 bufp->re_can_be_null = 1;
397 return;
398 }
399 }
400}
401
402/* Entry point for POSIX code. */
403/* regcomp takes a regular expression as a string and compiles it.
404
405 PREG is a regex_t *. We do not expect any fields to be initialized,
406 since POSIX says we shouldn't. Thus, we set
407
408 `re_buffer' to the compiled pattern;
409 `re_used' to the length of the compiled pattern;
410 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
411 REG_EXTENDED bit in CFLAGS is set; otherwise, to
412 REG_SYNTAX_POSIX_BASIC;
413 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
414 `re_fastmap' to an allocated space for the fastmap;
415 `re_fastmap_accurate' to zero;
416 `re_nsub' to the number of subexpressions in PATTERN.
417
418 PATTERN is the address of the pattern string.
419
420 CFLAGS is a series of bits which affect compilation.
421
422 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423 use POSIX basic syntax.
424
425 If REG_NEWLINE is set, then . and [^...] don't match newline.
426 Also, regexec will try a match beginning after every newline.
427
428 If REG_ICASE is set, then we consider upper- and lowercase
429 versions of letters to be equivalent when matching.
430
431 If REG_NOSUB is set, then when PREG is passed to regexec, that
432 routine will report only success or failure, and nothing about the
433 registers.
434
435 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
436 the return codes and their meanings.) */
437
438int
439regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440{
441 reg_errcode_t ret;
442 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
443 : REG_SYNTAX_POSIX_BASIC);
444
445 preg->re_buffer = NULL;
446 preg->re_allocated = 0;
447 preg->re_used = 0;
448
449 /* Try to allocate space for the fastmap. */
450 preg->re_fastmap = re_malloc (char, SBC_MAX);
451 if (BE (preg->re_fastmap == NULL, 0))
452 return REG_ESPACE;
453
454 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
455
456 /* If REG_NEWLINE is set, newlines are treated differently. */
457 if (cflags & REG_NEWLINE)
458 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
459 syntax &= ~REG_DOT_NEWLINE;
460 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
461 /* It also changes the matching behavior. */
462 preg->re_newline_anchor = 1;
463 }
464 else
465 preg->re_newline_anchor = 0;
466 preg->re_no_sub = !!(cflags & REG_NOSUB);
467 preg->re_translate = NULL;
468
469 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470
471 /* POSIX doesn't distinguish between an unmatched open-group and an
472 unmatched close-group: both are REG_EPAREN. */
473 if (ret == REG_ERPAREN)
474 ret = REG_EPAREN;
475
476 /* We have already checked preg->re_fastmap != NULL. */
477 if (BE (ret == REG_NOERROR, 1))
478 /* Compute the fastmap now, since regexec cannot modify the pattern
479 buffer. This function never fails in this implementation. */
480 (void) re_compile_fastmap (preg);
481 else
482 {
483 /* Some error occurred while compiling the expression. */
484 re_free (preg->re_fastmap);
485 preg->re_fastmap = NULL;
486 }
487
488 return (int) ret;
489}
490#ifdef _LIBC
491weak_alias (__regcomp, regcomp)
492#endif
493
494/* Returns a message corresponding to an error code, ERRCODE, returned
495 from either regcomp or regexec. We don't use PREG here. */
496
497size_t
498regerror (int errcode, const regex_t *__restrict preg,
499 char *__restrict errbuf, size_t errbuf_size)
500{
501 const char *msg;
502 size_t msg_size;
503
504 if (BE (errcode < 0
505 || errcode >= (int) (sizeof (__re_error_msgid_idx)
506 / sizeof (__re_error_msgid_idx[0])), 0))
507 /* Only error codes returned by the rest of the code should be passed
508 to this routine. If we are given anything else, or if other regex
509 code generates an invalid error code, then the program has a bug.
510 Dump core so we can fix it. */
511 abort ();
512
513 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
514
515 msg_size = strlen (msg) + 1; /* Includes the null. */
516
517 if (BE (errbuf_size != 0, 1))
518 {
519 if (BE (msg_size > errbuf_size, 0))
520 {
521#if defined HAVE_MEMPCPY || defined _LIBC
522 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
523#else
524 memcpy (errbuf, msg, errbuf_size - 1);
525 errbuf[errbuf_size - 1] = 0;
526#endif
527 }
528 else
529 memcpy (errbuf, msg, msg_size);
530 }
531
532 return msg_size;
533}
534#ifdef _LIBC
535weak_alias (__regerror, regerror)
536#endif
537
538
539#ifdef RE_ENABLE_I18N
540/* This static array is used for the map to single-byte characters when
541 UTF-8 is used. Otherwise we would allocate memory just to initialize
542 it the same all the time. UTF-8 is the preferred encoding so this is
543 a worthwhile optimization. */
/* How the initializer is assembled: up to three all-ones words are emitted
   depending on BITSET_WORDS (one each when it exceeds 2, 4 and 6), followed
   by one final word that is BITSET_WORD_MAX shifted right so that only the
   low SBC_MAX % BITSET_WORD_BITS bits remain (or all bits when that
   remainder is 0).  Words not listed are zero-initialized.  A BITSET_WORDS
   above 8 is rejected at compile time.  */
544static const bitset utf8_sb_map =
545{
546 /* Set the first 128 bits. */
547# if 2 < BITSET_WORDS
548 BITSET_WORD_MAX,
549# endif
550# if 4 < BITSET_WORDS
551 BITSET_WORD_MAX,
552# endif
553# if 6 < BITSET_WORDS
554 BITSET_WORD_MAX,
555# endif
556# if 8 < BITSET_WORDS
557# error "Invalid BITSET_WORDS"
558# endif
559 (BITSET_WORD_MAX
560 >> (SBC_MAX % BITSET_WORD_BITS == 0
561 ? 0
562 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
563};
564#endif
565
566
567static void
568free_dfa_content (re_dfa_t *dfa)
569{
570 Idx i, j;
571
572 if (dfa->nodes)
573 for (i = 0; i < dfa->nodes_len; ++i)
574 free_token (dfa->nodes + i);
575 re_free (dfa->nexts);
576 for (i = 0; i < dfa->nodes_len; ++i)
577 {
578 if (dfa->eclosures != NULL)
579 re_node_set_free (dfa->eclosures + i);
580 if (dfa->inveclosures != NULL)
581 re_node_set_free (dfa->inveclosures + i);
582 if (dfa->edests != NULL)
583 re_node_set_free (dfa->edests + i);
584 }
585 re_free (dfa->edests);
586 re_free (dfa->eclosures);
587 re_free (dfa->inveclosures);
588 re_free (dfa->nodes);
589
590 if (dfa->state_table)
591 for (i = 0; i <= dfa->state_hash_mask; ++i)
592 {
593 struct re_state_table_entry *entry = dfa->state_table + i;
594 for (j = 0; j < entry->num; ++j)
595 {
596 re_dfastate_t *state = entry->array[j];
597 free_state (state);
598 }
599 re_free (entry->array);
600 }
601 re_free (dfa->state_table);
602#ifdef RE_ENABLE_I18N
603 if (dfa->sb_char != utf8_sb_map)
604 re_free (dfa->sb_char);
605#endif
606 re_free (dfa->subexp_map);
607#ifdef DEBUG
608 re_free (dfa->re_str);
609#endif
610
611 re_free (dfa);
612}
613
614
615/* Free dynamically allocated space used by PREG. */
616
617void
618regfree (regex_t *preg)
619{
620 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
621 if (BE (dfa != NULL, 1))
622 free_dfa_content (dfa);
623 preg->re_buffer = NULL;
624 preg->re_allocated = 0;
625
626 re_free (preg->re_fastmap);
627 preg->re_fastmap = NULL;
628
629 re_free (preg->re_translate);
630 preg->re_translate = NULL;
631}
632#ifdef _LIBC
633weak_alias (__regfree, regfree)
634#endif
635
636/* Entry points compatible with 4.2 BSD regex library. We don't define
637 them unless specifically requested. */
638
#if defined _REGEX_RE_COMP || defined _LIBC

/* The BSD interface works on one, and only one, pattern buffer.  */
static struct re_pattern_buffer re_comp_buf;

/* BSD-style compile: compile S into the single static buffer using the
   global syntax `re_syntax_options'.

   With a null S nothing is compiled: the call returns a null pointer when
   a previously compiled pattern exists and an error message when none
   does.  Otherwise any previous pattern is released first (its fastmap
   allocation is kept and reused), and the result is NULL on success or
   the message for the error code on failure.  */
char *
# ifdef _LIBC
/* Weak in libc, so that POSIX programs which do not use these functions
   may define the names themselves and still link against
   regcomp/regexec.  */
weak_function
# endif
re_comp (const char *s)
{
  reg_errcode_t ret;

  if (s == NULL)
    {
      if (re_comp_buf.re_buffer != NULL)
        return 0;
      return gettext ("No previous regular expression");
    }

  if (re_comp_buf.re_buffer != NULL)
    {
      /* Detach the fastmap so that regfree leaves it alone, wipe the
         buffer, then attach the fastmap again.  */
      char *saved_fastmap = re_comp_buf.re_fastmap;

      re_comp_buf.re_fastmap = NULL;
      __regfree (&re_comp_buf);
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
      re_comp_buf.re_fastmap = saved_fastmap;
    }

  if (re_comp_buf.re_fastmap == NULL)
    {
      re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
      if (re_comp_buf.re_fastmap == NULL)
        return (char *) gettext (__re_error_msgid
                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
    }

  /* `re_exec' always passes NULL for the `regs' argument, so the buffer
     fields that only affect register handling are left untouched.  */

  /* Anchors match at newlines.  */
  re_comp_buf.re_newline_anchor = 1;

  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
  if (ret != 0)
    /* This discards `const' when !HAVE_LIBINTL.  */
    return (char *) gettext (__re_error_msgid
                             + __re_error_msgid_idx[(int) ret]);

  return NULL;
}

#ifdef _LIBC
/* Release the static BSD pattern buffer.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
#endif

#endif /* _REGEX_RE_COMP */
703
704/* Internal entry point.
705 Compile the regular expression PATTERN, whose length is LENGTH.
706 SYNTAX indicate regular expression's syntax. */
707
708static reg_errcode_t
709re_compile_internal (regex_t *preg, const char *pattern, Idx length,
710 reg_syntax_t syntax)
711{
712 reg_errcode_t err = REG_NOERROR;
713 re_dfa_t *dfa;
714 re_string_t regexp;
715
716 /* Initialize the pattern buffer. */
717 preg->re_fastmap_accurate = 0;
718 preg->re_syntax = syntax;
719 preg->re_not_bol = preg->re_not_eol = 0;
720 preg->re_used = 0;
721 preg->re_nsub = 0;
722 preg->re_can_be_null = 0;
723 preg->re_regs_allocated = REG_UNALLOCATED;
724
725 /* Initialize the dfa. */
726 dfa = (re_dfa_t *) preg->re_buffer;
727 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
728 {
729 /* If zero allocated, but buffer is non-null, try to realloc
730 enough space. This loses if buffer's address is bogus, but
731 that is the user's responsibility. If buffer is null this
732 is a simple allocation. */
733 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
734 if (dfa == NULL)
735 return REG_ESPACE;
736 preg->re_allocated = sizeof (re_dfa_t);
737 preg->re_buffer = (unsigned char *) dfa;
738 }
739 preg->re_used = sizeof (re_dfa_t);
740
741 __libc_lock_init (dfa->lock);
742
743 err = init_dfa (dfa, length);
744 if (BE (err != REG_NOERROR, 0))
745 {
746 free_dfa_content (dfa);
747 preg->re_buffer = NULL;
748 preg->re_allocated = 0;
749 return err;
750 }
751#ifdef DEBUG
752 dfa->re_str = re_malloc (char, length + 1);
753 strncpy (dfa->re_str, pattern, length + 1);
754#endif
755
756 err = re_string_construct (&regexp, pattern, length, preg->re_translate,
757 syntax & REG_IGNORE_CASE, dfa);
758 if (BE (err != REG_NOERROR, 0))
759 {
760 re_compile_internal_free_return:
761 free_workarea_compile (preg);
762 re_string_destruct (&regexp);
763 free_dfa_content (dfa);
764 preg->re_buffer = NULL;
765 preg->re_allocated = 0;
766 return err;
767 }
768
769 /* Parse the regular expression, and build a structure tree. */
770 preg->re_nsub = 0;
771 dfa->str_tree = parse (&regexp, preg, syntax, &err);
772 if (BE (dfa->str_tree == NULL, 0))
773 goto re_compile_internal_free_return;
774
775 /* Analyze the tree and create the nfa. */
776 err = analyze (preg);
777 if (BE (err != REG_NOERROR, 0))
778 goto re_compile_internal_free_return;
779
780#ifdef RE_ENABLE_I18N
781 /* If possible, do searching in single byte encoding to speed things up. */
782 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
783 optimize_utf8 (dfa);
784#endif
785
786 /* Then create the initial state of the dfa. */
787 err = create_initial_state (dfa);
788
789 /* Release work areas. */
790 free_workarea_compile (preg);
791 re_string_destruct (&regexp);
792
793 if (BE (err != REG_NOERROR, 0))
794 {
795 free_dfa_content (dfa);
796 preg->re_buffer = NULL;
797 preg->re_allocated = 0;
798 }
799
800 return err;
801}
802
803/* Initialize DFA. We use the length of the regular expression PAT_LEN
804 as the initial length of some arrays. */
805
806static reg_errcode_t
807init_dfa (re_dfa_t *dfa, Idx pat_len)
808{
809 __re_size_t table_size;
810#ifndef _LIBC
811 char *codeset_name;
812#endif
813
814 memset (dfa, '\0', sizeof (re_dfa_t));
815
816 /* Force allocation of str_tree_storage the first time. */
817 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
818
819 dfa->nodes_alloc = pat_len + 1;
820 dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
821
822 /* table_size = 2 ^ ceil(log pat_len) */
823 for (table_size = 1; table_size <= pat_len; table_size <<= 1)
824 if (0 < (Idx) -1 && table_size == 0)
825 return REG_ESPACE;
826
827 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
828 dfa->state_hash_mask = table_size - 1;
829
830 dfa->mb_cur_max = MB_CUR_MAX;
831#ifdef _LIBC
832 if (dfa->mb_cur_max == 6
833 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
834 dfa->is_utf8 = 1;
835 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
836 != 0);
837#else
838# ifdef HAVE_LANGINFO_CODESET
839 codeset_name = nl_langinfo (CODESET);
840# else
841 codeset_name = getenv ("LC_ALL");
842 if (codeset_name == NULL || codeset_name[0] == '\0')
843 codeset_name = getenv ("LC_CTYPE");
844 if (codeset_name == NULL || codeset_name[0] == '\0')
845 codeset_name = getenv ("LANG");
846 if (codeset_name == NULL)
847 codeset_name = "";
848 else if (strchr (codeset_name, '.') != NULL)
849 codeset_name = strchr (codeset_name, '.') + 1;
850# endif
851
852 if (strcasecmp (codeset_name, "UTF-8") == 0
853 || strcasecmp (codeset_name, "UTF8") == 0)
854 dfa->is_utf8 = 1;
855
856 /* We check exhaustively in the loop below if this charset is a
857 superset of ASCII. */
858 dfa->map_notascii = 0;
859#endif
860
861#ifdef RE_ENABLE_I18N
862 if (dfa->mb_cur_max > 1)
863 {
864 if (dfa->is_utf8)
865 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
866 else
867 {
868 int i, j, ch;
869
870 dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS);
871 if (BE (dfa->sb_char == NULL, 0))
872 return REG_ESPACE;
873
874 /* Set the bits corresponding to single byte chars. */
875 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
876 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
877 {
878 wint_t wch = __btowc (ch);
879 if (wch != WEOF)
880 dfa->sb_char[i] |= (bitset_word) 1 << j;
881# ifndef _LIBC
882 if (isascii (ch) && wch != ch)
883 dfa->map_notascii = 1;
884# endif
885 }
886 }
887 }
888#endif
889
890 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
891 return REG_ESPACE;
892 return REG_NOERROR;
893}
894
895/* Initialize WORD_CHAR table, which indicate which character is
896 "word". In this case "word" means that it is the word construction
897 character used by some operators like "\<", "\>", etc. */
898
899static void
900init_word_char (re_dfa_t *dfa)
901{
902 int i, j, ch;
903 dfa->word_ops_used = 1;
904 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
905 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
906 if (isalnum (ch) || ch == '_')
907 dfa->word_char[i] |= (bitset_word) 1 << j;
908}
909
910/* Free the work area which are only used while compiling. */
911
912static void
913free_workarea_compile (regex_t *preg)
914{
915 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
916 bin_tree_storage_t *storage, *next;
917 for (storage = dfa->str_tree_storage; storage; storage = next)
918 {
919 next = storage->next;
920 re_free (storage);
921 }
922 dfa->str_tree_storage = NULL;
923 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
924 dfa->str_tree = NULL;
925 re_free (dfa->org_indices);
926 dfa->org_indices = NULL;
927}
928
929/* Create initial states for all contexts. */
930
931static reg_errcode_t
932create_initial_state (re_dfa_t *dfa)
933{
934 Idx first, i;
935 reg_errcode_t err;
936 re_node_set init_nodes;
937
938 /* Initial states have the epsilon closure of the node which is
939 the first node of the regular expression. */
940 first = dfa->str_tree->first->node_idx;
941 dfa->init_node = first;
942 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
943 if (BE (err != REG_NOERROR, 0))
944 return err;
945
946 /* The back-references which are in initial states can epsilon transit,
947 since in this case all of the subexpressions can be null.
948 Then we add epsilon closures of the nodes which are the next nodes of
949 the back-references. */
950 if (dfa->nbackref > 0)
951 for (i = 0; i < init_nodes.nelem; ++i)
952 {
953 Idx node_idx = init_nodes.elems[i];
954 re_token_type_t type = dfa->nodes[node_idx].type;
955
956 Idx clexp_idx;
957 if (type != OP_BACK_REF)
958 continue;
959 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
960 {
961 re_token_t *clexp_node;
962 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
963 if (clexp_node->type == OP_CLOSE_SUBEXP
964 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
965 break;
966 }
967 if (clexp_idx == init_nodes.nelem)
968 continue;
969
970 if (type == OP_BACK_REF)
971 {
972 Idx dest_idx = dfa->edests[node_idx].elems[0];
973 if (!re_node_set_contains (&init_nodes, dest_idx))
974 {
975 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
976 i = 0;
977 }
978 }
979 }
980
981 /* It must be the first time to invoke acquire_state. */
982 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
983 /* We don't check ERR here, since the initial state must not be NULL. */
984 if (BE (dfa->init_state == NULL, 0))
985 return err;
986 if (dfa->init_state->has_constraint)
987 {
988 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
989 CONTEXT_WORD);
990 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
991 CONTEXT_NEWLINE);
992 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
993 &init_nodes,
994 CONTEXT_NEWLINE
995 | CONTEXT_BEGBUF);
996 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
997 || dfa->init_state_begbuf == NULL, 0))
998 return err;
999 }
1000 else
1001 dfa->init_state_word = dfa->init_state_nl
1002 = dfa->init_state_begbuf = dfa->init_state;
1003
1004 re_node_set_free (&init_nodes);
1005 return REG_NOERROR;
1006}
1007
1008#ifdef RE_ENABLE_I18N
1009/* If it is possible to do searching in single byte encoding instead of UTF-8
1010 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1011 DFA nodes where needed. */
1012
1013static void
1014optimize_utf8 (re_dfa_t *dfa)
1015{
1016 Idx node;
1017 int i;
1018 bool mb_chars = false;
1019 bool has_period = false;
1020
1021 for (node = 0; node < dfa->nodes_len; ++node)
1022 switch (dfa->nodes[node].type)
1023 {
1024 case CHARACTER:
1025 if (dfa->nodes[node].opr.c >= 0x80)
1026 mb_chars = true;
1027 break;
1028 case ANCHOR:
1029 switch (dfa->nodes[node].opr.idx)
1030 {
1031 case LINE_FIRST:
1032 case LINE_LAST:
1033 case BUF_FIRST:
1034 case BUF_LAST:
1035 break;
1036 default:
1037 /* Word anchors etc. cannot be handled. */
1038 return;
1039 }
1040 break;
1041 case OP_PERIOD:
1042 has_period = true;
1043 break;
1044 case OP_BACK_REF:
1045 case OP_ALT:
1046 case END_OF_RE:
1047 case OP_DUP_ASTERISK:
1048 case OP_OPEN_SUBEXP:
1049 case OP_CLOSE_SUBEXP:
1050 break;
1051 case COMPLEX_BRACKET:
1052 return;
1053 case SIMPLE_BRACKET:
1054 /* Just double check. */
1055 {
1056 int rshift =
1057 (SBC_MAX / 2 % BITSET_WORD_BITS == 0
1058 ? 0
1059 : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS);
1060 for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061 {
1062 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1063 return;
1064 rshift = 0;
1065 }
1066 }
1067 break;
1068 default:
1069 abort ();
1070 }
1071
1072 if (mb_chars || has_period)
1073 for (node = 0; node < dfa->nodes_len; ++node)
1074 {
1075 if (dfa->nodes[node].type == CHARACTER
1076 && dfa->nodes[node].opr.c >= 0x80)
1077 dfa->nodes[node].mb_partial = 0;
1078 else if (dfa->nodes[node].type == OP_PERIOD)
1079 dfa->nodes[node].type = OP_UTF8_PERIOD;
1080 }
1081
1082 /* The search can be in single byte locale. */
1083 dfa->mb_cur_max = 1;
1084 dfa->is_utf8 = 0;
1085 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1086}
1087#endif
1088
1089/* Analyze the structure tree, and calculate "first", "next", "edest",
1090 "eclosure", and "inveclosure". */
1091
1092static reg_errcode_t
1093analyze (regex_t *preg)
1094{
1095 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1096 reg_errcode_t ret;
1097
1098 /* Allocate arrays. */
1099 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1100 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1101 dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1102 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1103 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1104 || dfa->eclosures == NULL, 0))
1105 return REG_ESPACE;
1106
1107 dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1108 if (dfa->subexp_map != NULL)
1109 {
1110 Idx i;
1111 for (i = 0; i < preg->re_nsub; i++)
1112 dfa->subexp_map[i] = i;
1113 preorder (dfa->str_tree, optimize_subexps, dfa);
1114 for (i = 0; i < preg->re_nsub; i++)
1115 if (dfa->subexp_map[i] != i)
1116 break;
1117 if (i == preg->re_nsub)
1118 {
1119 free (dfa->subexp_map);
1120 dfa->subexp_map = NULL;
1121 }
1122 }
1123
1124 ret = postorder (dfa->str_tree, lower_subexps, preg);
1125 if (BE (ret != REG_NOERROR, 0))
1126 return ret;
1127 ret = postorder (dfa->str_tree, calc_first, dfa);
1128 if (BE (ret != REG_NOERROR, 0))
1129 return ret;
1130 preorder (dfa->str_tree, calc_next, dfa);
1131 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1132 if (BE (ret != REG_NOERROR, 0))
1133 return ret;
1134 ret = calc_eclosure (dfa);
1135 if (BE (ret != REG_NOERROR, 0))
1136 return ret;
1137
1138 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1139 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1140 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1141 || dfa->nbackref)
1142 {
1143 dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1144 if (BE (dfa->inveclosures == NULL, 0))
1145 return REG_ESPACE;
1146 ret = calc_inveclosure (dfa);
1147 }
1148
1149 return ret;
1150}
1151
1152/* Our parse trees are very unbalanced, so we cannot use a stack to
1153 implement parse tree visits. Instead, we use parent pointers and
1154 some hairy code in these two functions. */
1155static reg_errcode_t
1156postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1157 void *extra)
1158{
1159 bin_tree_t *node, *prev;
1160
1161 for (node = root; ; )
1162 {
1163 /* Descend down the tree, preferably to the left (or to the right
1164 if that's the only child). */
1165 while (node->left || node->right)
1166 if (node->left)
1167 node = node->left;
1168 else
1169 node = node->right;
1170
1171 do
1172 {
1173 reg_errcode_t err = fn (extra, node);
1174 if (BE (err != REG_NOERROR, 0))
1175 return err;
1176 if (node->parent == NULL)
1177 return REG_NOERROR;
1178 prev = node;
1179 node = node->parent;
1180 }
1181 /* Go up while we have a node that is reached from the right. */
1182 while (node->right == prev || node->right == NULL);
1183 node = node->right;
1184 }
1185}
1186
1187static reg_errcode_t
1188preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189 void *extra)
1190{
1191 bin_tree_t *node;
1192
1193 for (node = root; ; )
1194 {
1195 reg_errcode_t err = fn (extra, node);
1196 if (BE (err != REG_NOERROR, 0))
1197 return err;
1198
1199 /* Go to the left node, or up and to the right. */
1200 if (node->left)
1201 node = node->left;
1202 else
1203 {
1204 bin_tree_t *prev = NULL;
1205 while (node->right == prev || node->right == NULL)
1206 {
1207 prev = node;
1208 node = node->parent;
1209 if (!node)
1210 return REG_NOERROR;
1211 }
1212 node = node->right;
1213 }
1214 }
1215}
1216
1217/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1218 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1219 backreferences as well. Requires a preorder visit. */
1220static reg_errcode_t
1221optimize_subexps (void *extra, bin_tree_t *node)
1222{
1223 re_dfa_t *dfa = (re_dfa_t *) extra;
1224
1225 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1226 {
1227 int idx = node->token.opr.idx;
1228 node->token.opr.idx = dfa->subexp_map[idx];
1229 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1230 }
1231
1232 else if (node->token.type == SUBEXP
1233 && node->left && node->left->token.type == SUBEXP)
1234 {
1235 Idx other_idx = node->left->token.opr.idx;
1236
1237 node->left = node->left->left;
1238 if (node->left)
1239 node->left->parent = node;
1240
1241 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1242 if (other_idx < BITSET_WORD_BITS)
1243 dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx);
1244 }
1245
1246 return REG_NOERROR;
1247}
1248
1249/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1250 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1251static reg_errcode_t
1252lower_subexps (void *extra, bin_tree_t *node)
1253{
1254 regex_t *preg = (regex_t *) extra;
1255 reg_errcode_t err = REG_NOERROR;
1256
1257 if (node->left && node->left->token.type == SUBEXP)
1258 {
1259 node->left = lower_subexp (&err, preg, node->left);
1260 if (node->left)
1261 node->left->parent = node;
1262 }
1263 if (node->right && node->right->token.type == SUBEXP)
1264 {
1265 node->right = lower_subexp (&err, preg, node->right);
1266 if (node->right)
1267 node->right->parent = node;
1268 }
1269
1270 return err;
1271}
1272
1273static bin_tree_t *
1274lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1275{
1276 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1277 bin_tree_t *body = node->left;
1278 bin_tree_t *op, *cls, *tree1, *tree;
1279
1280 if (preg->re_no_sub
1281 /* We do not optimize empty subexpressions, because otherwise we may
1282 have bad CONCAT nodes with NULL children. This is obviously not
1283 very common, so we do not lose much. An example that triggers
1284 this case is the sed "script" /\(\)/x. */
1285 && node->left != NULL
1286 && ! (node->token.opr.idx < BITSET_WORD_BITS
1287 && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx)))
1288 return node->left;
1289
1290 /* Convert the SUBEXP node to the concatenation of an
1291 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1292 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1293 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1294 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1295 tree = create_tree (dfa, op, tree1, CONCAT);
1296 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1297 {
1298 *err = REG_ESPACE;
1299 return NULL;
1300 }
1301
1302 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1303 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1304 return tree;
1305}
1306
1307/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1308 nodes. Requires a postorder visit. */
1309static reg_errcode_t
1310calc_first (void *extra, bin_tree_t *node)
1311{
1312 re_dfa_t *dfa = (re_dfa_t *) extra;
1313 if (node->token.type == CONCAT)
1314 {
1315 node->first = node->left->first;
1316 node->node_idx = node->left->node_idx;
1317 }
1318 else
1319 {
1320 node->first = node;
1321 node->node_idx = re_dfa_add_node (dfa, node->token);
1322 if (BE (node->node_idx == REG_MISSING, 0))
1323 return REG_ESPACE;
1324 }
1325 return REG_NOERROR;
1326}
1327
1328/* Pass 2: compute NEXT on the tree. Preorder visit. */
1329static reg_errcode_t
1330calc_next (void *extra, bin_tree_t *node)
1331{
1332 switch (node->token.type)
1333 {
1334 case OP_DUP_ASTERISK:
1335 node->left->next = node;
1336 break;
1337 case CONCAT:
1338 node->left->next = node->right->first;
1339 node->right->next = node->next;
1340 break;
1341 default:
1342 if (node->left)
1343 node->left->next = node->next;
1344 if (node->right)
1345 node->right->next = node->next;
1346 break;
1347 }
1348 return REG_NOERROR;
1349}
1350
1351/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1352static reg_errcode_t
1353link_nfa_nodes (void *extra, bin_tree_t *node)
1354{
1355 re_dfa_t *dfa = (re_dfa_t *) extra;
1356 Idx idx = node->node_idx;
1357 reg_errcode_t err = REG_NOERROR;
1358
1359 switch (node->token.type)
1360 {
1361 case CONCAT:
1362 break;
1363
1364 case END_OF_RE:
1365 assert (node->next == NULL);
1366 break;
1367
1368 case OP_DUP_ASTERISK:
1369 case OP_ALT:
1370 {
1371 Idx left, right;
1372 dfa->has_plural_match = 1;
1373 if (node->left != NULL)
1374 left = node->left->first->node_idx;
1375 else
1376 left = node->next->node_idx;
1377 if (node->right != NULL)
1378 right = node->right->first->node_idx;
1379 else
1380 right = node->next->node_idx;
1381 assert (REG_VALID_INDEX (left));
1382 assert (REG_VALID_INDEX (right));
1383 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1384 }
1385 break;
1386
1387 case ANCHOR:
1388 case OP_OPEN_SUBEXP:
1389 case OP_CLOSE_SUBEXP:
1390 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1391 break;
1392
1393 case OP_BACK_REF:
1394 dfa->nexts[idx] = node->next->node_idx;
1395 if (node->token.type == OP_BACK_REF)
1396 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1397 break;
1398
1399 default:
1400 assert (!IS_EPSILON_NODE (node->token.type));
1401 dfa->nexts[idx] = node->next->node_idx;
1402 break;
1403 }
1404
1405 return err;
1406}
1407
1408/* Duplicate the epsilon closure of the node ROOT_NODE.
1409 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1410 to their own constraint. */
1411
1412static reg_errcode_t
1413duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1414 Idx top_clone_node, Idx root_node,
1415 unsigned int init_constraint)
1416{
1417 Idx org_node, clone_node;
1418 bool ok;
1419 unsigned int constraint = init_constraint;
1420 for (org_node = top_org_node, clone_node = top_clone_node;;)
1421 {
1422 Idx org_dest, clone_dest;
1423 if (dfa->nodes[org_node].type == OP_BACK_REF)
1424 {
1425 /* If the back reference epsilon-transit, its destination must
1426 also have the constraint. Then duplicate the epsilon closure
1427 of the destination of the back reference, and store it in
1428 edests of the back reference. */
1429 org_dest = dfa->nexts[org_node];
1430 re_node_set_empty (dfa->edests + clone_node);
1431 clone_dest = duplicate_node (dfa, org_dest, constraint);
1432 if (BE (clone_dest == REG_MISSING, 0))
1433 return REG_ESPACE;
1434 dfa->nexts[clone_node] = dfa->nexts[org_node];
1435 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1436 if (BE (! ok, 0))
1437 return REG_ESPACE;
1438 }
1439 else if (dfa->edests[org_node].nelem == 0)
1440 {
1441 /* In case of the node can't epsilon-transit, don't duplicate the
1442 destination and store the original destination as the
1443 destination of the node. */
1444 dfa->nexts[clone_node] = dfa->nexts[org_node];
1445 break;
1446 }
1447 else if (dfa->edests[org_node].nelem == 1)
1448 {
1449 /* In case of the node can epsilon-transit, and it has only one
1450 destination. */
1451 org_dest = dfa->edests[org_node].elems[0];
1452 re_node_set_empty (dfa->edests + clone_node);
1453 if (dfa->nodes[org_node].type == ANCHOR)
1454 {
1455 /* In case of the node has another constraint, append it. */
1456 if (org_node == root_node && clone_node != org_node)
1457 {
1458 /* ...but if the node is root_node itself, it means the
1459 epsilon closure have a loop, then tie it to the
1460 destination of the root_node. */
1461 ok = re_node_set_insert (dfa->edests + clone_node,
1462 org_dest);
1463 if (BE (! ok, 0))
1464 return REG_ESPACE;
1465 break;
1466 }
1467 constraint |= dfa->nodes[org_node].opr.ctx_type;
1468 }
1469 clone_dest = duplicate_node (dfa, org_dest, constraint);
1470 if (BE (clone_dest == REG_MISSING, 0))
1471 return REG_ESPACE;
1472 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473 if (BE (! ok, 0))
1474 return REG_ESPACE;
1475 }
1476 else /* dfa->edests[org_node].nelem == 2 */
1477 {
1478 /* In case of the node can epsilon-transit, and it has two
1479 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1480 org_dest = dfa->edests[org_node].elems[0];
1481 re_node_set_empty (dfa->edests + clone_node);
1482 /* Search for a duplicated node which satisfies the constraint. */
1483 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1484 if (clone_dest == REG_MISSING)
1485 {
1486 /* There are no such a duplicated node, create a new one. */
1487 reg_errcode_t err;
1488 clone_dest = duplicate_node (dfa, org_dest, constraint);
1489 if (BE (clone_dest == REG_MISSING, 0))
1490 return REG_ESPACE;
1491 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492 if (BE (! ok, 0))
1493 return REG_ESPACE;
1494 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1495 root_node, constraint);
1496 if (BE (err != REG_NOERROR, 0))
1497 return err;
1498 }
1499 else
1500 {
1501 /* There are a duplicated node which satisfy the constraint,
1502 use it to avoid infinite loop. */
1503 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504 if (BE (! ok, 0))
1505 return REG_ESPACE;
1506 }
1507
1508 org_dest = dfa->edests[org_node].elems[1];
1509 clone_dest = duplicate_node (dfa, org_dest, constraint);
1510 if (BE (clone_dest == REG_MISSING, 0))
1511 return REG_ESPACE;
1512 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1513 if (BE (! ok, 0))
1514 return REG_ESPACE;
1515 }
1516 org_node = org_dest;
1517 clone_node = clone_dest;
1518 }
1519 return REG_NOERROR;
1520}
1521
1522/* Search for a node which is duplicated from the node ORG_NODE, and
1523 satisfies the constraint CONSTRAINT. */
1524
1525static Idx
1526search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1527 unsigned int constraint)
1528{
1529 Idx idx;
1530 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1531 {
1532 if (org_node == dfa->org_indices[idx]
1533 && constraint == dfa->nodes[idx].constraint)
1534 return idx; /* Found. */
1535 }
1536 return REG_MISSING; /* Not found. */
1537}
1538
1539/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1540 Return the index of the new node, or REG_MISSING if insufficient storage is
1541 available. */
1542
1543static Idx
1544duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1545{
1546 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1547 if (BE (dup_idx != REG_MISSING, 1))
1548 {
1549 dfa->nodes[dup_idx].constraint = constraint;
1550 if (dfa->nodes[org_idx].type == ANCHOR)
1551 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1552 dfa->nodes[dup_idx].duplicated = 1;
1553
1554 /* Store the index of the original node. */
1555 dfa->org_indices[dup_idx] = org_idx;
1556 }
1557 return dup_idx;
1558}
1559
1560static reg_errcode_t
1561calc_inveclosure (re_dfa_t *dfa)
1562{
1563 Idx src, idx;
1564 bool ok;
1565 for (idx = 0; idx < dfa->nodes_len; ++idx)
1566 re_node_set_init_empty (dfa->inveclosures + idx);
1567
1568 for (src = 0; src < dfa->nodes_len; ++src)
1569 {
1570 Idx *elems = dfa->eclosures[src].elems;
1571 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1572 {
1573 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1574 if (BE (! ok, 0))
1575 return REG_ESPACE;
1576 }
1577 }
1578
1579 return REG_NOERROR;
1580}
1581
1582/* Calculate "eclosure" for all the node in DFA. */
1583
1584static reg_errcode_t
1585calc_eclosure (re_dfa_t *dfa)
1586{
1587 Idx node_idx;
1588 bool incomplete;
1589#ifdef DEBUG
1590 assert (dfa->nodes_len > 0);
1591#endif
1592 incomplete = false;
1593 /* For each nodes, calculate epsilon closure. */
1594 for (node_idx = 0; ; ++node_idx)
1595 {
1596 reg_errcode_t err;
1597 re_node_set eclosure_elem;
1598 if (node_idx == dfa->nodes_len)
1599 {
1600 if (!incomplete)
1601 break;
1602 incomplete = false;
1603 node_idx = 0;
1604 }
1605
1606#ifdef DEBUG
1607 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1608#endif
1609
1610 /* If we have already calculated, skip it. */
1611 if (dfa->eclosures[node_idx].nelem != 0)
1612 continue;
1613 /* Calculate epsilon closure of `node_idx'. */
1614 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1615 if (BE (err != REG_NOERROR, 0))
1616 return err;
1617
1618 if (dfa->eclosures[node_idx].nelem == 0)
1619 {
1620 incomplete = true;
1621 re_node_set_free (&eclosure_elem);
1622 }
1623 }
1624 return REG_NOERROR;
1625}
1626
1627/* Calculate epsilon closure of NODE. */
1628
1629static reg_errcode_t
1630calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1631{
1632 reg_errcode_t err;
1633 unsigned int constraint;
1634 Idx i;
1635 bool incomplete;
1636 bool ok;
1637 re_node_set eclosure;
1638 incomplete = false;
1639 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1640 if (BE (err != REG_NOERROR, 0))
1641 return err;
1642
1643 /* This indicates that we are calculating this node now.
1644 We reference this value to avoid infinite loop. */
1645 dfa->eclosures[node].nelem = REG_MISSING;
1646
1647 constraint = ((dfa->nodes[node].type == ANCHOR)
1648 ? dfa->nodes[node].opr.ctx_type : 0);
1649 /* If the current node has constraints, duplicate all nodes.
1650 Since they must inherit the constraints. */
1651 if (constraint
1652 && dfa->edests[node].nelem
1653 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1654 {
1655 Idx org_node, cur_node;
1656 org_node = cur_node = node;
1657 err = duplicate_node_closure (dfa, node, node, node, constraint);
1658 if (BE (err != REG_NOERROR, 0))
1659 return err;
1660 }
1661
1662 /* Expand each epsilon destination nodes. */
1663 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1664 for (i = 0; i < dfa->edests[node].nelem; ++i)
1665 {
1666 re_node_set eclosure_elem;
1667 Idx edest = dfa->edests[node].elems[i];
1668 /* If calculating the epsilon closure of `edest' is in progress,
1669 return intermediate result. */
1670 if (dfa->eclosures[edest].nelem == REG_MISSING)
1671 {
1672 incomplete = true;
1673 continue;
1674 }
1675 /* If we haven't calculated the epsilon closure of `edest' yet,
1676 calculate now. Otherwise use calculated epsilon closure. */
1677 if (dfa->eclosures[edest].nelem == 0)
1678 {
1679 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1680 if (BE (err != REG_NOERROR, 0))
1681 return err;
1682 }
1683 else
1684 eclosure_elem = dfa->eclosures[edest];
1685 /* Merge the epsilon closure of `edest'. */
1686 re_node_set_merge (&eclosure, &eclosure_elem);
1687 /* If the epsilon closure of `edest' is incomplete,
1688 the epsilon closure of this node is also incomplete. */
1689 if (dfa->eclosures[edest].nelem == 0)
1690 {
1691 incomplete = true;
1692 re_node_set_free (&eclosure_elem);
1693 }
1694 }
1695
1696 /* Epsilon closures include itself. */
1697 ok = re_node_set_insert (&eclosure, node);
1698 if (BE (! ok, 0))
1699 return REG_ESPACE;
1700 if (incomplete && !root)
1701 dfa->eclosures[node].nelem = 0;
1702 else
1703 dfa->eclosures[node] = eclosure;
1704 *new_set = eclosure;
1705 return REG_NOERROR;
1706}
1707
1708/* Functions for token which are used in the parser. */
1709
1710/* Fetch a token from INPUT.
1711 We must not use this function inside bracket expressions. */
1712
1713static void
1714fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1715{
1716 re_string_skip_bytes (input, peek_token (result, input, syntax));
1717}
1718
1719/* Peek a token from INPUT, and return the length of the token.
1720 We must not use this function inside bracket expressions. */
1721
1722static int
1723peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1724{
1725 unsigned char c;
1726
1727 if (re_string_eoi (input))
1728 {
1729 token->type = END_OF_RE;
1730 return 0;
1731 }
1732
1733 c = re_string_peek_byte (input, 0);
1734 token->opr.c = c;
1735
1736 token->word_char = 0;
1737#ifdef RE_ENABLE_I18N
1738 token->mb_partial = 0;
1739 if (input->mb_cur_max > 1 &&
1740 !re_string_first_byte (input, re_string_cur_idx (input)))
1741 {
1742 token->type = CHARACTER;
1743 token->mb_partial = 1;
1744 return 1;
1745 }
1746#endif
1747 if (c == '\\')
1748 {
1749 unsigned char c2;
1750 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1751 {
1752 token->type = BACK_SLASH;
1753 return 1;
1754 }
1755
1756 c2 = re_string_peek_byte_case (input, 1);
1757 token->opr.c = c2;
1758 token->type = CHARACTER;
1759#ifdef RE_ENABLE_I18N
1760 if (input->mb_cur_max > 1)
1761 {
1762 wint_t wc = re_string_wchar_at (input,
1763 re_string_cur_idx (input) + 1);
1764 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1765 }
1766 else
1767#endif
1768 token->word_char = IS_WORD_CHAR (c2) != 0;
1769
1770 switch (c2)
1771 {
1772 case '|':
1773 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1774 token->type = OP_ALT;
1775 break;
1776 case '1': case '2': case '3': case '4': case '5':
1777 case '6': case '7': case '8': case '9':
1778 if (!(syntax & REG_NO_BK_REFS))
1779 {
1780 token->type = OP_BACK_REF;
1781 token->opr.idx = c2 - '1';
1782 }
1783 break;
1784 case '<':
1785 if (!(syntax & REG_NO_GNU_OPS))
1786 {
1787 token->type = ANCHOR;
1788 token->opr.ctx_type = WORD_FIRST;
1789 }
1790 break;
1791 case '>':
1792 if (!(syntax & REG_NO_GNU_OPS))
1793 {
1794 token->type = ANCHOR;
1795 token->opr.ctx_type = WORD_LAST;
1796 }
1797 break;
1798 case 'b':
1799 if (!(syntax & REG_NO_GNU_OPS))
1800 {
1801 token->type = ANCHOR;
1802 token->opr.ctx_type = WORD_DELIM;
1803 }
1804 break;
1805 case 'B':
1806 if (!(syntax & REG_NO_GNU_OPS))
1807 {
1808 token->type = ANCHOR;
1809 token->opr.ctx_type = NOT_WORD_DELIM;
1810 }
1811 break;
1812 case 'w':
1813 if (!(syntax & REG_NO_GNU_OPS))
1814 token->type = OP_WORD;
1815 break;
1816 case 'W':
1817 if (!(syntax & REG_NO_GNU_OPS))
1818 token->type = OP_NOTWORD;
1819 break;
1820 case 's':
1821 if (!(syntax & REG_NO_GNU_OPS))
1822 token->type = OP_SPACE;
1823 break;
1824 case 'S':
1825 if (!(syntax & REG_NO_GNU_OPS))
1826 token->type = OP_NOTSPACE;
1827 break;
1828 case '`':
1829 if (!(syntax & REG_NO_GNU_OPS))
1830 {
1831 token->type = ANCHOR;
1832 token->opr.ctx_type = BUF_FIRST;
1833 }
1834 break;
1835 case '\'':
1836 if (!(syntax & REG_NO_GNU_OPS))
1837 {
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = BUF_LAST;
1840 }
1841 break;
1842 case '(':
1843 if (!(syntax & REG_NO_BK_PARENS))
1844 token->type = OP_OPEN_SUBEXP;
1845 break;
1846 case ')':
1847 if (!(syntax & REG_NO_BK_PARENS))
1848 token->type = OP_CLOSE_SUBEXP;
1849 break;
1850 case '+':
1851 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1852 token->type = OP_DUP_PLUS;
1853 break;
1854 case '?':
1855 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1856 token->type = OP_DUP_QUESTION;
1857 break;
1858 case '{':
1859 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1860 token->type = OP_OPEN_DUP_NUM;
1861 break;
1862 case '}':
1863 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1864 token->type = OP_CLOSE_DUP_NUM;
1865 break;
1866 default:
1867 break;
1868 }
1869 return 2;
1870 }
1871
1872 token->type = CHARACTER;
1873#ifdef RE_ENABLE_I18N
1874 if (input->mb_cur_max > 1)
1875 {
1876 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1877 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1878 }
1879 else
1880#endif
1881 token->word_char = IS_WORD_CHAR (token->opr.c);
1882
1883 switch (c)
1884 {
1885 case '\n':
1886 if (syntax & REG_NEWLINE_ALT)
1887 token->type = OP_ALT;
1888 break;
1889 case '|':
1890 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1891 token->type = OP_ALT;
1892 break;
1893 case '*':
1894 token->type = OP_DUP_ASTERISK;
1895 break;
1896 case '+':
1897 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1898 token->type = OP_DUP_PLUS;
1899 break;
1900 case '?':
1901 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1902 token->type = OP_DUP_QUESTION;
1903 break;
1904 case '{':
1905 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1906 token->type = OP_OPEN_DUP_NUM;
1907 break;
1908 case '}':
1909 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1910 token->type = OP_CLOSE_DUP_NUM;
1911 break;
1912 case '(':
1913 if (syntax & REG_NO_BK_PARENS)
1914 token->type = OP_OPEN_SUBEXP;
1915 break;
1916 case ')':
1917 if (syntax & REG_NO_BK_PARENS)
1918 token->type = OP_CLOSE_SUBEXP;
1919 break;
1920 case '[':
1921 token->type = OP_OPEN_BRACKET;
1922 break;
1923 case '.':
1924 token->type = OP_PERIOD;
1925 break;
1926 case '^':
1927 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1928 re_string_cur_idx (input) != 0)
1929 {
1930 char prev = re_string_peek_byte (input, -1);
1931 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1932 break;
1933 }
1934 token->type = ANCHOR;
1935 token->opr.ctx_type = LINE_FIRST;
1936 break;
1937 case '$':
1938 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1939 re_string_cur_idx (input) + 1 != re_string_length (input))
1940 {
1941 re_token_t next;
1942 re_string_skip_bytes (input, 1);
1943 peek_token (&next, input, syntax);
1944 re_string_skip_bytes (input, -1);
1945 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1946 break;
1947 }
1948 token->type = ANCHOR;
1949 token->opr.ctx_type = LINE_LAST;
1950 break;
1951 default:
1952 break;
1953 }
1954 return 1;
1955}
1956
1957/* Peek a token from INPUT, and return the length of the token.
1958 We must not use this function out of bracket expressions. */
1959
1960static int
1961peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1962{
1963 unsigned char c;
1964 if (re_string_eoi (input))
1965 {
1966 token->type = END_OF_RE;
1967 return 0;
1968 }
1969 c = re_string_peek_byte (input, 0);
1970 token->opr.c = c;
1971
1972#ifdef RE_ENABLE_I18N
1973 if (input->mb_cur_max > 1 &&
1974 !re_string_first_byte (input, re_string_cur_idx (input)))
1975 {
1976 token->type = CHARACTER;
1977 return 1;
1978 }
1979#endif /* RE_ENABLE_I18N */
1980
1981 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1982 && re_string_cur_idx (input) + 1 < re_string_length (input))
1983 {
1984 /* In this case, '\' escape a character. */
1985 unsigned char c2;
1986 re_string_skip_bytes (input, 1);
1987 c2 = re_string_peek_byte (input, 0);
1988 token->opr.c = c2;
1989 token->type = CHARACTER;
1990 return 1;
1991 }
1992 if (c == '[') /* '[' is a special char in a bracket exps. */
1993 {
1994 unsigned char c2;
1995 int token_len;
1996 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1997 c2 = re_string_peek_byte (input, 1);
1998 else
1999 c2 = 0;
2000 token->opr.c = c2;
2001 token_len = 2;
2002 switch (c2)
2003 {
2004 case '.':
2005 token->type = OP_OPEN_COLL_ELEM;
2006 break;
2007 case '=':
2008 token->type = OP_OPEN_EQUIV_CLASS;
2009 break;
2010 case ':':
2011 if (syntax & REG_CHAR_CLASSES)
2012 {
2013 token->type = OP_OPEN_CHAR_CLASS;
2014 break;
2015 }
2016 /* else fall through. */
2017 default:
2018 token->type = CHARACTER;
2019 token->opr.c = c;
2020 token_len = 1;
2021 break;
2022 }
2023 return token_len;
2024 }
2025 switch (c)
2026 {
2027 case '-':
2028 token->type = OP_CHARSET_RANGE;
2029 break;
2030 case ']':
2031 token->type = OP_CLOSE_BRACKET;
2032 break;
2033 case '^':
2034 token->type = OP_NON_MATCH_LIST;
2035 break;
2036 default:
2037 token->type = CHARACTER;
2038 }
2039 return 1;
2040}
2041
2042/* Functions for parser. */
2043
2044/* Entry point of the parser.
2045 Parse the regular expression REGEXP and return the structure tree.
2046 If an error is occured, ERR is set by error code, and return NULL.
2047 This function build the following tree, from regular expression <reg_exp>:
2048 CAT
2049 / \
2050 / \
2051 <reg_exp> EOR
2052
2053 CAT means concatenation.
2054 EOR means end of regular expression. */
2055
2056static bin_tree_t *
2057parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2058 reg_errcode_t *err)
2059{
2060 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2061 bin_tree_t *tree, *eor, *root;
2062 re_token_t current_token;
2063 dfa->syntax = syntax;
2064 fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2065 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2066 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2067 return NULL;
2068 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2069 if (tree != NULL)
2070 root = create_tree (dfa, tree, eor, CONCAT);
2071 else
2072 root = eor;
2073 if (BE (eor == NULL || root == NULL, 0))
2074 {
2075 *err = REG_ESPACE;
2076 return NULL;
2077 }
2078 return root;
2079}
2080
2081/* This function build the following tree, from regular expression
2082 <branch1>|<branch2>:
2083 ALT
2084 / \
2085 / \
2086 <branch1> <branch2>
2087
2088 ALT means alternative, which represents the operator `|'. */
2089
2090static bin_tree_t *
2091parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2092 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2093{
2094 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2095 bin_tree_t *tree, *branch = NULL;
2096 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2097 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2098 return NULL;
2099
2100 while (token->type == OP_ALT)
2101 {
2102 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2103 if (token->type != OP_ALT && token->type != END_OF_RE
2104 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2105 {
2106 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2107 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2108 return NULL;
2109 }
2110 else
2111 branch = NULL;
2112 tree = create_tree (dfa, tree, branch, OP_ALT);
2113 if (BE (tree == NULL, 0))
2114 {
2115 *err = REG_ESPACE;
2116 return NULL;
2117 }
2118 }
2119 return tree;
2120}
2121
2122/* This function build the following tree, from regular expression
2123 <exp1><exp2>:
2124 CAT
2125 / \
2126 / \
2127 <exp1> <exp2>
2128
2129 CAT means concatenation. */
2130
2131static bin_tree_t *
2132parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2133 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2134{
2135 bin_tree_t *tree, *exp;
2136 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2137 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2138 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2139 return NULL;
2140
2141 while (token->type != OP_ALT && token->type != END_OF_RE
2142 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2143 {
2144 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2145 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2146 {
2147 return NULL;
2148 }
2149 if (tree != NULL && exp != NULL)
2150 {
2151 tree = create_tree (dfa, tree, exp, CONCAT);
2152 if (tree == NULL)
2153 {
2154 *err = REG_ESPACE;
2155 return NULL;
2156 }
2157 }
2158 else if (tree == NULL)
2159 tree = exp;
2160 /* Otherwise exp == NULL, we don't need to create new tree. */
2161 }
2162 return tree;
2163}
2164
2165/* This function build the following tree, from regular expression a*:
2166 *
2167 |
2168 a
2169*/
2170
2171static bin_tree_t *
2172parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2173 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2174{
2175 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2176 bin_tree_t *tree;
2177 switch (token->type)
2178 {
2179 case CHARACTER:
2180 tree = create_token_tree (dfa, NULL, NULL, token);
2181 if (BE (tree == NULL, 0))
2182 {
2183 *err = REG_ESPACE;
2184 return NULL;
2185 }
2186#ifdef RE_ENABLE_I18N
2187 if (dfa->mb_cur_max > 1)
2188 {
2189 while (!re_string_eoi (regexp)
2190 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2191 {
2192 bin_tree_t *mbc_remain;
2193 fetch_token (token, regexp, syntax);
2194 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2195 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2196 if (BE (mbc_remain == NULL || tree == NULL, 0))
2197 {
2198 *err = REG_ESPACE;
2199 return NULL;
2200 }
2201 }
2202 }
2203#endif
2204 break;
2205 case OP_OPEN_SUBEXP:
2206 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2207 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2208 return NULL;
2209 break;
2210 case OP_OPEN_BRACKET:
2211 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2212 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2213 return NULL;
2214 break;
2215 case OP_BACK_REF:
2216 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2217 {
2218 *err = REG_ESUBREG;
2219 return NULL;
2220 }
2221 dfa->used_bkref_map |= 1 << token->opr.idx;
2222 tree = create_token_tree (dfa, NULL, NULL, token);
2223 if (BE (tree == NULL, 0))
2224 {
2225 *err = REG_ESPACE;
2226 return NULL;
2227 }
2228 ++dfa->nbackref;
2229 dfa->has_mb_node = 1;
2230 break;
2231 case OP_OPEN_DUP_NUM:
2232 if (syntax & REG_CONTEXT_INVALID_DUP)
2233 {
2234 *err = REG_BADRPT;
2235 return NULL;
2236 }
2237 /* FALLTHROUGH */
2238 case OP_DUP_ASTERISK:
2239 case OP_DUP_PLUS:
2240 case OP_DUP_QUESTION:
2241 if (syntax & REG_CONTEXT_INVALID_OPS)
2242 {
2243 *err = REG_BADRPT;
2244 return NULL;
2245 }
2246 else if (syntax & REG_CONTEXT_INDEP_OPS)
2247 {
2248 fetch_token (token, regexp, syntax);
2249 return parse_expression (regexp, preg, token, syntax, nest, err);
2250 }
2251 /* else fall through */
2252 case OP_CLOSE_SUBEXP:
2253 if ((token->type == OP_CLOSE_SUBEXP) &&
2254 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2255 {
2256 *err = REG_ERPAREN;
2257 return NULL;
2258 }
2259 /* else fall through */
2260 case OP_CLOSE_DUP_NUM:
2261 /* We treat it as a normal character. */
2262
2263 /* Then we can these characters as normal characters. */
2264 token->type = CHARACTER;
2265 /* mb_partial and word_char bits should be initialized already
2266 by peek_token. */
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (BE (tree == NULL, 0))
2269 {
2270 *err = REG_ESPACE;
2271 return NULL;
2272 }
2273 break;
2274 case ANCHOR:
2275 if ((token->opr.ctx_type
2276 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2277 && dfa->word_ops_used == 0)
2278 init_word_char (dfa);
2279 if (token->opr.ctx_type == WORD_DELIM
2280 || token->opr.ctx_type == NOT_WORD_DELIM)
2281 {
2282 bin_tree_t *tree_first, *tree_last;
2283 if (token->opr.ctx_type == WORD_DELIM)
2284 {
2285 token->opr.ctx_type = WORD_FIRST;
2286 tree_first = create_token_tree (dfa, NULL, NULL, token);
2287 token->opr.ctx_type = WORD_LAST;
2288 }
2289 else
2290 {
2291 token->opr.ctx_type = INSIDE_WORD;
2292 tree_first = create_token_tree (dfa, NULL, NULL, token);
2293 token->opr.ctx_type = INSIDE_NOTWORD;
2294 }
2295 tree_last = create_token_tree (dfa, NULL, NULL, token);
2296 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2297 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2298 {
2299 *err = REG_ESPACE;
2300 return NULL;
2301 }
2302 }
2303 else
2304 {
2305 tree = create_token_tree (dfa, NULL, NULL, token);
2306 if (BE (tree == NULL, 0))
2307 {
2308 *err = REG_ESPACE;
2309 return NULL;
2310 }
2311 }
2312 /* We must return here, since ANCHORs can't be followed
2313 by repetition operators.
2314 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2315 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2316 fetch_token (token, regexp, syntax);
2317 return tree;
2318 case OP_PERIOD:
2319 tree = create_token_tree (dfa, NULL, NULL, token);
2320 if (BE (tree == NULL, 0))
2321 {
2322 *err = REG_ESPACE;
2323 return NULL;
2324 }
2325 if (dfa->mb_cur_max > 1)
2326 dfa->has_mb_node = 1;
2327 break;
2328 case OP_WORD:
2329 case OP_NOTWORD:
2330 tree = build_charclass_op (dfa, regexp->trans,
2331 (const unsigned char *) "alnum",
2332 (const unsigned char *) "_",
2333 token->type == OP_NOTWORD, err);
2334 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2335 return NULL;
2336 break;
2337 case OP_SPACE:
2338 case OP_NOTSPACE:
2339 tree = build_charclass_op (dfa, regexp->trans,
2340 (const unsigned char *) "space",
2341 (const unsigned char *) "",
2342 token->type == OP_NOTSPACE, err);
2343 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2344 return NULL;
2345 break;
2346 case OP_ALT:
2347 case END_OF_RE:
2348 return NULL;
2349 case BACK_SLASH:
2350 *err = REG_EESCAPE;
2351 return NULL;
2352 default:
2353 /* Must not happen? */
2354#ifdef DEBUG
2355 assert (0);
2356#endif
2357 return NULL;
2358 }
2359 fetch_token (token, regexp, syntax);
2360
2361 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2362 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2363 {
2364 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2365 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2366 return NULL;
2367 /* In BRE consecutive duplications are not allowed. */
2368 if ((syntax & REG_CONTEXT_INVALID_DUP)
2369 && (token->type == OP_DUP_ASTERISK
2370 || token->type == OP_OPEN_DUP_NUM))
2371 {
2372 *err = REG_BADRPT;
2373 return NULL;
2374 }
2375 }
2376
2377 return tree;
2378}
2379
2380/* This function build the following tree, from regular expression
2381 (<reg_exp>):
2382 SUBEXP
2383 |
2384 <reg_exp>
2385*/
2386
2387static bin_tree_t *
2388parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2389 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2390{
2391 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2392 bin_tree_t *tree;
2393 size_t cur_nsub;
2394 cur_nsub = preg->re_nsub++;
2395
2396 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2397
2398 /* The subexpression may be a null string. */
2399 if (token->type == OP_CLOSE_SUBEXP)
2400 tree = NULL;
2401 else
2402 {
2403 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2404 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2405 *err = REG_EPAREN;
2406 if (BE (*err != REG_NOERROR, 0))
2407 return NULL;
2408 }
2409
2410 if (cur_nsub <= '9' - '1')
2411 dfa->completed_bkref_map |= 1 << cur_nsub;
2412
2413 tree = create_tree (dfa, tree, NULL, SUBEXP);
2414 if (BE (tree == NULL, 0))
2415 {
2416 *err = REG_ESPACE;
2417 return NULL;
2418 }
2419 tree->token.opr.idx = cur_nsub;
2420 return tree;
2421}
2422
2423/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2424
2425static bin_tree_t *
2426parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2427 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2428{
2429 bin_tree_t *tree = NULL, *old_tree = NULL;
2430 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2431 re_token_t start_token = *token;
2432
2433 if (token->type == OP_OPEN_DUP_NUM)
2434 {
2435 end = 0;
2436 start = fetch_number (regexp, token, syntax);
2437 if (start == REG_MISSING)
2438 {
2439 if (token->type == CHARACTER && token->opr.c == ',')
2440 start = 0; /* We treat "{,m}" as "{0,m}". */
2441 else
2442 {
2443 *err = REG_BADBR; /* <re>{} is invalid. */
2444 return NULL;
2445 }
2446 }
2447 if (BE (start != REG_ERROR, 1))
2448 {
2449 /* We treat "{n}" as "{n,n}". */
2450 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2451 : ((token->type == CHARACTER && token->opr.c == ',')
2452 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2453 }
2454 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2455 {
2456 /* Invalid sequence. */
2457 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2458 {
2459 if (token->type == END_OF_RE)
2460 *err = REG_EBRACE;
2461 else
2462 *err = REG_BADBR;
2463
2464 return NULL;
2465 }
2466
2467 /* If the syntax bit is set, rollback. */
2468 re_string_set_index (regexp, start_idx);
2469 *token = start_token;
2470 token->type = CHARACTER;
2471 /* mb_partial and word_char bits should be already initialized by
2472 peek_token. */
2473 return elem;
2474 }
2475
2476 if (BE (end != REG_MISSING && start > end, 0))
2477 {
2478 /* First number greater than second. */
2479 *err = REG_BADBR;
2480 return NULL;
2481 }
2482 }
2483 else
2484 {
2485 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2486 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2487 }
2488
2489 fetch_token (token, regexp, syntax);
2490
2491 if (BE (elem == NULL, 0))
2492 return NULL;
2493 if (BE (start == 0 && end == 0, 0))
2494 {
2495 postorder (elem, free_tree, NULL);
2496 return NULL;
2497 }
2498
2499 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2500 if (BE (start > 0, 0))
2501 {
2502 tree = elem;
2503 for (i = 2; i <= start; ++i)
2504 {
2505 elem = duplicate_tree (elem, dfa);
2506 tree = create_tree (dfa, tree, elem, CONCAT);
2507 if (BE (elem == NULL || tree == NULL, 0))
2508 goto parse_dup_op_espace;
2509 }
2510
2511 if (start == end)
2512 return tree;
2513
2514 /* Duplicate ELEM before it is marked optional. */
2515 elem = duplicate_tree (elem, dfa);
2516 old_tree = tree;
2517 }
2518 else
2519 old_tree = NULL;
2520
2521 if (elem->token.type == SUBEXP)
2522 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2523
2524 tree = create_tree (dfa, elem, NULL,
2525 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2526 if (BE (tree == NULL, 0))
2527 goto parse_dup_op_espace;
2528
2529 /* This loop is actually executed only when end != REG_MISSING,
2530 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2531 already created the start+1-th copy. */
2532 if ((Idx) -1 < 0 || end != REG_MISSING)
2533 for (i = start + 2; i <= end; ++i)
2534 {
2535 elem = duplicate_tree (elem, dfa);
2536 tree = create_tree (dfa, tree, elem, CONCAT);
2537 if (BE (elem == NULL || tree == NULL, 0))
2538 goto parse_dup_op_espace;
2539
2540 tree = create_tree (dfa, tree, NULL, OP_ALT);
2541 if (BE (tree == NULL, 0))
2542 goto parse_dup_op_espace;
2543 }
2544
2545 if (old_tree)
2546 tree = create_tree (dfa, old_tree, tree, CONCAT);
2547
2548 return tree;
2549
2550 parse_dup_op_espace:
2551 *err = REG_ESPACE;
2552 return NULL;
2553}
2554
/* Size, in bytes, of the buffers that hold the name of a collating
   symbol, equivalence class or character class while a bracket
   expression is parsed (see start_name_buf and end_name_buf in
   parse_bracket_exp).  The value is an estimate that is believed to be
   large enough; it is not derived from any limit.  */
#define BRACKET_NAME_BUF_SIZE 32
2558
2559#ifndef _LIBC
2560 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2561 Build the range expression which starts from START_ELEM, and ends
2562 at END_ELEM. The result are written to MBCSET and SBCSET.
2563 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2564 mbcset->range_ends, is a pointer argument sinse we may
2565 update it. */
2566
2567static reg_errcode_t
2568build_range_exp (bitset sbcset,
2569# ifdef RE_ENABLE_I18N
2570 re_charset_t *mbcset, Idx *range_alloc,
2571# endif
2572 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2573{
2574 unsigned int start_ch, end_ch;
2575 /* Equivalence Classes and Character Classes can't be a range start/end. */
2576 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2577 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2578 0))
2579 return REG_ERANGE;
2580
2581 /* We can handle no multi character collating elements without libc
2582 support. */
2583 if (BE ((start_elem->type == COLL_SYM
2584 && strlen ((char *) start_elem->opr.name) > 1)
2585 || (end_elem->type == COLL_SYM
2586 && strlen ((char *) end_elem->opr.name) > 1), 0))
2587 return REG_ECOLLATE;
2588
2589# ifdef RE_ENABLE_I18N
2590 {
2591 wchar_t wc;
2592 wint_t start_wc, end_wc;
2593 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2594
2595 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2596 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2597 : 0));
2598 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2599 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2600 : 0));
2601 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2602 ? __btowc (start_ch) : start_elem->opr.wch);
2603 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2604 ? __btowc (end_ch) : end_elem->opr.wch);
2605 if (start_wc == WEOF || end_wc == WEOF)
2606 return REG_ECOLLATE;
2607 cmp_buf[0] = start_wc;
2608 cmp_buf[4] = end_wc;
2609 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2610 return REG_ERANGE;
2611
2612 /* Got valid collation sequence values, add them as a new entry.
2613 However, for !_LIBC we have no collation elements: if the
2614 character set is single byte, the single byte character set
2615 that we build below suffices. parse_bracket_exp passes
2616 no MBCSET if dfa->mb_cur_max == 1. */
2617 if (mbcset)
2618 {
2619 /* Check the space of the arrays. */
2620 if (BE (*range_alloc == mbcset->nranges, 0))
2621 {
2622 /* There is not enough space, need realloc. */
2623 wchar_t *new_array_start, *new_array_end;
2624 Idx new_nranges;
2625
2626 new_nranges = mbcset->nranges;
2627 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2628 are NULL if *range_alloc == 0. */
2629 new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2630 &new_nranges);
2631 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2632 new_nranges);
2633
2634 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2635 return REG_ESPACE;
2636
2637 mbcset->range_starts = new_array_start;
2638 mbcset->range_ends = new_array_end;
2639 *range_alloc = new_nranges;
2640 }
2641
2642 mbcset->range_starts[mbcset->nranges] = start_wc;
2643 mbcset->range_ends[mbcset->nranges++] = end_wc;
2644 }
2645
2646 /* Build the table for single byte characters. */
2647 for (wc = 0; wc < SBC_MAX; ++wc)
2648 {
2649 cmp_buf[2] = wc;
2650 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2651 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2652 bitset_set (sbcset, wc);
2653 }
2654 }
2655# else /* not RE_ENABLE_I18N */
2656 {
2657 unsigned int ch;
2658 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2659 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2660 : 0));
2661 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2662 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2663 : 0));
2664 if (start_ch > end_ch)
2665 return REG_ERANGE;
2666 /* Build the table for single byte characters. */
2667 for (ch = 0; ch < SBC_MAX; ++ch)
2668 if (start_ch <= ch && ch <= end_ch)
2669 bitset_set (sbcset, ch);
2670 }
2671# endif /* not RE_ENABLE_I18N */
2672 return REG_NOERROR;
2673}
2674#endif /* not _LIBC */
2675
2676#ifndef _LIBC
2677/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2678 Build the collating element which is represented by NAME.
2679 The result are written to MBCSET and SBCSET.
2680 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2681 pointer argument since we may update it. */
2682
2683static reg_errcode_t
2684build_collating_symbol (bitset sbcset,
2685# ifdef RE_ENABLE_I18N
2686 re_charset_t *mbcset, Idx *coll_sym_alloc,
2687# endif
2688 const unsigned char *name)
2689{
2690 size_t name_len = strlen ((const char *) name);
2691 if (BE (name_len != 1, 0))
2692 return REG_ECOLLATE;
2693 else
2694 {
2695 bitset_set (sbcset, name[0]);
2696 return REG_NOERROR;
2697 }
2698}
2699#endif /* not _LIBC */
2700
2701/* This function parse bracket expression like "[abc]", "[a-c]",
2702 "[[.a-a.]]" etc. */
2703
2704static bin_tree_t *
2705parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2706 reg_syntax_t syntax, reg_errcode_t *err)
2707{
2708#ifdef _LIBC
2709 const unsigned char *collseqmb;
2710 const char *collseqwc;
2711 uint32_t nrules;
2712 int32_t table_size;
2713 const int32_t *symb_table;
2714 const unsigned char *extra;
2715
2716 /* Local function for parse_bracket_exp used in _LIBC environement.
2717 Seek the collating symbol entry correspondings to NAME.
2718 Return the index of the symbol in the SYMB_TABLE. */
2719
2720 auto inline int32_t
2721 __attribute ((always_inline))
2722 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2723 {
2724 int32_t hash = elem_hash ((const char *) name, name_len);
2725 int32_t elem = hash % table_size;
2726 int32_t second = hash % (table_size - 2);
2727 while (symb_table[2 * elem] != 0)
2728 {
2729 /* First compare the hashing value. */
2730 if (symb_table[2 * elem] == hash
2731 /* Compare the length of the name. */
2732 && name_len == extra[symb_table[2 * elem + 1]]
2733 /* Compare the name. */
2734 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2735 name_len) == 0)
2736 {
2737 /* Yep, this is the entry. */
2738 break;
2739 }
2740
2741 /* Next entry. */
2742 elem += second;
2743 }
2744 return elem;
2745 }
2746
2747 /* Local function for parse_bracket_exp used in _LIBC environement.
2748 Look up the collation sequence value of BR_ELEM.
2749 Return the value if succeeded, UINT_MAX otherwise. */
2750
2751 auto inline unsigned int
2752 __attribute ((always_inline))
2753 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2754 {
2755 if (br_elem->type == SB_CHAR)
2756 {
2757 /*
2758 if (MB_CUR_MAX == 1)
2759 */
2760 if (nrules == 0)
2761 return collseqmb[br_elem->opr.ch];
2762 else
2763 {
2764 wint_t wc = __btowc (br_elem->opr.ch);
2765 return __collseq_table_lookup (collseqwc, wc);
2766 }
2767 }
2768 else if (br_elem->type == MB_CHAR)
2769 {
2770 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2771 }
2772 else if (br_elem->type == COLL_SYM)
2773 {
2774 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2775 if (nrules != 0)
2776 {
2777 int32_t elem, idx;
2778 elem = seek_collating_symbol_entry (br_elem->opr.name,
2779 sym_name_len);
2780 if (symb_table[2 * elem] != 0)
2781 {
2782 /* We found the entry. */
2783 idx = symb_table[2 * elem + 1];
2784 /* Skip the name of collating element name. */
2785 idx += 1 + extra[idx];
2786 /* Skip the byte sequence of the collating element. */
2787 idx += 1 + extra[idx];
2788 /* Adjust for the alignment. */
2789 idx = (idx + 3) & ~3;
2790 /* Skip the multibyte collation sequence value. */
2791 idx += sizeof (unsigned int);
2792 /* Skip the wide char sequence of the collating element. */
2793 idx += sizeof (unsigned int) *
2794 (1 + *(unsigned int *) (extra + idx));
2795 /* Return the collation sequence value. */
2796 return *(unsigned int *) (extra + idx);
2797 }
2798 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2799 {
2800 /* No valid character. Match it as a single byte
2801 character. */
2802 return collseqmb[br_elem->opr.name[0]];
2803 }
2804 }
2805 else if (sym_name_len == 1)
2806 return collseqmb[br_elem->opr.name[0]];
2807 }
2808 return UINT_MAX;
2809 }
2810
2811 /* Local function for parse_bracket_exp used in _LIBC environement.
2812 Build the range expression which starts from START_ELEM, and ends
2813 at END_ELEM. The result are written to MBCSET and SBCSET.
2814 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2815 mbcset->range_ends, is a pointer argument sinse we may
2816 update it. */
2817
2818 auto inline reg_errcode_t
2819 __attribute ((always_inline))
2820 build_range_exp (bitset sbcset, re_charset_t *mbcset,
2821 Idx *range_alloc,
2822 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2823 {
2824 unsigned int ch;
2825 uint32_t start_collseq;
2826 uint32_t end_collseq;
2827
2828 /* Equivalence Classes and Character Classes can't be a range
2829 start/end. */
2830 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2831 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2832 0))
2833 return REG_ERANGE;
2834
2835 start_collseq = lookup_collation_sequence_value (start_elem);
2836 end_collseq = lookup_collation_sequence_value (end_elem);
2837 /* Check start/end collation sequence values. */
2838 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2839 return REG_ECOLLATE;
2840 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2841 return REG_ERANGE;
2842
2843 /* Got valid collation sequence values, add them as a new entry.
2844 However, if we have no collation elements, and the character set
2845 is single byte, the single byte character set that we
2846 build below suffices. */
2847 if (nrules > 0 || dfa->mb_cur_max > 1)
2848 {
2849 /* Check the space of the arrays. */
2850 if (BE (*range_alloc == mbcset->nranges, 0))
2851 {
2852 /* There is not enough space, need realloc. */
2853 uint32_t *new_array_start;
2854 uint32_t *new_array_end;
2855 Idx new_nranges;
2856
2857 new_nranges = mbcset->nranges;
2858 new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2859 &new_nranges);
2860 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2861 new_nranges);
2862
2863 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2864 return REG_ESPACE;
2865
2866 mbcset->range_starts = new_array_start;
2867 mbcset->range_ends = new_array_end;
2868 *range_alloc = new_nranges;
2869 }
2870
2871 mbcset->range_starts[mbcset->nranges] = start_collseq;
2872 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2873 }
2874
2875 /* Build the table for single byte characters. */
2876 for (ch = 0; ch < SBC_MAX; ch++)
2877 {
2878 uint32_t ch_collseq;
2879 /*
2880 if (MB_CUR_MAX == 1)
2881 */
2882 if (nrules == 0)
2883 ch_collseq = collseqmb[ch];
2884 else
2885 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2886 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2887 bitset_set (sbcset, ch);
2888 }
2889 return REG_NOERROR;
2890 }
2891
2892 /* Local function for parse_bracket_exp used in _LIBC environement.
2893 Build the collating element which is represented by NAME.
2894 The result are written to MBCSET and SBCSET.
2895 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2896 pointer argument sinse we may update it. */
2897
2898 auto inline reg_errcode_t
2899 __attribute ((always_inline))
2900 build_collating_symbol (bitset sbcset, re_charset_t *mbcset,
2901 Idx *coll_sym_alloc, const unsigned char *name)
2902 {
2903 int32_t elem, idx;
2904 size_t name_len = strlen ((const char *) name);
2905 if (nrules != 0)
2906 {
2907 elem = seek_collating_symbol_entry (name, name_len);
2908 if (symb_table[2 * elem] != 0)
2909 {
2910 /* We found the entry. */
2911 idx = symb_table[2 * elem + 1];
2912 /* Skip the name of collating element name. */
2913 idx += 1 + extra[idx];
2914 }
2915 else if (symb_table[2 * elem] == 0 && name_len == 1)
2916 {
2917 /* No valid character, treat it as a normal
2918 character. */
2919 bitset_set (sbcset, name[0]);
2920 return REG_NOERROR;
2921 }
2922 else
2923 return REG_ECOLLATE;
2924
2925 /* Got valid collation sequence, add it as a new entry. */
2926 /* Check the space of the arrays. */
2927 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2928 {
2929 /* Not enough, realloc it. */
2930 Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2931 /* Use realloc since mbcset->coll_syms is NULL
2932 if *alloc == 0. */
2933 int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2934 &new_coll_sym_alloc);
2935 if (BE (new_coll_syms == NULL, 0))
2936 return REG_ESPACE;
2937 mbcset->coll_syms = new_coll_syms;
2938 *coll_sym_alloc = new_coll_sym_alloc;
2939 }
2940 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2941 return REG_NOERROR;
2942 }
2943 else
2944 {
2945 if (BE (name_len != 1, 0))
2946 return REG_ECOLLATE;
2947 else
2948 {
2949 bitset_set (sbcset, name[0]);
2950 return REG_NOERROR;
2951 }
2952 }
2953 }
2954#endif
2955
2956 re_token_t br_token;
2957 re_bitset_ptr_t sbcset;
2958#ifdef RE_ENABLE_I18N
2959 re_charset_t *mbcset;
2960 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2961 Idx equiv_class_alloc = 0, char_class_alloc = 0;
2962#endif /* not RE_ENABLE_I18N */
2963 bool non_match = false;
2964 bin_tree_t *work_tree;
2965 int token_len;
2966 bool first_round = true;
2967#ifdef _LIBC
2968 collseqmb = (const unsigned char *)
2969 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2970 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2971 if (nrules)
2972 {
2973 /*
2974 if (MB_CUR_MAX > 1)
2975 */
2976 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2977 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2978 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2979 _NL_COLLATE_SYMB_TABLEMB);
2980 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2981 _NL_COLLATE_SYMB_EXTRAMB);
2982 }
2983#endif
2984 sbcset = re_calloc (bitset_word, BITSET_WORDS);
2985#ifdef RE_ENABLE_I18N
2986 mbcset = re_calloc (re_charset_t, 1);
2987#endif /* RE_ENABLE_I18N */
2988#ifdef RE_ENABLE_I18N
2989 if (BE (sbcset == NULL || mbcset == NULL, 0))
2990#else
2991 if (BE (sbcset == NULL, 0))
2992#endif /* RE_ENABLE_I18N */
2993 {
2994 *err = REG_ESPACE;
2995 return NULL;
2996 }
2997
2998 token_len = peek_token_bracket (token, regexp, syntax);
2999 if (BE (token->type == END_OF_RE, 0))
3000 {
3001 *err = REG_BADPAT;
3002 goto parse_bracket_exp_free_return;
3003 }
3004 if (token->type == OP_NON_MATCH_LIST)
3005 {
3006#ifdef RE_ENABLE_I18N
3007 mbcset->non_match = 1;
3008#endif /* not RE_ENABLE_I18N */
3009 non_match = true;
3010 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3011 bitset_set (sbcset, '\0');
3012 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3013 token_len = peek_token_bracket (token, regexp, syntax);
3014 if (BE (token->type == END_OF_RE, 0))
3015 {
3016 *err = REG_BADPAT;
3017 goto parse_bracket_exp_free_return;
3018 }
3019 }
3020
3021 /* We treat the first ']' as a normal character. */
3022 if (token->type == OP_CLOSE_BRACKET)
3023 token->type = CHARACTER;
3024
3025 while (1)
3026 {
3027 bracket_elem_t start_elem, end_elem;
3028 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3029 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3030 reg_errcode_t ret;
3031 int token_len2 = 0;
3032 bool is_range_exp = false;
3033 re_token_t token2;
3034
3035 start_elem.opr.name = start_name_buf;
3036 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3037 syntax, first_round);
3038 if (BE (ret != REG_NOERROR, 0))
3039 {
3040 *err = ret;
3041 goto parse_bracket_exp_free_return;
3042 }
3043 first_round = false;
3044
3045 /* Get information about the next token. We need it in any case. */
3046 token_len = peek_token_bracket (token, regexp, syntax);
3047
3048 /* Do not check for ranges if we know they are not allowed. */
3049 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3050 {
3051 if (BE (token->type == END_OF_RE, 0))
3052 {
3053 *err = REG_EBRACK;
3054 goto parse_bracket_exp_free_return;
3055 }
3056 if (token->type == OP_CHARSET_RANGE)
3057 {
3058 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3059 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3060 if (BE (token2.type == END_OF_RE, 0))
3061 {
3062 *err = REG_EBRACK;
3063 goto parse_bracket_exp_free_return;
3064 }
3065 if (token2.type == OP_CLOSE_BRACKET)
3066 {
3067 /* We treat the last '-' as a normal character. */
3068 re_string_skip_bytes (regexp, -token_len);
3069 token->type = CHARACTER;
3070 }
3071 else
3072 is_range_exp = true;
3073 }
3074 }
3075
3076 if (is_range_exp == true)
3077 {
3078 end_elem.opr.name = end_name_buf;
3079 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3080 dfa, syntax, true);
3081 if (BE (ret != REG_NOERROR, 0))
3082 {
3083 *err = ret;
3084 goto parse_bracket_exp_free_return;
3085 }
3086
3087 token_len = peek_token_bracket (token, regexp, syntax);
3088
3089#ifdef _LIBC
3090 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3091 &start_elem, &end_elem);
3092#else
3093# ifdef RE_ENABLE_I18N
3094 *err = build_range_exp (sbcset,
3095 dfa->mb_cur_max > 1 ? mbcset : NULL,
3096 &range_alloc, &start_elem, &end_elem);
3097# else
3098 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3099# endif
3100#endif /* RE_ENABLE_I18N */
3101 if (BE (*err != REG_NOERROR, 0))
3102 goto parse_bracket_exp_free_return;
3103 }
3104 else
3105 {
3106 switch (start_elem.type)
3107 {
3108 case SB_CHAR:
3109 bitset_set (sbcset, start_elem.opr.ch);
3110 break;
3111#ifdef RE_ENABLE_I18N
3112 case MB_CHAR:
3113 /* Check whether the array has enough space. */
3114 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3115 {
3116 wchar_t *new_mbchars;
3117 /* Not enough, realloc it. */
3118 mbchar_alloc = mbcset->nmbchars;
3119 /* Use realloc since array is NULL if *alloc == 0. */
3120 new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3121 &mbchar_alloc);
3122 if (BE (new_mbchars == NULL, 0))
3123 goto parse_bracket_exp_espace;
3124 mbcset->mbchars = new_mbchars;
3125 }
3126 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3127 break;
3128#endif /* RE_ENABLE_I18N */
3129 case EQUIV_CLASS:
3130 *err = build_equiv_class (sbcset,
3131#ifdef RE_ENABLE_I18N
3132 mbcset, &equiv_class_alloc,
3133#endif /* RE_ENABLE_I18N */
3134 start_elem.opr.name);
3135 if (BE (*err != REG_NOERROR, 0))
3136 goto parse_bracket_exp_free_return;
3137 break;
3138 case COLL_SYM:
3139 *err = build_collating_symbol (sbcset,
3140#ifdef RE_ENABLE_I18N
3141 mbcset, &coll_sym_alloc,
3142#endif /* RE_ENABLE_I18N */
3143 start_elem.opr.name);
3144 if (BE (*err != REG_NOERROR, 0))
3145 goto parse_bracket_exp_free_return;
3146 break;
3147 case CHAR_CLASS:
3148 *err = build_charclass (regexp->trans, sbcset,
3149#ifdef RE_ENABLE_I18N
3150 mbcset, &char_class_alloc,
3151#endif /* RE_ENABLE_I18N */
3152 start_elem.opr.name, syntax);
3153 if (BE (*err != REG_NOERROR, 0))
3154 goto parse_bracket_exp_free_return;
3155 break;
3156 default:
3157 assert (0);
3158 break;
3159 }
3160 }
3161 if (BE (token->type == END_OF_RE, 0))
3162 {
3163 *err = REG_EBRACK;
3164 goto parse_bracket_exp_free_return;
3165 }
3166 if (token->type == OP_CLOSE_BRACKET)
3167 break;
3168 }
3169
3170 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3171
3172 /* If it is non-matching list. */
3173 if (non_match)
3174 bitset_not (sbcset);
3175
3176#ifdef RE_ENABLE_I18N
3177 /* Ensure only single byte characters are set. */
3178 if (dfa->mb_cur_max > 1)
3179 bitset_mask (sbcset, dfa->sb_char);
3180
3181 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3182 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3183 || mbcset->non_match)))
3184 {
3185 bin_tree_t *mbc_tree;
3186 int sbc_idx;
3187 /* Build a tree for complex bracket. */
3188 dfa->has_mb_node = 1;
3189 br_token.type = COMPLEX_BRACKET;
3190 br_token.opr.mbcset = mbcset;
3191 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3192 if (BE (mbc_tree == NULL, 0))
3193 goto parse_bracket_exp_espace;
3194 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3195 if (sbcset[sbc_idx])
3196 break;
3197 /* If there are no bits set in sbcset, there is no point
3198 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3199 if (sbc_idx < BITSET_WORDS)
3200 {
3201 /* Build a tree for simple bracket. */
3202 br_token.type = SIMPLE_BRACKET;
3203 br_token.opr.sbcset = sbcset;
3204 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3205 if (BE (work_tree == NULL, 0))
3206 goto parse_bracket_exp_espace;
3207
3208 /* Then join them by ALT node. */
3209 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3210 if (BE (work_tree == NULL, 0))
3211 goto parse_bracket_exp_espace;
3212 }
3213 else
3214 {
3215 re_free (sbcset);
3216 work_tree = mbc_tree;
3217 }
3218 }
3219 else
3220#endif /* not RE_ENABLE_I18N */
3221 {
3222#ifdef RE_ENABLE_I18N
3223 free_charset (mbcset);
3224#endif
3225 /* Build a tree for simple bracket. */
3226 br_token.type = SIMPLE_BRACKET;
3227 br_token.opr.sbcset = sbcset;
3228 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3229 if (BE (work_tree == NULL, 0))
3230 goto parse_bracket_exp_espace;
3231 }
3232 return work_tree;
3233
3234 parse_bracket_exp_espace:
3235 *err = REG_ESPACE;
3236 parse_bracket_exp_free_return:
3237 re_free (sbcset);
3238#ifdef RE_ENABLE_I18N
3239 free_charset (mbcset);
3240#endif /* RE_ENABLE_I18N */
3241 return NULL;
3242}
3243
3244/* Parse an element in the bracket expression. */
3245
3246static reg_errcode_t
3247parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3248 re_token_t *token, int token_len, re_dfa_t *dfa,
3249 reg_syntax_t syntax, bool accept_hyphen)
3250{
3251#ifdef RE_ENABLE_I18N
3252 int cur_char_size;
3253 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3254 if (cur_char_size > 1)
3255 {
3256 elem->type = MB_CHAR;
3257 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3258 re_string_skip_bytes (regexp, cur_char_size);
3259 return REG_NOERROR;
3260 }
3261#endif /* RE_ENABLE_I18N */
3262 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3263 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3264 || token->type == OP_OPEN_EQUIV_CLASS)
3265 return parse_bracket_symbol (elem, regexp, token);
3266 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3267 {
3268 /* A '-' must only appear as anything but a range indicator before
3269 the closing bracket. Everything else is an error. */
3270 re_token_t token2;
3271 (void) peek_token_bracket (&token2, regexp, syntax);
3272 if (token2.type != OP_CLOSE_BRACKET)
3273 /* The actual error value is not standardized since this whole
3274 case is undefined. But ERANGE makes good sense. */
3275 return REG_ERANGE;
3276 }
3277 elem->type = SB_CHAR;
3278 elem->opr.ch = token->opr.c;
3279 return REG_NOERROR;
3280}
3281
3282/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3283 such as [:<character_class>:], [.<collating_element>.], and
3284 [=<equivalent_class>=]. */
3285
3286static reg_errcode_t
3287parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3288 re_token_t *token)
3289{
3290 unsigned char ch, delim = token->opr.c;
3291 int i = 0;
3292 if (re_string_eoi(regexp))
3293 return REG_EBRACK;
3294 for (;; ++i)
3295 {
3296 if (i >= BRACKET_NAME_BUF_SIZE)
3297 return REG_EBRACK;
3298 if (token->type == OP_OPEN_CHAR_CLASS)
3299 ch = re_string_fetch_byte_case (regexp);
3300 else
3301 ch = re_string_fetch_byte (regexp);
3302 if (re_string_eoi(regexp))
3303 return REG_EBRACK;
3304 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3305 break;
3306 elem->opr.name[i] = ch;
3307 }
3308 re_string_skip_bytes (regexp, 1);
3309 elem->opr.name[i] = '\0';
3310 switch (token->type)
3311 {
3312 case OP_OPEN_COLL_ELEM:
3313 elem->type = COLL_SYM;
3314 break;
3315 case OP_OPEN_EQUIV_CLASS:
3316 elem->type = EQUIV_CLASS;
3317 break;
3318 case OP_OPEN_CHAR_CLASS:
3319 elem->type = CHAR_CLASS;
3320 break;
3321 default:
3322 break;
3323 }
3324 return REG_NOERROR;
3325}
3326
3327 /* Helper function for parse_bracket_exp.
3328 Build the equivalence class which is represented by NAME.
3329 The result are written to MBCSET and SBCSET.
3330 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3331 is a pointer argument sinse we may update it. */
3332
3333static reg_errcode_t
3334build_equiv_class (bitset sbcset,
3335#ifdef RE_ENABLE_I18N
3336 re_charset_t *mbcset, Idx *equiv_class_alloc,
3337#endif
3338 const unsigned char *name)
3339{
3340#if defined _LIBC
3341 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3342 if (nrules != 0)
3343 {
3344 const int32_t *table, *indirect;
3345 const unsigned char *weights, *extra, *cp;
3346 unsigned char char_buf[2];
3347 int32_t idx1, idx2;
3348 unsigned int ch;
3349 size_t len;
3350 /* This #include defines a local function! */
3351# include <locale/weight.h>
3352 /* Calculate the index for equivalence class. */
3353 cp = name;
3354 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3355 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3356 _NL_COLLATE_WEIGHTMB);
3357 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3358 _NL_COLLATE_EXTRAMB);
3359 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3360 _NL_COLLATE_INDIRECTMB);
3361 idx1 = findidx (&cp);
3362 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3363 /* This isn't a valid character. */
3364 return REG_ECOLLATE;
3365
3366 /* Build single byte matcing table for this equivalence class. */
3367 char_buf[1] = (unsigned char) '\0';
3368 len = weights[idx1];
3369 for (ch = 0; ch < SBC_MAX; ++ch)
3370 {
3371 char_buf[0] = ch;
3372 cp = char_buf;
3373 idx2 = findidx (&cp);
3374/*
3375 idx2 = table[ch];
3376*/
3377 if (idx2 == 0)
3378 /* This isn't a valid character. */
3379 continue;
3380 if (len == weights[idx2])
3381 {
3382 int cnt = 0;
3383 while (cnt <= len &&
3384 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3385 ++cnt;
3386
3387 if (cnt > len)
3388 bitset_set (sbcset, ch);
3389 }
3390 }
3391 /* Check whether the array has enough space. */
3392 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3393 {
3394 /* Not enough, realloc it. */
3395 Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3396 /* Use realloc since the array is NULL if *alloc == 0. */
3397 int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3398 int32_t,
3399 &new_equiv_class_alloc);
3400 if (BE (new_equiv_classes == NULL, 0))
3401 return REG_ESPACE;
3402 mbcset->equiv_classes = new_equiv_classes;
3403 *equiv_class_alloc = new_equiv_class_alloc;
3404 }
3405 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3406 }
3407 else
3408#endif /* _LIBC */
3409 {
3410 if (BE (strlen ((const char *) name) != 1, 0))
3411 return REG_ECOLLATE;
3412 bitset_set (sbcset, *name);
3413 }
3414 return REG_NOERROR;
3415}
3416
3417 /* Helper function for parse_bracket_exp.
3418 Build the character class which is represented by NAME.
3419 The result are written to MBCSET and SBCSET.
3420 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3421 is a pointer argument sinse we may update it. */
3422
3423static reg_errcode_t
3424build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset,
3425#ifdef RE_ENABLE_I18N
3426 re_charset_t *mbcset, Idx *char_class_alloc,
3427#endif
3428 const unsigned char *class_name, reg_syntax_t syntax)
3429{
3430 int i;
3431 const char *name = (const char *) class_name;
3432
3433 /* In case of REG_ICASE "upper" and "lower" match the both of
3434 upper and lower cases. */
3435 if ((syntax & REG_IGNORE_CASE)
3436 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3437 name = "alpha";
3438
3439#ifdef RE_ENABLE_I18N
3440 /* Check the space of the arrays. */
3441 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3442 {
3443 /* Not enough, realloc it. */
3444 Idx new_char_class_alloc = mbcset->nchar_classes;
3445 /* Use realloc since array is NULL if *alloc == 0. */
3446 wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3447 &new_char_class_alloc);
3448 if (BE (new_char_classes == NULL, 0))
3449 return REG_ESPACE;
3450 mbcset->char_classes = new_char_classes;
3451 *char_class_alloc = new_char_class_alloc;
3452 }
3453 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3454#endif /* RE_ENABLE_I18N */
3455
3456#define BUILD_CHARCLASS_LOOP(ctype_func) \
3457 for (i = 0; i < SBC_MAX; ++i) \
3458 { \
3459 if (ctype_func (i)) \
3460 { \
3461 int ch = trans ? trans[i] : i; \
3462 bitset_set (sbcset, ch); \
3463 } \
3464 }
3465
3466 if (strcmp (name, "alnum") == 0)
3467 BUILD_CHARCLASS_LOOP (isalnum)
3468 else if (strcmp (name, "cntrl") == 0)
3469 BUILD_CHARCLASS_LOOP (iscntrl)
3470 else if (strcmp (name, "lower") == 0)
3471 BUILD_CHARCLASS_LOOP (islower)
3472 else if (strcmp (name, "space") == 0)
3473 BUILD_CHARCLASS_LOOP (isspace)
3474 else if (strcmp (name, "alpha") == 0)
3475 BUILD_CHARCLASS_LOOP (isalpha)
3476 else if (strcmp (name, "digit") == 0)
3477 BUILD_CHARCLASS_LOOP (isdigit)
3478 else if (strcmp (name, "print") == 0)
3479 BUILD_CHARCLASS_LOOP (isprint)
3480 else if (strcmp (name, "upper") == 0)
3481 BUILD_CHARCLASS_LOOP (isupper)
3482 else if (strcmp (name, "blank") == 0)
3483 BUILD_CHARCLASS_LOOP (isblank)
3484 else if (strcmp (name, "graph") == 0)
3485 BUILD_CHARCLASS_LOOP (isgraph)
3486 else if (strcmp (name, "punct") == 0)
3487 BUILD_CHARCLASS_LOOP (ispunct)
3488 else if (strcmp (name, "xdigit") == 0)
3489 BUILD_CHARCLASS_LOOP (isxdigit)
3490 else
3491 return REG_ECTYPE;
3492
3493 return REG_NOERROR;
3494}
3495
3496static bin_tree_t *
3497build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3498 const unsigned char *class_name,
3499 const unsigned char *extra,
3500 bool non_match, reg_errcode_t *err)
3501{
3502 re_bitset_ptr_t sbcset;
3503#ifdef RE_ENABLE_I18N
3504 re_charset_t *mbcset;
3505 Idx alloc = 0;
3506#endif /* not RE_ENABLE_I18N */
3507 reg_errcode_t ret;
3508 re_token_t br_token;
3509 bin_tree_t *tree;
3510
3511 sbcset = re_calloc (bitset_word, BITSET_WORDS);
3512#ifdef RE_ENABLE_I18N
3513 mbcset = re_calloc (re_charset_t, 1);
3514#endif /* RE_ENABLE_I18N */
3515
3516#ifdef RE_ENABLE_I18N
3517 if (BE (sbcset == NULL || mbcset == NULL, 0))
3518#else /* not RE_ENABLE_I18N */
3519 if (BE (sbcset == NULL, 0))
3520#endif /* not RE_ENABLE_I18N */
3521 {
3522 *err = REG_ESPACE;
3523 return NULL;
3524 }
3525
3526 if (non_match)
3527 {
3528#ifdef RE_ENABLE_I18N
3529 /*
3530 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3531 bitset_set(cset->sbcset, '\0');
3532 */
3533 mbcset->non_match = 1;
3534#endif /* not RE_ENABLE_I18N */
3535 }
3536
3537 /* We don't care the syntax in this case. */
3538 ret = build_charclass (trans, sbcset,
3539#ifdef RE_ENABLE_I18N
3540 mbcset, &alloc,
3541#endif /* RE_ENABLE_I18N */
3542 class_name, 0);
3543
3544 if (BE (ret != REG_NOERROR, 0))
3545 {
3546 re_free (sbcset);
3547#ifdef RE_ENABLE_I18N
3548 free_charset (mbcset);
3549#endif /* RE_ENABLE_I18N */
3550 *err = ret;
3551 return NULL;
3552 }
3553 /* \w match '_' also. */
3554 for (; *extra; extra++)
3555 bitset_set (sbcset, *extra);
3556
3557 /* If it is non-matching list. */
3558 if (non_match)
3559 bitset_not (sbcset);
3560
3561#ifdef RE_ENABLE_I18N
3562 /* Ensure only single byte characters are set. */
3563 if (dfa->mb_cur_max > 1)
3564 bitset_mask (sbcset, dfa->sb_char);
3565#endif
3566
3567 /* Build a tree for simple bracket. */
3568 br_token.type = SIMPLE_BRACKET;
3569 br_token.opr.sbcset = sbcset;
3570 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3571 if (BE (tree == NULL, 0))
3572 goto build_word_op_espace;
3573
3574#ifdef RE_ENABLE_I18N
3575 if (dfa->mb_cur_max > 1)
3576 {
3577 bin_tree_t *mbc_tree;
3578 /* Build a tree for complex bracket. */
3579 br_token.type = COMPLEX_BRACKET;
3580 br_token.opr.mbcset = mbcset;
3581 dfa->has_mb_node = 1;
3582 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3583 if (BE (mbc_tree == NULL, 0))
3584 goto build_word_op_espace;
3585 /* Then join them by ALT node. */
3586 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3587 if (BE (mbc_tree != NULL, 1))
3588 return tree;
3589 }
3590 else
3591 {
3592 free_charset (mbcset);
3593 return tree;
3594 }
3595#else /* not RE_ENABLE_I18N */
3596 return tree;
3597#endif /* not RE_ENABLE_I18N */
3598
3599 build_word_op_espace:
3600 re_free (sbcset);
3601#ifdef RE_ENABLE_I18N
3602 free_charset (mbcset);
3603#endif /* RE_ENABLE_I18N */
3604 *err = REG_ESPACE;
3605 return NULL;
3606}
3607
3608/* This is intended for the expressions like "a{1,3}".
3609 Fetch a number from `input', and return the number.
3610 Return REG_MISSING if the number field is empty like "{,1}".
3611 Return REG_ERROR if an error occurred. */
3612
3613static Idx
3614fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3615{
3616 Idx num = REG_MISSING;
3617 unsigned char c;
3618 while (1)
3619 {
3620 fetch_token (token, input, syntax);
3621 c = token->opr.c;
3622 if (BE (token->type == END_OF_RE, 0))
3623 return REG_ERROR;
3624 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3625 break;
3626 num = ((token->type != CHARACTER || c < '0' || '9' < c
3627 || num == REG_ERROR)
3628 ? REG_ERROR
3629 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3630 num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3631 }
3632 return num;
3633}
3634
#ifdef RE_ENABLE_I18N
/* Release the arrays owned by CSET, then CSET itself.  The collating
   symbol, equivalence class and range arrays are released only when
   compiled inside glibc.  CSET must not be null.  */

static void
free_charset (re_charset_t *cset)
{
# ifdef _LIBC
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->char_classes);
  re_free (cset->mbchars);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3650
3651/* Functions for binary tree operation. */
3652
3653/* Create a tree node. */
3654
3655static bin_tree_t *
3656create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3657 re_token_type_t type)
3658{
3659 re_token_t t;
3660 t.type = type;
3661 return create_token_tree (dfa, left, right, &t);
3662}
3663
3664static bin_tree_t *
3665create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3666 const re_token_t *token)
3667{
3668 bin_tree_t *tree;
3669 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3670 {
3671 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3672
3673 if (storage == NULL)
3674 return NULL;
3675 storage->next = dfa->str_tree_storage;
3676 dfa->str_tree_storage = storage;
3677 dfa->str_tree_storage_idx = 0;
3678 }
3679 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3680
3681 tree->parent = NULL;
3682 tree->left = left;
3683 tree->right = right;
3684 tree->token = *token;
3685 tree->token.duplicated = 0;
3686 tree->token.opt_subexp = 0;
3687 tree->first = NULL;
3688 tree->next = NULL;
3689 tree->node_idx = REG_MISSING;
3690
3691 if (left != NULL)
3692 left->parent = tree;
3693 if (right != NULL)
3694 right->parent = tree;
3695 return tree;
3696}
3697
3698/* Mark the tree SRC as an optional subexpression.
3699 To be called from preorder or postorder. */
3700
3701static reg_errcode_t
3702mark_opt_subexp (void *extra, bin_tree_t *node)
3703{
3704 Idx idx = (Idx) (long) extra;
3705 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3706 node->token.opt_subexp = 1;
3707
3708 return REG_NOERROR;
3709}
3710
3711/* Free the allocated memory inside NODE. */
3712
3713static void
3714free_token (re_token_t *node)
3715{
3716#ifdef RE_ENABLE_I18N
3717 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3718 free_charset (node->opr.mbcset);
3719 else
3720#endif /* RE_ENABLE_I18N */
3721 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3722 re_free (node->opr.sbcset);
3723}
3724
3725/* Worker function for tree walking. Free the allocated memory inside NODE
3726 and its children. */
3727
3728static reg_errcode_t
3729free_tree (void *extra, bin_tree_t *node)
3730{
3731 free_token (&node->token);
3732 return REG_NOERROR;
3733}
3734
3735
3736/* Duplicate the node SRC, and return new node. This is a preorder
3737 visit similar to the one implemented by the generic visitor, but
3738 we need more infrastructure to maintain two parallel trees --- so,
3739 it's easier to duplicate. */
3740
3741static bin_tree_t *
3742duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3743{
3744 const bin_tree_t *node;
3745 bin_tree_t *dup_root;
3746 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3747
3748 for (node = root; ; )
3749 {
3750 /* Create a new tree and link it back to the current parent. */
3751 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3752 if (*p_new == NULL)
3753 return NULL;
3754 (*p_new)->parent = dup_node;
3755 (*p_new)->token.duplicated = 1;
3756 dup_node = *p_new;
3757
3758 /* Go to the left node, or up and to the right. */
3759 if (node->left)
3760 {
3761 node = node->left;
3762 p_new = &dup_node->left;
3763 }
3764 else
3765 {
3766 const bin_tree_t *prev = NULL;
3767 while (node->right == prev || node->right == NULL)
3768 {
3769 prev = node;
3770 node = node->parent;
3771 dup_node = dup_node->parent;
3772 if (!node)
3773 return dup_root;
3774 }
3775 node = node->right;
3776 p_new = &dup_node->right;
3777 }
3778 }
3779}
diff --git a/lib/regex.c b/lib/regex.c
new file mode 100644
index 0000000..82e76c0
--- /dev/null
+++ b/lib/regex.c
@@ -0,0 +1,68 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20#ifdef HAVE_CONFIG_H
21# include <config.h>
22#endif
23
24#ifdef _LIBC
25/* We have to keep the namespace clean. */
26# define regfree(preg) __regfree (preg)
27# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
28# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
29# define regerror(errcode, preg, errbuf, errbuf_size) \
30 __regerror(errcode, preg, errbuf, errbuf_size)
31# define re_set_registers(bu, re, nu, st, en) \
32 __re_set_registers (bu, re, nu, st, en)
33# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
34 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
35# define re_match(bufp, string, size, pos, regs) \
36 __re_match (bufp, string, size, pos, regs)
37# define re_search(bufp, string, size, startpos, range, regs) \
38 __re_search (bufp, string, size, startpos, range, regs)
39# define re_compile_pattern(pattern, length, bufp) \
40 __re_compile_pattern (pattern, length, bufp)
41# define re_set_syntax(syntax) __re_set_syntax (syntax)
42# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
43 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
44# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
45
46# include "../locale/localeinfo.h"
47#endif
48
49/* On some systems, limits.h sets RE_DUP_MAX to a lower value than
50 GNU regex allows. Include it before <regex.h>, which correctly
51 #undefs RE_DUP_MAX and sets it to the right value. */
52#include <limits.h>
53
54#include <regex.h>
55#include "regex_internal.h"
56
57#include "regex_internal.c"
58#include "regcomp.c"
59#include "regexec.c"
60
61/* Binary backward compatibility. */
62#if _LIBC
63# include <shlib-compat.h>
64# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
65link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
66int re_max_failures = 2000;
67# endif
68#endif
diff --git a/lib/regex.h b/lib/regex.h
new file mode 100644
index 0000000..c06a062
--- /dev/null
+++ b/lib/regex.h
@@ -0,0 +1,769 @@
1/* Definitions for data structures and routines for the regular
2 expression library.
3 Copyright (C) 1985,1989-93,1995-98,2000,2001,2002,2003,2005
4 Free Software Foundation, Inc.
5 This file is part of the GNU C Library.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20
21#ifndef _REGEX_H
22#define _REGEX_H 1
23
24#include <sys/types.h>
25
26/* Allow the use in C++ code. */
27#ifdef __cplusplus
28extern "C" {
29#endif
30
31/* Define _REGEX_SOURCE to get definitions that are incompatible with
32 POSIX. */
33#if (!defined _REGEX_SOURCE \
34 && (defined _GNU_SOURCE \
35 || (!defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE \
36 && !defined _XOPEN_SOURCE)))
37# define _REGEX_SOURCE 1
38#endif
39
40#if defined _REGEX_SOURCE && defined VMS
41/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it
42 should be there. */
43# include <stddef.h>
44#endif
45
46#ifdef _REGEX_LARGE_OFFSETS
47
48/* Use types and values that are wide enough to represent signed and
49 unsigned byte offsets in memory. This currently works only when
50 the regex code is used outside of the GNU C library; it is not yet
51 supported within glibc itself, and glibc users should not define
52 _REGEX_LARGE_OFFSETS. */
53
54/* The type of the offset of a byte within a string.
55 For historical reasons POSIX 1003.1-2004 requires that regoff_t be
56 at least as wide as off_t. This is a bit odd (and many common
57 POSIX platforms set it to the more-sensible ssize_t) but we might
58 as well conform. We don't know of any hosts where ssize_t is wider
59 than off_t, so off_t is safe. */
60typedef off_t regoff_t;
61
62/* The type of nonnegative object indexes. Traditionally, GNU regex
63 uses 'int' for these. Code that uses __re_idx_t should work
64 regardless of whether the type is signed. */
65typedef size_t __re_idx_t;
66
67/* The type of object sizes. */
68typedef size_t __re_size_t;
69
70/* The type of object sizes, in places where the traditional code
71 uses unsigned long int. */
72typedef size_t __re_long_size_t;
73
74#else
75
76/* Use types that are binary-compatible with the traditional GNU regex
77 implementation, which mishandles strings longer than INT_MAX. */
78
79typedef int regoff_t;
80typedef int __re_idx_t;
81typedef unsigned int __re_size_t;
82typedef unsigned long int __re_long_size_t;
83
84#endif
85
86/* The following two types have to be signed and unsigned integer type
87 wide enough to hold a value of a pointer. For most ANSI compilers
88 ptrdiff_t and size_t should be likely OK. Still size of these two
89 types is 2 for Microsoft C. Ugh... */
90typedef long int s_reg_t;
91typedef unsigned long int active_reg_t;
92
93/* The following bits are used to determine the regexp syntax we
94 recognize. The set/not-set meanings are chosen so that Emacs syntax
95 remains the value 0. The bits are given in alphabetical order, and
96 the definitions shifted by one from the previous bit; thus, when we
97 add or remove a bit, only one other definition need change. */
98typedef unsigned long int reg_syntax_t;
99
100/* If this bit is not set, then \ inside a bracket expression is literal.
101 If set, then such a \ quotes the following character. */
102#define REG_BACKSLASH_ESCAPE_IN_LISTS 1ul
103
104/* If this bit is not set, then + and ? are operators, and \+ and \? are
105 literals.
106 If set, then \+ and \? are operators and + and ? are literals. */
107#define REG_BK_PLUS_QM (1ul << 1)
108
109/* If this bit is set, then character classes are supported. They are:
110 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
111 [:space:], [:blank:], [:print:], [:punct:], [:graph:], and [:cntrl:].
112 If not set, then character classes are not supported. */
113#define REG_CHAR_CLASSES (1ul << 2)
114
115/* If this bit is set, then ^ and $ are always anchors (outside bracket
116 expressions, of course).
117 If this bit is not set, then it depends:
118 ^ is an anchor if it is at the beginning of a regular
119 expression or after an open-group or an alternation operator;
120 $ is an anchor if it is at the end of a regular expression, or
121 before a close-group or an alternation operator.
122
123 This bit could be (re)combined with REG_CONTEXT_INDEP_OPS, because
124 POSIX draft 11.2 says that * etc. in leading positions is undefined.
125 We already implemented a previous draft which made those constructs
126 invalid, though, so we haven't changed the code back. */
127#define REG_CONTEXT_INDEP_ANCHORS (1ul << 3)
128
129/* If this bit is set, then special characters are always special
130 regardless of where they are in the pattern.
131 If this bit is not set, then special characters are special only in
132 some contexts; otherwise they are ordinary. Specifically,
133 * + ? and intervals are only special when not after the beginning,
134 open-group, or alternation operator. */
135#define REG_CONTEXT_INDEP_OPS (1ul << 4)
136
137/* If this bit is set, then *, +, ?, and { cannot be first in an re or
138 immediately after an alternation or begin-group operator. */
139#define REG_CONTEXT_INVALID_OPS (1ul << 5)
140
141/* If this bit is set, then . matches newline.
142 If not set, then it doesn't. */
143#define REG_DOT_NEWLINE (1ul << 6)
144
145/* If this bit is set, then . doesn't match NUL.
146 If not set, then it does. */
147#define REG_DOT_NOT_NULL (1ul << 7)
148
149/* If this bit is set, nonmatching lists [^...] do not match newline.
150 If not set, they do. */
151#define REG_HAT_LISTS_NOT_NEWLINE (1ul << 8)
152
153/* If this bit is set, either \{...\} or {...} defines an
154 interval, depending on REG_NO_BK_BRACES.
155 If not set, \{, \}, {, and } are literals. */
156#define REG_INTERVALS (1ul << 9)
157
158/* If this bit is set, +, ? and | aren't recognized as operators.
159 If not set, they are. */
160#define REG_LIMITED_OPS (1ul << 10)
161
162/* If this bit is set, newline is an alternation operator.
163 If not set, newline is literal. */
164#define REG_NEWLINE_ALT (1ul << 11)
165
166/* If this bit is set, then `{...}' defines an interval, and \{ and \}
167 are literals.
168 If not set, then `\{...\}' defines an interval. */
169#define REG_NO_BK_BRACES (1ul << 12)
170
171/* If this bit is set, (...) defines a group, and \( and \) are literals.
172 If not set, \(...\) defines a group, and ( and ) are literals. */
173#define REG_NO_BK_PARENS (1ul << 13)
174
175/* If this bit is set, then \<digit> matches <digit>.
176 If not set, then \<digit> is a back-reference. */
177#define REG_NO_BK_REFS (1ul << 14)
178
179/* If this bit is set, then | is an alternation operator, and \| is literal.
180 If not set, then \| is an alternation operator, and | is literal. */
181#define REG_NO_BK_VBAR (1ul << 15)
182
183/* If this bit is set, then an ending range point collating higher
184 than the starting range point, as in [z-a], is invalid.
185 If not set, the containing range is empty and does not match any string. */
186#define REG_NO_EMPTY_RANGES (1ul << 16)
187
188/* If this bit is set, then an unmatched ) is ordinary.
189 If not set, then an unmatched ) is invalid. */
190#define REG_UNMATCHED_RIGHT_PAREN_ORD (1ul << 17)
191
192/* If this bit is set, succeed as soon as we match the whole pattern,
193 without further backtracking. */
194#define REG_NO_POSIX_BACKTRACKING (1ul << 18)
195
196/* If this bit is set, do not process the GNU regex operators.
197 If not set, then the GNU regex operators are recognized. */
198#define REG_NO_GNU_OPS (1ul << 19)
199
200/* If this bit is set, turn on internal regex debugging.
201 If not set, and debugging was on, turn it off.
202 This only works if regex.c is compiled -DDEBUG.
203 We define this bit always, so that all that's needed to turn on
204 debugging is to recompile regex.c; the calling code can always have
205 this bit set, and it won't affect anything in the normal case. */
206#define REG_DEBUG (1ul << 20)
207
208/* If this bit is set, a syntactically invalid interval is treated as
209 a string of ordinary characters. For example, the ERE 'a{1' is
210 treated as 'a\{1'. */
211#define REG_INVALID_INTERVAL_ORD (1ul << 21)
212
213/* If this bit is set, then ignore case when matching.
214 If not set, then case is significant. */
215#define REG_IGNORE_CASE (1ul << 22)
216
217/* This bit is used internally like REG_CONTEXT_INDEP_ANCHORS but only
218 for ^, because it is difficult to scan the regex backwards to find
219 whether ^ should be special. */
220#define REG_CARET_ANCHORS_HERE (1ul << 23)
221
222/* If this bit is set, then \{ cannot be first in a BRE or
223 immediately after an alternation or begin-group operator. */
224#define REG_CONTEXT_INVALID_DUP (1ul << 24)
225
226/* If this bit is set, then no_sub will be set to 1 during
227 re_compile_pattern. */
228#define REG_NO_SUB (1ul << 25)
229
230/* This global variable defines the particular regexp syntax to use (for
231 some interfaces). When a regexp is compiled, the syntax used is
232 stored in the pattern buffer, so changing this does not affect
233 already-compiled regexps. */
234extern reg_syntax_t re_syntax_options;
235
236/* Define combinations of the above bits for the standard possibilities.
237 (The [[[ comments delimit what gets put into the Texinfo file, so
238 don't delete them!) */
239/* [[[begin syntaxes]]] */
240#define REG_SYNTAX_EMACS 0
241
242#define REG_SYNTAX_AWK \
243 (REG_BACKSLASH_ESCAPE_IN_LISTS | REG_DOT_NOT_NULL \
244 | REG_NO_BK_PARENS | REG_NO_BK_REFS \
245 | REG_NO_BK_VBAR | REG_NO_EMPTY_RANGES \
246 | REG_DOT_NEWLINE | REG_CONTEXT_INDEP_ANCHORS \
247 | REG_UNMATCHED_RIGHT_PAREN_ORD | REG_NO_GNU_OPS)
248
249#define REG_SYNTAX_GNU_AWK \
250 ((REG_SYNTAX_POSIX_EXTENDED | REG_BACKSLASH_ESCAPE_IN_LISTS \
251 | REG_DEBUG) \
252 & ~(REG_DOT_NOT_NULL | REG_INTERVALS | REG_CONTEXT_INDEP_OPS \
253 | REG_CONTEXT_INVALID_OPS ))
254
255#define REG_SYNTAX_POSIX_AWK \
256 (REG_SYNTAX_POSIX_EXTENDED | REG_BACKSLASH_ESCAPE_IN_LISTS \
257 | REG_INTERVALS | REG_NO_GNU_OPS)
258
259#define REG_SYNTAX_GREP \
260 (REG_BK_PLUS_QM | REG_CHAR_CLASSES \
261 | REG_HAT_LISTS_NOT_NEWLINE | REG_INTERVALS \
262 | REG_NEWLINE_ALT)
263
264#define REG_SYNTAX_EGREP \
265 (REG_CHAR_CLASSES | REG_CONTEXT_INDEP_ANCHORS \
266 | REG_CONTEXT_INDEP_OPS | REG_HAT_LISTS_NOT_NEWLINE \
267 | REG_NEWLINE_ALT | REG_NO_BK_PARENS \
268 | REG_NO_BK_VBAR)
269
270#define REG_SYNTAX_POSIX_EGREP \
271 (REG_SYNTAX_EGREP | REG_INTERVALS | REG_NO_BK_BRACES \
272 | REG_INVALID_INTERVAL_ORD)
273
274/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
275#define REG_SYNTAX_ED REG_SYNTAX_POSIX_BASIC
276
277#define REG_SYNTAX_SED REG_SYNTAX_POSIX_BASIC
278
279/* Syntax bits common to both basic and extended POSIX regex syntax. */
280#define _REG_SYNTAX_POSIX_COMMON \
281 (REG_CHAR_CLASSES | REG_DOT_NEWLINE | REG_DOT_NOT_NULL \
282 | REG_INTERVALS | REG_NO_EMPTY_RANGES)
283
284#define REG_SYNTAX_POSIX_BASIC \
285 (_REG_SYNTAX_POSIX_COMMON | REG_BK_PLUS_QM | REG_CONTEXT_INVALID_DUP)
286
287/* Differs from ..._POSIX_BASIC only in that REG_BK_PLUS_QM becomes
288 REG_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this
289 isn't minimal, since other operators, such as \`, aren't disabled. */
290#define REG_SYNTAX_POSIX_MINIMAL_BASIC \
291 (_REG_SYNTAX_POSIX_COMMON | REG_LIMITED_OPS)
292
293#define REG_SYNTAX_POSIX_EXTENDED \
294 (_REG_SYNTAX_POSIX_COMMON | REG_CONTEXT_INDEP_ANCHORS \
295 | REG_CONTEXT_INDEP_OPS | REG_NO_BK_BRACES \
296 | REG_NO_BK_PARENS | REG_NO_BK_VBAR \
297 | REG_CONTEXT_INVALID_OPS | REG_UNMATCHED_RIGHT_PAREN_ORD)
298
299/* Differs from ..._POSIX_EXTENDED in that REG_CONTEXT_INDEP_OPS is
300 removed and REG_NO_BK_REFS is added. */
301#define REG_SYNTAX_POSIX_MINIMAL_EXTENDED \
302 (_REG_SYNTAX_POSIX_COMMON | REG_CONTEXT_INDEP_ANCHORS \
303 | REG_CONTEXT_INVALID_OPS | REG_NO_BK_BRACES \
304 | REG_NO_BK_PARENS | REG_NO_BK_REFS \
305 | REG_NO_BK_VBAR | REG_UNMATCHED_RIGHT_PAREN_ORD)
306/* [[[end syntaxes]]] */
307
308/* Maximum number of duplicates an interval can allow. This is
309 distinct from RE_DUP_MAX, to conform to POSIX name space rules and
310 to avoid collisions with <limits.h>. */
311#define REG_DUP_MAX 32767
312
313
314/* POSIX `cflags' bits (i.e., information for `regcomp'). */
315
316/* If this bit is set, then use extended regular expression syntax.
317 If not set, then use basic regular expression syntax. */
318#define REG_EXTENDED 1
319
320/* If this bit is set, then ignore case when matching.
321 If not set, then case is significant. */
322#define REG_ICASE (1 << 1)
323
324/* If this bit is set, then anchors do not match at newline
325 characters in the string.
326 If not set, then anchors do match at newlines. */
327#define REG_NEWLINE (1 << 2)
328
329/* If this bit is set, then report only success or fail in regexec.
330 If not set, then returns differ between not matching and errors. */
331#define REG_NOSUB (1 << 3)
332
333
334/* POSIX `eflags' bits (i.e., information for regexec). */
335
336/* If this bit is set, then the beginning-of-line operator doesn't match
337 the beginning of the string (presumably because it's not the
338 beginning of a line).
339 If not set, then the beginning-of-line operator does match the
340 beginning of the string. */
341#define REG_NOTBOL 1
342
343/* Like REG_NOTBOL, except for the end-of-line. */
344#define REG_NOTEOL (1 << 1)
345
346/* Use PMATCH[0] to delimit the start and end of the search in the
347 buffer. */
348#define REG_STARTEND (1 << 2)
349
350
351/* If any error codes are removed, changed, or added, update the
352 `__re_error_msgid' table in regcomp.c. */
353
354typedef enum
355{
356 _REG_ENOSYS = -1, /* This will never happen for this implementation. */
357#define REG_ENOSYS _REG_ENOSYS
358
359 _REG_NOERROR, /* Success. */
360#define REG_NOERROR _REG_NOERROR
361
362 _REG_NOMATCH, /* Didn't find a match (for regexec). */
363#define REG_NOMATCH _REG_NOMATCH
364
365 /* POSIX regcomp return error codes. (In the order listed in the
366 standard.) */
367
368 _REG_BADPAT, /* Invalid pattern. */
369#define REG_BADPAT _REG_BADPAT
370
371 _REG_ECOLLATE, /* Invalid collating element. */
372#define REG_ECOLLATE _REG_ECOLLATE
373
374 _REG_ECTYPE, /* Invalid character class name. */
375#define REG_ECTYPE _REG_ECTYPE
376
377 _REG_EESCAPE, /* Trailing backslash. */
378#define REG_EESCAPE _REG_EESCAPE
379
380 _REG_ESUBREG, /* Invalid back reference. */
381#define REG_ESUBREG _REG_ESUBREG
382
383 _REG_EBRACK, /* Unmatched left bracket. */
384#define REG_EBRACK _REG_EBRACK
385
386 _REG_EPAREN, /* Parenthesis imbalance. */
387#define REG_EPAREN _REG_EPAREN
388
389 _REG_EBRACE, /* Unmatched \{. */
390#define REG_EBRACE _REG_EBRACE
391
392 _REG_BADBR, /* Invalid contents of \{\}. */
393#define REG_BADBR _REG_BADBR
394
395 _REG_ERANGE, /* Invalid range end. */
396#define REG_ERANGE _REG_ERANGE
397
398 _REG_ESPACE, /* Ran out of memory. */
399#define REG_ESPACE _REG_ESPACE
400
401 _REG_BADRPT, /* No preceding re for repetition op. */
402#define REG_BADRPT _REG_BADRPT
403
404 /* Error codes we've added. */
405
406 _REG_EEND, /* Premature end. */
407#define REG_EEND _REG_EEND
408
409 _REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */
410#define REG_ESIZE _REG_ESIZE
411
412 _REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */
413#define REG_ERPAREN _REG_ERPAREN
414
415} reg_errcode_t;
416
417/* In the traditional GNU implementation, regex.h defined member names
418 like `buffer' that POSIX does not allow. These members now have
419 names with leading `re_' (e.g., `re_buffer'). Support the old
420 names only if _REGEX_SOURCE is defined. New programs should use
421 the new names. */
422#ifdef _REGEX_SOURCE
423# define _REG_RE_NAME(id) id
424# define _REG_RM_NAME(id) id
425#else
426# define _REG_RE_NAME(id) re_##id
427# define _REG_RM_NAME(id) rm_##id
428#endif
429
430/* The user can specify the type of the re_translate member by
431 defining the macro REG_TRANSLATE_TYPE. In the traditional GNU
432 implementation, this macro was named RE_TRANSLATE_TYPE, but POSIX
433 does not allow this. Support the old name only if _REGEX_SOURCE
434 and if the new name is not defined. New programs should use the new
435 name. */
436#ifndef REG_TRANSLATE_TYPE
437# if defined _REGEX_SOURCE && defined RE_TRANSLATE_TYPE
438# define REG_TRANSLATE_TYPE RE_TRANSLATE_TYPE
439# else
440# define REG_TRANSLATE_TYPE char *
441# endif
442#endif
443
444/* This data structure represents a compiled pattern. Before calling
445 the pattern compiler, the fields `re_buffer', `re_allocated', `re_fastmap',
446 `re_translate', and `re_no_sub' can be set. After the pattern has been
447 compiled, the `re_nsub' field is available. All other fields are
448 private to the regex routines. */
449
450struct re_pattern_buffer
451{
452/* [[[begin pattern_buffer]]] */
453 /* Space that holds the compiled pattern. It is declared as
454 `unsigned char *' because its elements are
455 sometimes used as array indexes. */
456 unsigned char *_REG_RE_NAME (buffer);
457
458 /* Number of bytes to which `re_buffer' points. */
459 __re_long_size_t _REG_RE_NAME (allocated);
460
461 /* Number of bytes actually used in `re_buffer'. */
462 __re_long_size_t _REG_RE_NAME (used);
463
464 /* Syntax setting with which the pattern was compiled. */
465 reg_syntax_t _REG_RE_NAME (syntax);
466
467 /* Pointer to a fastmap, if any, otherwise zero. re_search uses
468 the fastmap, if there is one, to skip over impossible
469 starting points for matches. */
470 char *_REG_RE_NAME (fastmap);
471
472 /* Either a translate table to apply to all characters before
473 comparing them, or zero for no translation. The translation
474 is applied to a pattern when it is compiled and to a string
475 when it is matched. */
476 REG_TRANSLATE_TYPE _REG_RE_NAME (translate);
477
478 /* Number of subexpressions found by the compiler. */
479 size_t re_nsub;
480
481 /* Zero if this pattern cannot match the empty string, one else.
482 Well, in truth it's used only in `re_search_2', to see
483 whether or not we should use the fastmap, so we don't set
484 this absolutely perfectly; see `re_compile_fastmap' (the
485 `duplicate' case). */
486 unsigned int _REG_RE_NAME (can_be_null) : 1;
487
488 /* If REG_UNALLOCATED, allocate space in the `regs' structure
489 for `max (REG_NREGS, re_nsub + 1)' groups.
490 If REG_REALLOCATE, reallocate space if necessary.
491 If REG_FIXED, use what's there. */
492#define REG_UNALLOCATED 0
493#define REG_REALLOCATE 1
494#define REG_FIXED 2
495 unsigned int _REG_RE_NAME (regs_allocated) : 2;
496
497 /* Set to zero when `regex_compile' compiles a pattern; set to one
498 by `re_compile_fastmap' if it updates the fastmap. */
499 unsigned int _REG_RE_NAME (fastmap_accurate) : 1;
500
501 /* If set, `re_match_2' does not return information about
502 subexpressions. */
503 unsigned int _REG_RE_NAME (no_sub) : 1;
504
505 /* If set, a beginning-of-line anchor doesn't match at the
506 beginning of the string. */
507 unsigned int _REG_RE_NAME (not_bol) : 1;
508
509 /* Similarly for an end-of-line anchor. */
510 unsigned int _REG_RE_NAME (not_eol) : 1;
511
512 /* If true, an anchor at a newline matches. */
513 unsigned int _REG_RE_NAME (newline_anchor) : 1;
514
515/* [[[end pattern_buffer]]] */
516};
517
518typedef struct re_pattern_buffer regex_t;
519
520/* This is the structure we store register match data in. See
521 regex.texinfo for a full description of what registers match. */
522struct re_registers
523{
524 __re_size_t _REG_RM_NAME (num_regs);
525 regoff_t *_REG_RM_NAME (start);
526 regoff_t *_REG_RM_NAME (end);
527};
528
529
530/* If `regs_allocated' is REG_UNALLOCATED in the pattern buffer,
531 `re_match_2' returns information about at least this many registers
532 the first time a `regs' structure is passed. */
533#ifndef REG_NREGS
534# define REG_NREGS 30
535#endif
536
537
538/* POSIX specification for registers. Aside from the different names than
539 `re_registers', POSIX uses an array of structures, instead of a
540 structure of arrays. */
541typedef struct
542{
543 regoff_t rm_so; /* Byte offset from string's start to substring's start. */
544 regoff_t rm_eo; /* Byte offset from string's start to substring's end. */
545} regmatch_t;
546
547/* Declarations for routines. */
548
549/* Sets the current default syntax to SYNTAX, and return the old syntax.
550 You can also simply assign to the `re_syntax_options' variable. */
551extern reg_syntax_t re_set_syntax (reg_syntax_t __syntax);
552
553/* Compile the regular expression PATTERN, with length LENGTH
554 and syntax given by the global `re_syntax_options', into the buffer
555 BUFFER. Return NULL if successful, and an error string if not. */
556extern const char *re_compile_pattern (const char *__pattern, size_t __length,
557 struct re_pattern_buffer *__buffer);
558
559
560/* Compile a fastmap for the compiled pattern in BUFFER; used to
561 accelerate searches. Return 0 if successful and -2 if there was an
562 internal error. */
563extern int re_compile_fastmap (struct re_pattern_buffer *__buffer);
564
565
566/* Search in the string STRING (with length LENGTH) for the pattern
567 compiled into BUFFER. Start searching at position START, for RANGE
568 characters. Return the starting position of the match, -1 for no
569 match, or -2 for an internal error. Also return register
570 information in REGS (if REGS is nonnull and BUFFER->re_no_sub is zero). */
571extern regoff_t re_search (struct re_pattern_buffer *__buffer,
572 const char *__string, __re_idx_t __length,
573 __re_idx_t __start, regoff_t __range,
574 struct re_registers *__regs);
575
576
577/* Like `re_search', but search in the concatenation of STRING1 and
578 STRING2. Also, stop searching at index START + STOP. */
579extern regoff_t re_search_2 (struct re_pattern_buffer *__buffer,
580 const char *__string1, __re_idx_t __length1,
581 const char *__string2, __re_idx_t __length2,
582 __re_idx_t __start, regoff_t __range,
583 struct re_registers *__regs,
584 __re_idx_t __stop);
585
586
587/* Like `re_search', but return how many characters in STRING the regexp
588 in BUFFER matched, starting at position START. */
589extern regoff_t re_match (struct re_pattern_buffer *__buffer,
590 const char *__string, __re_idx_t __length,
591 __re_idx_t __start, struct re_registers *__regs);
592
593
594/* Relates to `re_match' as `re_search_2' relates to `re_search'. */
595extern regoff_t re_match_2 (struct re_pattern_buffer *__buffer,
596 const char *__string1, __re_idx_t __length1,
597 const char *__string2, __re_idx_t __length2,
598 __re_idx_t __start, struct re_registers *__regs,
599 __re_idx_t __stop);
600
601
602/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
603 ENDS. Subsequent matches using BUFFER and REGS will use this memory
604 for recording register information. STARTS and ENDS must be
605 allocated with malloc, and must each be at least `NUM_REGS * sizeof
606 (regoff_t)' bytes long.
607
608 If NUM_REGS == 0, then subsequent matches should allocate their own
609 register data.
610
611 Unless this function is called, the first search or match using
612 PATTERN_BUFFER will allocate its own register data, without
613 freeing the old data. */
614extern void re_set_registers (struct re_pattern_buffer *__buffer,
615 struct re_registers *__regs,
616 __re_size_t __num_regs,
617 regoff_t *__starts, regoff_t *__ends);
618
619#if defined _REGEX_RE_COMP || defined _LIBC
620# ifndef _CRAY
621/* 4.2 bsd compatibility. */
622extern char *re_comp (const char *);
623extern int re_exec (const char *);
624# endif
625#endif
626
627/* GCC 2.95 and later have "__restrict"; C99 compilers have
628 "restrict", and "configure" may have defined "restrict". */
629#ifndef __restrict
630# if ! (2 < __GNUC__ || (2 == __GNUC__ && 95 <= __GNUC_MINOR__))
631# if defined restrict || 199901L <= __STDC_VERSION__
632# define __restrict restrict
633# else
634# define __restrict
635# endif
636# endif
637#endif
638/* gcc 3.1 and up support the [restrict] syntax, but g++ doesn't. */
639#ifndef __restrict_arr
640# if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && !defined __cplusplus
641# define __restrict_arr __restrict
642# else
643# define __restrict_arr
644# endif
645#endif
646
647/* POSIX compatibility. */
648extern int regcomp (regex_t *__restrict __preg,
649 const char *__restrict __pattern,
650 int __cflags);
651
652extern int regexec (const regex_t *__restrict __preg,
653 const char *__restrict __string, size_t __nmatch,
654 regmatch_t __pmatch[__restrict_arr],
655 int __eflags);
656
657extern size_t regerror (int __errcode, const regex_t *__restrict __preg,
658 char *__restrict __errbuf, size_t __errbuf_size);
659
660extern void regfree (regex_t *__preg);
661
662
663#ifdef _REGEX_SOURCE
664
665/* Define the POSIX-compatible member names in terms of the
666 incompatible (and deprecated) names established by _REG_RE_NAME.
667 New programs should use the re_* names. */
668
669# define re_allocated allocated
670# define re_buffer buffer
671# define re_can_be_null can_be_null
672# define re_fastmap fastmap
673# define re_fastmap_accurate fastmap_accurate
674# define re_newline_anchor newline_anchor
675# define re_no_sub no_sub
676# define re_not_bol not_bol
677# define re_not_eol not_eol
678# define re_regs_allocated regs_allocated
679# define re_syntax syntax
680# define re_translate translate
681# define re_used used
682
683/* Similarly for _REG_RM_NAME. */
684
685# define rm_end end
686# define rm_num_regs num_regs
687# define rm_start start
688
689/* Undef RE_DUP_MAX first, in case the user has already included a
690 <limits.h> with an incompatible definition.
691
692 On GNU systems, the most common spelling for RE_DUP_MAX's value in
693 <limits.h> is (0x7fff), so define RE_DUP_MAX to that, not to
694 REG_DUP_MAX. This avoids some duplicate-macro-definition warnings
695 with programs that include <limits.h> after this file.
696
697 New programs should not assume that regex.h defines RE_DUP_MAX; to
698 get the value of RE_DUP_MAX, they should instead include <limits.h>
699 and possibly invoke the sysconf function. */
700
701# undef RE_DUP_MAX
702# define RE_DUP_MAX (0x7fff)
703
704/* Define the following symbols for backward source compatibility.
705 These symbols violate the POSIX name space rules, and new programs
706 should avoid them. */
707
708# define REGS_FIXED REG_FIXED
709# define REGS_REALLOCATE REG_REALLOCATE
710# define REGS_UNALLOCATED REG_UNALLOCATED
711# define RE_BACKSLASH_ESCAPE_IN_LISTS REG_BACKSLASH_ESCAPE_IN_LISTS
712# define RE_BK_PLUS_QM REG_BK_PLUS_QM
713# define RE_CARET_ANCHORS_HERE REG_CARET_ANCHORS_HERE
714# define RE_CHAR_CLASSES REG_CHAR_CLASSES
715# define RE_CONTEXT_INDEP_ANCHORS REG_CONTEXT_INDEP_ANCHORS
716# define RE_CONTEXT_INDEP_OPS REG_CONTEXT_INDEP_OPS
717# define RE_CONTEXT_INVALID_DUP REG_CONTEXT_INVALID_DUP
718# define RE_CONTEXT_INVALID_OPS REG_CONTEXT_INVALID_OPS
719# define RE_DEBUG REG_DEBUG
720# define RE_DOT_NEWLINE REG_DOT_NEWLINE
721# define RE_DOT_NOT_NULL REG_DOT_NOT_NULL
722# define RE_HAT_LISTS_NOT_NEWLINE REG_HAT_LISTS_NOT_NEWLINE
723# define RE_ICASE REG_IGNORE_CASE /* avoid collision with REG_ICASE */
724# define RE_INTERVALS REG_INTERVALS
725# define RE_INVALID_INTERVAL_ORD REG_INVALID_INTERVAL_ORD
726# define RE_LIMITED_OPS REG_LIMITED_OPS
727# define RE_NEWLINE_ALT REG_NEWLINE_ALT
728# define RE_NO_BK_BRACES REG_NO_BK_BRACES
729# define RE_NO_BK_PARENS REG_NO_BK_PARENS
730# define RE_NO_BK_REFS REG_NO_BK_REFS
731# define RE_NO_BK_VBAR REG_NO_BK_VBAR
732# define RE_NO_EMPTY_RANGES REG_NO_EMPTY_RANGES
733# define RE_NO_GNU_OPS REG_NO_GNU_OPS
734# define RE_NO_POSIX_BACKTRACKING REG_NO_POSIX_BACKTRACKING
735# define RE_NO_SUB REG_NO_SUB
736# define RE_NREGS REG_NREGS
737# define RE_SYNTAX_AWK REG_SYNTAX_AWK
738# define RE_SYNTAX_ED REG_SYNTAX_ED
739# define RE_SYNTAX_EGREP REG_SYNTAX_EGREP
740# define RE_SYNTAX_EMACS REG_SYNTAX_EMACS
741# define RE_SYNTAX_GNU_AWK REG_SYNTAX_GNU_AWK
742# define RE_SYNTAX_GREP REG_SYNTAX_GREP
743# define RE_SYNTAX_POSIX_AWK REG_SYNTAX_POSIX_AWK
744# define RE_SYNTAX_POSIX_BASIC REG_SYNTAX_POSIX_BASIC
745# define RE_SYNTAX_POSIX_EGREP REG_SYNTAX_POSIX_EGREP
746# define RE_SYNTAX_POSIX_EXTENDED REG_SYNTAX_POSIX_EXTENDED
747# define RE_SYNTAX_POSIX_MINIMAL_BASIC REG_SYNTAX_POSIX_MINIMAL_BASIC
748# define RE_SYNTAX_POSIX_MINIMAL_EXTENDED REG_SYNTAX_POSIX_MINIMAL_EXTENDED
749# define RE_SYNTAX_SED REG_SYNTAX_SED
750# define RE_UNMATCHED_RIGHT_PAREN_ORD REG_UNMATCHED_RIGHT_PAREN_ORD
751# ifndef RE_TRANSLATE_TYPE
752# define RE_TRANSLATE_TYPE REG_TRANSLATE_TYPE
753# endif
754
755#endif /* defined _REGEX_SOURCE */
756
757#ifdef __cplusplus
758}
759#endif /* C++ */
760
761#endif /* regex.h */
762
763/*
764Local variables:
765make-backup-files: t
766version-control: t
767trim-versions-without-asking: nil
768End:
769*/
diff --git a/lib/regex.o b/lib/regex.o
new file mode 100644
index 0000000..746448c
--- /dev/null
+++ b/lib/regex.o
Binary files differ
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
new file mode 100644
index 0000000..ad618cf
--- /dev/null
+++ b/lib/regex_internal.c
@@ -0,0 +1,1656 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 REG_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash) internal_function;
31
32/* Functions for string operation. */
33
34/* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
36
37static reg_errcode_t
38internal_function
39re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41{
42 reg_errcode_t ret;
43 Idx init_buf_len;
44
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
53 return ret;
54
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
60 return REG_NOERROR;
61}
62
63/* This function allocate the buffers, and initialize them. */
64
65static reg_errcode_t
66internal_function
67re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69{
70 reg_errcode_t ret;
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74 if (len > 0)
75 {
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
78 return ret;
79 }
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82 if (icase)
83 {
84#ifdef RE_ENABLE_I18N
85 if (dfa->mb_cur_max > 1)
86 {
87 while (1)
88 {
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
91 return ret;
92 if (pstr->valid_raw_len >= len)
93 break;
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 break;
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
98 return ret;
99 }
100 }
101 else
102#endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
104 }
105 else
106 {
107#ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
110 else
111#endif /* RE_ENABLE_I18N */
112 {
113 if (trans != NULL)
114 re_string_translate_buffer (pstr);
115 else
116 {
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
119 }
120 }
121 }
122
123 return REG_NOERROR;
124}
125
126/* Helper functions for re_string_allocate, and re_string_construct. */
127
128static reg_errcode_t
129internal_function
130re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131{
132#ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
134 {
135 wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
136 if (BE (new_wcs == NULL, 0))
137 return REG_ESPACE;
138 pstr->wcs = new_wcs;
139 if (pstr->offsets != NULL)
140 {
141 Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
142 if (BE (new_offsets == NULL, 0))
143 return REG_ESPACE;
144 pstr->offsets = new_offsets;
145 }
146 }
147#endif /* RE_ENABLE_I18N */
148 if (pstr->mbs_allocated)
149 {
150 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
151 new_buf_len);
152 if (BE (new_mbs == NULL, 0))
153 return REG_ESPACE;
154 pstr->mbs = new_mbs;
155 }
156 pstr->bufs_len = new_buf_len;
157 return REG_NOERROR;
158}
159
160
161static void
162internal_function
163re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
164 REG_TRANSLATE_TYPE trans, bool icase,
165 const re_dfa_t *dfa)
166{
167 pstr->raw_mbs = (const unsigned char *) str;
168 pstr->len = len;
169 pstr->raw_len = len;
170 pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
171 pstr->icase = icase;
172 pstr->mbs_allocated = (trans != NULL || icase);
173 pstr->mb_cur_max = dfa->mb_cur_max;
174 pstr->is_utf8 = dfa->is_utf8;
175 pstr->map_notascii = dfa->map_notascii;
176 pstr->stop = pstr->len;
177 pstr->raw_stop = pstr->stop;
178}
179
180#ifdef RE_ENABLE_I18N
181
182/* Build wide character buffer PSTR->WCS.
183 If the byte sequence of the string are:
184 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185 Then wide character buffer will be:
186 <wc1> , WEOF , <wc2> , WEOF , <wc3>
187 We use WEOF for padding, they indicate that the position isn't
188 a first byte of a multibyte character.
189
190 Note that this function assumes PSTR->VALID_LEN elements are already
191 built and starts from PSTR->VALID_LEN. */
192
193static void
194internal_function
195build_wcs_buffer (re_string_t *pstr)
196{
197#ifdef _LIBC
198 unsigned char buf[MB_LEN_MAX];
199 assert (MB_LEN_MAX >= pstr->mb_cur_max);
200#else
201 unsigned char buf[64];
202#endif
203 mbstate_t prev_st;
204 Idx byte_idx, end_idx, remain_len;
205 size_t mbclen;
206
207 /* Build the buffers from pstr->valid_len to either pstr->len or
208 pstr->bufs_len. */
209 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
210 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
211 {
212 wchar_t wc;
213 const char *p;
214
215 remain_len = end_idx - byte_idx;
216 prev_st = pstr->cur_state;
217 /* Apply the translation if we need. */
218 if (BE (pstr->trans != NULL, 0))
219 {
220 int i, ch;
221
222 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
223 {
224 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
225 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
226 }
227 p = (const char *) buf;
228 }
229 else
230 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
231 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
232 if (BE (mbclen == (size_t) -2, 0))
233 {
234 /* The buffer doesn't have enough space, finish to build. */
235 pstr->cur_state = prev_st;
236 break;
237 }
238 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
239 {
240 /* We treat these cases as a singlebyte character. */
241 mbclen = 1;
242 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243 if (BE (pstr->trans != NULL, 0))
244 wc = pstr->trans[wc];
245 pstr->cur_state = prev_st;
246 }
247
248 /* Write wide character and padding. */
249 pstr->wcs[byte_idx++] = wc;
250 /* Write paddings. */
251 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
252 pstr->wcs[byte_idx++] = WEOF;
253 }
254 pstr->valid_len = byte_idx;
255 pstr->valid_raw_len = byte_idx;
256}
257
258/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
259 but for REG_ICASE. */
260
261static reg_errcode_t
262internal_function
263build_wcs_upper_buffer (re_string_t *pstr)
264{
265 mbstate_t prev_st;
266 Idx src_idx, byte_idx, end_idx, remain_len;
267 size_t mbclen;
268#ifdef _LIBC
269 char buf[MB_LEN_MAX];
270 assert (MB_LEN_MAX >= pstr->mb_cur_max);
271#else
272 char buf[64];
273#endif
274
275 byte_idx = pstr->valid_len;
276 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
277
278 /* The following optimization assumes that ASCII characters can be
279 mapped to wide characters with a simple cast. */
280 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
281 {
282 while (byte_idx < end_idx)
283 {
284 wchar_t wc;
285
286 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
287 && mbsinit (&pstr->cur_state))
288 {
289 /* In case of a singlebyte character. */
290 pstr->mbs[byte_idx]
291 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
292 /* The next step uses the assumption that wchar_t is encoded
293 ASCII-safe: all ASCII values can be converted like this. */
294 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
295 ++byte_idx;
296 continue;
297 }
298
299 remain_len = end_idx - byte_idx;
300 prev_st = pstr->cur_state;
301 mbclen = mbrtowc (&wc,
302 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
303 + byte_idx), remain_len, &pstr->cur_state);
304 if (BE ((size_t) (mbclen + 2) > 2, 1))
305 {
306 wchar_t wcu = wc;
307 if (iswlower (wc))
308 {
309 size_t mbcdlen;
310
311 wcu = towupper (wc);
312 mbcdlen = wcrtomb (buf, wcu, &prev_st);
313 if (BE (mbclen == mbcdlen, 1))
314 memcpy (pstr->mbs + byte_idx, buf, mbclen);
315 else
316 {
317 src_idx = byte_idx;
318 goto offsets_needed;
319 }
320 }
321 else
322 memcpy (pstr->mbs + byte_idx,
323 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
324 pstr->wcs[byte_idx++] = wcu;
325 /* Write paddings. */
326 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
327 pstr->wcs[byte_idx++] = WEOF;
328 }
329 else if (mbclen == (size_t) -1 || mbclen == 0)
330 {
331 /* It is an invalid character or '\0'. Just use the byte. */
332 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
333 pstr->mbs[byte_idx] = ch;
334 /* And also cast it to wide char. */
335 pstr->wcs[byte_idx++] = (wchar_t) ch;
336 if (BE (mbclen == (size_t) -1, 0))
337 pstr->cur_state = prev_st;
338 }
339 else
340 {
341 /* The buffer doesn't have enough space, finish to build. */
342 pstr->cur_state = prev_st;
343 break;
344 }
345 }
346 pstr->valid_len = byte_idx;
347 pstr->valid_raw_len = byte_idx;
348 return REG_NOERROR;
349 }
350 else
351 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
352 {
353 wchar_t wc;
354 const char *p;
355 offsets_needed:
356 remain_len = end_idx - byte_idx;
357 prev_st = pstr->cur_state;
358 if (BE (pstr->trans != NULL, 0))
359 {
360 int i, ch;
361
362 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
363 {
364 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
365 buf[i] = pstr->trans[ch];
366 }
367 p = (const char *) buf;
368 }
369 else
370 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
371 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
372 if (BE ((size_t) (mbclen + 2) > 2, 1))
373 {
374 wchar_t wcu = wc;
375 if (iswlower (wc))
376 {
377 size_t mbcdlen;
378
379 wcu = towupper (wc);
380 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
381 if (BE (mbclen == mbcdlen, 1))
382 memcpy (pstr->mbs + byte_idx, buf, mbclen);
383 else if (mbcdlen != (size_t) -1)
384 {
385 size_t i;
386
387 if (byte_idx + mbcdlen > pstr->bufs_len)
388 {
389 pstr->cur_state = prev_st;
390 break;
391 }
392
393 if (pstr->offsets == NULL)
394 {
395 pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
396
397 if (pstr->offsets == NULL)
398 return REG_ESPACE;
399 }
400 if (!pstr->offsets_needed)
401 {
402 for (i = 0; i < (size_t) byte_idx; ++i)
403 pstr->offsets[i] = i;
404 pstr->offsets_needed = 1;
405 }
406
407 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
408 pstr->wcs[byte_idx] = wcu;
409 pstr->offsets[byte_idx] = src_idx;
410 for (i = 1; i < mbcdlen; ++i)
411 {
412 pstr->offsets[byte_idx + i]
413 = src_idx + (i < mbclen ? i : mbclen - 1);
414 pstr->wcs[byte_idx + i] = WEOF;
415 }
416 pstr->len += mbcdlen - mbclen;
417 if (pstr->raw_stop > src_idx)
418 pstr->stop += mbcdlen - mbclen;
419 end_idx = (pstr->bufs_len > pstr->len)
420 ? pstr->len : pstr->bufs_len;
421 byte_idx += mbcdlen;
422 src_idx += mbclen;
423 continue;
424 }
425 else
426 memcpy (pstr->mbs + byte_idx, p, mbclen);
427 }
428 else
429 memcpy (pstr->mbs + byte_idx, p, mbclen);
430
431 if (BE (pstr->offsets_needed != 0, 0))
432 {
433 size_t i;
434 for (i = 0; i < mbclen; ++i)
435 pstr->offsets[byte_idx + i] = src_idx + i;
436 }
437 src_idx += mbclen;
438
439 pstr->wcs[byte_idx++] = wcu;
440 /* Write paddings. */
441 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
442 pstr->wcs[byte_idx++] = WEOF;
443 }
444 else if (mbclen == (size_t) -1 || mbclen == 0)
445 {
446 /* It is an invalid character or '\0'. Just use the byte. */
447 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
448
449 if (BE (pstr->trans != NULL, 0))
450 ch = pstr->trans [ch];
451 pstr->mbs[byte_idx] = ch;
452
453 if (BE (pstr->offsets_needed != 0, 0))
454 pstr->offsets[byte_idx] = src_idx;
455 ++src_idx;
456
457 /* And also cast it to wide char. */
458 pstr->wcs[byte_idx++] = (wchar_t) ch;
459 if (BE (mbclen == (size_t) -1, 0))
460 pstr->cur_state = prev_st;
461 }
462 else
463 {
464 /* The buffer doesn't have enough space, finish to build. */
465 pstr->cur_state = prev_st;
466 break;
467 }
468 }
469 pstr->valid_len = byte_idx;
470 pstr->valid_raw_len = src_idx;
471 return REG_NOERROR;
472}
473
474/* Skip characters until the index becomes greater than NEW_RAW_IDX.
475 Return the index. */
476
477static Idx
478internal_function
479re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
480{
481 mbstate_t prev_st;
482 Idx rawbuf_idx;
483 size_t mbclen;
484 wchar_t wc = 0;
485
486 /* Skip the characters which are not necessary to check. */
487 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
488 rawbuf_idx < new_raw_idx;)
489 {
490 Idx remain_len;
491 remain_len = pstr->len - rawbuf_idx;
492 prev_st = pstr->cur_state;
493 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
494 remain_len, &pstr->cur_state);
495 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
496 {
497 /* We treat these cases as a singlebyte character. */
498 mbclen = 1;
499 pstr->cur_state = prev_st;
500 }
501 /* Then proceed the next character. */
502 rawbuf_idx += mbclen;
503 }
504 *last_wc = (wint_t) wc;
505 return rawbuf_idx;
506}
507#endif /* RE_ENABLE_I18N */
508
509/* Build the buffer PSTR->MBS, and apply the translation if we need.
510 This function is used in case of REG_ICASE. */
511
512static void
513internal_function
514build_upper_buffer (re_string_t *pstr)
515{
516 Idx char_idx, end_idx;
517 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
518
519 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
520 {
521 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
522 if (BE (pstr->trans != NULL, 0))
523 ch = pstr->trans[ch];
524 if (islower (ch))
525 pstr->mbs[char_idx] = toupper (ch);
526 else
527 pstr->mbs[char_idx] = ch;
528 }
529 pstr->valid_len = char_idx;
530 pstr->valid_raw_len = char_idx;
531}
532
533/* Apply TRANS to the buffer in PSTR. */
534
535static void
536internal_function
537re_string_translate_buffer (re_string_t *pstr)
538{
539 Idx buf_idx, end_idx;
540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
541
542 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
543 {
544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
545 pstr->mbs[buf_idx] = pstr->trans[ch];
546 }
547
548 pstr->valid_len = buf_idx;
549 pstr->valid_raw_len = buf_idx;
550}
551
552/* This function re-construct the buffers.
553 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
554 convert to upper case in case of REG_ICASE, apply translation. */
555
556static reg_errcode_t
557internal_function
558re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
559{
560 Idx offset;
561
562 if (BE (pstr->raw_mbs_idx <= idx, 0))
563 offset = idx - pstr->raw_mbs_idx;
564 else
565 {
566 /* Reset buffer. */
567#ifdef RE_ENABLE_I18N
568 if (pstr->mb_cur_max > 1)
569 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
570#endif /* RE_ENABLE_I18N */
571 pstr->len = pstr->raw_len;
572 pstr->stop = pstr->raw_stop;
573 pstr->valid_len = 0;
574 pstr->raw_mbs_idx = 0;
575 pstr->valid_raw_len = 0;
576 pstr->offsets_needed = 0;
577 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
578 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
579 if (!pstr->mbs_allocated)
580 pstr->mbs = (unsigned char *) pstr->raw_mbs;
581 offset = idx;
582 }
583
584 if (BE (offset != 0, 1))
585 {
586 /* Are the characters which are already checked remain? */
587 if (BE (offset < pstr->valid_raw_len, 1)
588#ifdef RE_ENABLE_I18N
589 /* Handling this would enlarge the code too much.
590 Accept a slowdown in that case. */
591 && pstr->offsets_needed == 0
592#endif
593 )
594 {
595 /* Yes, move them to the front of the buffer. */
596 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
597#ifdef RE_ENABLE_I18N
598 if (pstr->mb_cur_max > 1)
599 memmove (pstr->wcs, pstr->wcs + offset,
600 (pstr->valid_len - offset) * sizeof (wint_t));
601#endif /* RE_ENABLE_I18N */
602 if (BE (pstr->mbs_allocated, 0))
603 memmove (pstr->mbs, pstr->mbs + offset,
604 pstr->valid_len - offset);
605 pstr->valid_len -= offset;
606 pstr->valid_raw_len -= offset;
607#if DEBUG
608 assert (pstr->valid_len > 0);
609#endif
610 }
611 else
612 {
613 /* No, skip all characters until IDX. */
614#ifdef RE_ENABLE_I18N
615 if (BE (pstr->offsets_needed, 0))
616 {
617 pstr->len = pstr->raw_len - idx + offset;
618 pstr->stop = pstr->raw_stop - idx + offset;
619 pstr->offsets_needed = 0;
620 }
621#endif
622 pstr->valid_len = 0;
623 pstr->valid_raw_len = 0;
624#ifdef RE_ENABLE_I18N
625 if (pstr->mb_cur_max > 1)
626 {
627 Idx wcs_idx;
628 wint_t wc = WEOF;
629
630 if (pstr->is_utf8)
631 {
632 const unsigned char *raw, *p, *q, *end;
633
634 /* Special case UTF-8. Multi-byte chars start with any
635 byte other than 0x80 - 0xbf. */
636 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
637 end = raw + (offset - pstr->mb_cur_max);
638 for (p = raw + offset - 1; p >= end; --p)
639 if ((*p & 0xc0) != 0x80)
640 {
641 mbstate_t cur_state;
642 wchar_t wc2;
643 Idx mlen = raw + pstr->len - p;
644 unsigned char buf[6];
645 size_t mbclen;
646
647 q = p;
648 if (BE (pstr->trans != NULL, 0))
649 {
650 int i = mlen < 6 ? mlen : 6;
651 while (--i >= 0)
652 buf[i] = pstr->trans[p[i]];
653 q = buf;
654 }
655 /* XXX Don't use mbrtowc, we know which conversion
656 to use (UTF-8 -> UCS4). */
657 memset (&cur_state, 0, sizeof (cur_state));
658 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
659 &cur_state);
660 if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
661 {
662 memset (&pstr->cur_state, '\0',
663 sizeof (mbstate_t));
664 pstr->valid_len = mbclen - (raw + offset - p);
665 wc = wc2;
666 }
667 break;
668 }
669 }
670
671 if (wc == WEOF)
672 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
673 if (BE (pstr->valid_len, 0))
674 {
675 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
676 pstr->wcs[wcs_idx] = WEOF;
677 if (pstr->mbs_allocated)
678 memset (pstr->mbs, -1, pstr->valid_len);
679 }
680 pstr->valid_raw_len = pstr->valid_len;
681 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
682 && IS_WIDE_WORD_CHAR (wc))
683 ? CONTEXT_WORD
684 : ((IS_WIDE_NEWLINE (wc)
685 && pstr->newline_anchor)
686 ? CONTEXT_NEWLINE : 0));
687 }
688 else
689#endif /* RE_ENABLE_I18N */
690 {
691 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
692 if (pstr->trans)
693 c = pstr->trans[c];
694 pstr->tip_context = (bitset_contain (pstr->word_char, c)
695 ? CONTEXT_WORD
696 : ((IS_NEWLINE (c) && pstr->newline_anchor)
697 ? CONTEXT_NEWLINE : 0));
698 }
699 }
700 if (!BE (pstr->mbs_allocated, 0))
701 pstr->mbs += offset;
702 }
703 pstr->raw_mbs_idx = idx;
704 pstr->len -= offset;
705 pstr->stop -= offset;
706
707 /* Then build the buffers. */
708#ifdef RE_ENABLE_I18N
709 if (pstr->mb_cur_max > 1)
710 {
711 if (pstr->icase)
712 {
713 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
714 if (BE (ret != REG_NOERROR, 0))
715 return ret;
716 }
717 else
718 build_wcs_buffer (pstr);
719 }
720 else
721#endif /* RE_ENABLE_I18N */
722 if (BE (pstr->mbs_allocated, 0))
723 {
724 if (pstr->icase)
725 build_upper_buffer (pstr);
726 else if (pstr->trans != NULL)
727 re_string_translate_buffer (pstr);
728 }
729 else
730 pstr->valid_len = pstr->len;
731
732 pstr->cur_idx = 0;
733 return REG_NOERROR;
734}
735
736static unsigned char
737internal_function __attribute ((pure))
738re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
739{
740 int ch;
741 Idx off;
742
743 /* Handle the common (easiest) cases first. */
744 if (BE (!pstr->mbs_allocated, 1))
745 return re_string_peek_byte (pstr, idx);
746
747#ifdef RE_ENABLE_I18N
748 if (pstr->mb_cur_max > 1
749 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
750 return re_string_peek_byte (pstr, idx);
751#endif
752
753 off = pstr->cur_idx + idx;
754#ifdef RE_ENABLE_I18N
755 if (pstr->offsets_needed)
756 off = pstr->offsets[off];
757#endif
758
759 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
760
761#ifdef RE_ENABLE_I18N
762 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
763 this function returns CAPITAL LETTER I instead of first byte of
764 DOTLESS SMALL LETTER I. The latter would confuse the parser,
765 since peek_byte_case doesn't advance cur_idx in any way. */
766 if (pstr->offsets_needed && !isascii (ch))
767 return re_string_peek_byte (pstr, idx);
768#endif
769
770 return ch;
771}
772
773static unsigned char
774internal_function __attribute ((pure))
775re_string_fetch_byte_case (re_string_t *pstr)
776{
777 if (BE (!pstr->mbs_allocated, 1))
778 return re_string_fetch_byte (pstr);
779
780#ifdef RE_ENABLE_I18N
781 if (pstr->offsets_needed)
782 {
783 Idx off;
784 int ch;
785
786 /* For tr_TR.UTF-8 [[:islower:]] there is
787 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
788 in that case the whole multi-byte character and return
789 the original letter. On the other side, with
790 [[: DOTLESS SMALL LETTER I return [[:I, as doing
791 anything else would complicate things too much. */
792
793 if (!re_string_first_byte (pstr, pstr->cur_idx))
794 return re_string_fetch_byte (pstr);
795
796 off = pstr->offsets[pstr->cur_idx];
797 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
798
799 if (! isascii (ch))
800 return re_string_fetch_byte (pstr);
801
802 re_string_skip_bytes (pstr,
803 re_string_char_size_at (pstr, pstr->cur_idx));
804 return ch;
805 }
806#endif
807
808 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
809}
810
811static void
812internal_function
813re_string_destruct (re_string_t *pstr)
814{
815#ifdef RE_ENABLE_I18N
816 re_free (pstr->wcs);
817 re_free (pstr->offsets);
818#endif /* RE_ENABLE_I18N */
819 if (pstr->mbs_allocated)
820 re_free (pstr->mbs);
821}
822
823/* Return the context at IDX in INPUT. */
824
825static unsigned int
826internal_function
827re_string_context_at (const re_string_t *input, Idx idx, int eflags)
828{
829 int c;
830 if (BE (! REG_VALID_INDEX (idx), 0))
831 /* In this case, we use the value stored in input->tip_context,
832 since we can't know the character in input->mbs[-1] here. */
833 return input->tip_context;
834 if (BE (idx == input->len, 0))
835 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
836 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
837#ifdef RE_ENABLE_I18N
838 if (input->mb_cur_max > 1)
839 {
840 wint_t wc;
841 Idx wc_idx = idx;
842 while(input->wcs[wc_idx] == WEOF)
843 {
844#ifdef DEBUG
845 /* It must not happen. */
846 assert (REG_VALID_INDEX (wc_idx));
847#endif
848 --wc_idx;
849 if (! REG_VALID_INDEX (wc_idx))
850 return input->tip_context;
851 }
852 wc = input->wcs[wc_idx];
853 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
854 return CONTEXT_WORD;
855 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
856 ? CONTEXT_NEWLINE : 0);
857 }
858 else
859#endif
860 {
861 c = re_string_byte_at (input, idx);
862 if (bitset_contain (input->word_char, c))
863 return CONTEXT_WORD;
864 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
865 }
866}
867
868/* Functions for set operation. */
869
870static reg_errcode_t
871internal_function
872re_node_set_alloc (re_node_set *set, Idx size)
873{
874 set->alloc = size;
875 set->nelem = 0;
876 set->elems = re_xmalloc (Idx, size);
877 if (BE (set->elems == NULL, 0))
878 return REG_ESPACE;
879 return REG_NOERROR;
880}
881
882static reg_errcode_t
883internal_function
884re_node_set_init_1 (re_node_set *set, Idx elem)
885{
886 set->alloc = 1;
887 set->nelem = 1;
888 set->elems = re_malloc (Idx, 1);
889 if (BE (set->elems == NULL, 0))
890 {
891 set->alloc = set->nelem = 0;
892 return REG_ESPACE;
893 }
894 set->elems[0] = elem;
895 return REG_NOERROR;
896}
897
898static reg_errcode_t
899internal_function
900re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
901{
902 set->alloc = 2;
903 set->elems = re_malloc (Idx, 2);
904 if (BE (set->elems == NULL, 0))
905 return REG_ESPACE;
906 if (elem1 == elem2)
907 {
908 set->nelem = 1;
909 set->elems[0] = elem1;
910 }
911 else
912 {
913 set->nelem = 2;
914 if (elem1 < elem2)
915 {
916 set->elems[0] = elem1;
917 set->elems[1] = elem2;
918 }
919 else
920 {
921 set->elems[0] = elem2;
922 set->elems[1] = elem1;
923 }
924 }
925 return REG_NOERROR;
926}
927
928static reg_errcode_t
929internal_function
930re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
931{
932 dest->nelem = src->nelem;
933 if (src->nelem > 0)
934 {
935 dest->alloc = dest->nelem;
936 dest->elems = re_malloc (Idx, dest->alloc);
937 if (BE (dest->elems == NULL, 0))
938 {
939 dest->alloc = dest->nelem = 0;
940 return REG_ESPACE;
941 }
942 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
943 }
944 else
945 re_node_set_init_empty (dest);
946 return REG_NOERROR;
947}
948
949/* Add to DEST every element that occurs in both SRC1 and SRC2 and is not
   already present in DEST.  DEST is updated in place and grown when its
   capacity is too small.
   Return REG_NOERROR on success (including when SRC1 or SRC2 is empty,
   in which case DEST is untouched) or REG_ESPACE when DEST cannot grow.
   Note: We assume dest->elems is NULL, when dest->alloc is 0.
   NOTE(review): the scans below compare elements from the highest index
   downwards, which presumes all three sets are sorted in ascending
   order -- confirm against the callers.  */
952
953static reg_errcode_t
954internal_function
955re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
956 const re_node_set *src2)
957{
958 Idx i1, i2, is, id, delta, sbase;
/* An empty operand makes the intersection empty: nothing to add.  */
959 if (src1->nelem == 0 || src2->nelem == 0)
960 return REG_NOERROR;
961
962 /* We need dest->nelem + 2 * elems_in_intersection; this is a
963 conservative estimate. */
964 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
965 {
966 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
967 Idx *new_elems;
/* Wrap-around check on the sums; the sizeof test makes it take effect
   only when Idx is narrower than three bytes.  */
968 if (sizeof (Idx) < 3
969 && (new_alloc < dest->alloc
970 || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
971 return REG_ESPACE;
972 new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
973 if (BE (new_elems == NULL, 0))
974 return REG_ESPACE;
975 dest->elems = new_elems;
976 dest->alloc = new_alloc;
977 }
978
979 /* Find the items in the intersection of SRC1 and SRC2, and copy
980 into the top of DEST those that are not already in DEST itself. */
/* SBASE is the lowest occupied slot of the staging area; it starts just
   past the area and moves down by one for each staged element.  */
981 sbase = dest->nelem + src1->nelem + src2->nelem;
982 i1 = src1->nelem - 1;
983 i2 = src2->nelem - 1;
984 id = dest->nelem - 1;
985 for (;;)
986 {
987 if (src1->elems[i1] == src2->elems[i2])
988 {
989 /* Try to find the item in DEST. Maybe we could binary search? */
990 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
991 --id;
992
/* Stage the common element only when DEST does not hold it yet.  */
993 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
994 dest->elems[--sbase] = src1->elems[i1];
995
/* Stop as soon as either source set is exhausted.  */
996 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
997 break;
998 }
999
1000 /* Lower the highest of the two items. */
1001 else if (src1->elems[i1] < src2->elems[i2])
1002 {
1003 if (! REG_VALID_INDEX (--i2))
1004 break;
1005 }
1006 else
1007 {
1008 if (! REG_VALID_INDEX (--i1))
1009 break;
1010 }
1011 }
1012
/* ID: last original DEST element.  IS: last staged element.
   DELTA: number of staged elements still to be placed.  */
1013 id = dest->nelem - 1;
1014 is = dest->nelem + src1->nelem + src2->nelem - 1;
1015 delta = is - sbase + 1;
1016
1017 /* Now copy. When DELTA becomes zero, the remaining
1018 DEST elements are already in place; this is more or
1019 less the same loop that is in re_node_set_merge. */
1020 dest->nelem += delta;
1021 if (delta > 0 && REG_VALID_INDEX (id))
1022 for (;;)
1023 {
1024 if (dest->elems[is] > dest->elems[id])
1025 {
1026 /* Copy from the top. */
1027 dest->elems[id + delta--] = dest->elems[is--];
1028 if (delta == 0)
1029 break;
1030 }
1031 else
1032 {
1033 /* Slide from the bottom. */
1034 dest->elems[id + delta] = dest->elems[id];
1035 if (! REG_VALID_INDEX (--id))
1036 break;
1037 }
1038 }
1039
1040 /* Copy remaining SRC elements. */
/* DELTA is zero here when the loop above placed everything, so this
   copy then moves no bytes.  */
1041 memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1042
1043 return REG_NOERROR;
1044}
1045
1046/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1047 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1048
1049static reg_errcode_t
1050internal_function
1051re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1052 const re_node_set *src2)
1053{
1054 Idx i1, i2, id;
1055 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1056 {
1057 dest->alloc = src1->nelem + src2->nelem;
1058 if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1059 return REG_ESPACE;
1060 dest->elems = re_xmalloc (Idx, dest->alloc);
1061 if (BE (dest->elems == NULL, 0))
1062 return REG_ESPACE;
1063 }
1064 else
1065 {
1066 if (src1 != NULL && src1->nelem > 0)
1067 return re_node_set_init_copy (dest, src1);
1068 else if (src2 != NULL && src2->nelem > 0)
1069 return re_node_set_init_copy (dest, src2);
1070 else
1071 re_node_set_init_empty (dest);
1072 return REG_NOERROR;
1073 }
1074 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1075 {
1076 if (src1->elems[i1] > src2->elems[i2])
1077 {
1078 dest->elems[id++] = src2->elems[i2++];
1079 continue;
1080 }
1081 if (src1->elems[i1] == src2->elems[i2])
1082 ++i2;
1083 dest->elems[id++] = src1->elems[i1++];
1084 }
1085 if (i1 < src1->nelem)
1086 {
1087 memcpy (dest->elems + id, src1->elems + i1,
1088 (src1->nelem - i1) * sizeof dest->elems[0]);
1089 id += src1->nelem - i1;
1090 }
1091 else if (i2 < src2->nelem)
1092 {
1093 memcpy (dest->elems + id, src2->elems + i2,
1094 (src2->nelem - i2) * sizeof dest->elems[0]);
1095 id += src2->nelem - i2;
1096 }
1097 dest->nelem = id;
1098 return REG_NOERROR;
1099}
1100
1101/* Add to DEST every element of SRC that DEST does not already hold, so
   that DEST becomes the union of both sets.  DEST is updated in place
   and grown when its capacity is too small.
   Return REG_NOERROR on success (including when SRC is NULL or empty,
   in which case DEST is untouched) or REG_ESPACE when DEST cannot grow.
   NOTE(review): the scans below compare elements from the highest index
   downwards, which presumes both sets are sorted in ascending order --
   confirm against the callers.  */
1103
1104static reg_errcode_t
1105internal_function
1106re_node_set_merge (re_node_set *dest, const re_node_set *src)
1107{
1108 Idx is, id, sbase, delta;
1109 if (src == NULL || src->nelem == 0)
1110 return REG_NOERROR;
/* Wrap-around check on the size arithmetic used below; the sizeof test
   makes it take effect only when Idx is narrower than three bytes.  */
1111 if (sizeof (Idx) < 3
1112 && ((Idx) (2 * src->nelem) < src->nelem
1113 || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1114 return REG_ESPACE;
/* Room is needed for DEST's elements plus twice SRC's: the staging area
   at the top of the buffer starts at dest->nelem + 2 * src->nelem.  */
1115 if (dest->alloc < 2 * src->nelem + dest->nelem)
1116 {
1117 Idx new_alloc = src->nelem + dest->alloc;
1118 Idx *new_buffer;
1119 if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1120 return REG_ESPACE;
1121 new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1122 if (BE (new_buffer == NULL, 0))
1123 return REG_ESPACE;
1124 dest->elems = new_buffer;
1125 dest->alloc = new_alloc;
1126 }
1127
/* An empty DEST simply becomes a copy of SRC.  */
1128 if (BE (dest->nelem == 0, 0))
1129 {
1130 dest->nelem = src->nelem;
1131 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1132 return REG_NOERROR;
1133 }
1134
1135 /* Copy into the top of DEST the items of SRC that are not
1136 found in DEST. Maybe we could binary search in DEST? */
1137 for (sbase = dest->nelem + 2 * src->nelem,
1138 is = src->nelem - 1, id = dest->nelem - 1;
1139 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1140 {
1141 if (dest->elems[id] == src->elems[is])
1142 is--, id--;
1143 else if (dest->elems[id] < src->elems[is])
1144 dest->elems[--sbase] = src->elems[is--];
1145 else /* if (dest->elems[id] > src->elems[is]) */
1146 --id;
1147 }
1148
1149 if (REG_VALID_INDEX (is))
1150 {
1151 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1152 sbase -= is + 1;
1153 memcpy (dest->elems + sbase, src->elems,
1154 (is + 1) * sizeof dest->elems[0]);
1155 }
1156
/* ID: last original DEST element.  IS: last staged element.
   DELTA: number of staged elements; zero means SRC added nothing.  */
1157 id = dest->nelem - 1;
1158 is = dest->nelem + 2 * src->nelem - 1;
1159 delta = is - sbase + 1;
1160 if (delta == 0)
1161 return REG_NOERROR;
1162
1163 /* Now copy. When DELTA becomes zero, the remaining
1164 DEST elements are already in place. */
1165 dest->nelem += delta;
1166 for (;;)
1167 {
1168 if (dest->elems[is] > dest->elems[id])
1169 {
1170 /* Copy from the top. */
1171 dest->elems[id + delta--] = dest->elems[is--];
1172 if (delta == 0)
1173 break;
1174 }
1175 else
1176 {
1177 /* Slide from the bottom. */
1178 dest->elems[id + delta] = dest->elems[id];
1179 if (! REG_VALID_INDEX (--id))
1180 {
1181 /* Copy remaining SRC elements. */
1182 memcpy (dest->elems, dest->elems + sbase,
1183 delta * sizeof dest->elems[0]);
1184 break;
1185 }
1186 }
1187 }
1188
1189 return REG_NOERROR;
1190}
1191
1192/* Insert the new element ELEM to the re_node_set* SET.
1193 SET should not already have ELEM.
1194 Return true if successful. */
1195
1196static bool
1197internal_function
1198re_node_set_insert (re_node_set *set, Idx elem)
1199{
1200 Idx idx;
1201 /* In case the set is empty. */
1202 if (set->alloc == 0)
1203 return re_node_set_init_1 (set, elem) == REG_NOERROR;
1204
1205 if (BE (set->nelem, 0) == 0)
1206 {
1207 /* We already guaranteed above that set->alloc != 0. */
1208 set->elems[0] = elem;
1209 ++set->nelem;
1210 return true;
1211 }
1212
1213 /* Realloc if we need. */
1214 if (set->alloc == set->nelem)
1215 {
1216 Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1217 if (BE (new_elems == NULL, 0))
1218 return false;
1219 set->elems = new_elems;
1220 }
1221
1222 /* Move the elements which follows the new element. Test the
1223 first element separately to skip a check in the inner loop. */
1224 if (elem < set->elems[0])
1225 {
1226 idx = 0;
1227 for (idx = set->nelem; idx > 0; idx--)
1228 set->elems[idx] = set->elems[idx - 1];
1229 }
1230 else
1231 {
1232 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1233 set->elems[idx] = set->elems[idx - 1];
1234 }
1235
1236 /* Insert the new element. */
1237 set->elems[idx] = elem;
1238 ++set->nelem;
1239 return true;
1240}
1241
1242/* Insert the new element ELEM to the re_node_set* SET.
1243 SET should not already have any element greater than or equal to ELEM.
1244 Return true if successful. */
1245
1246static bool
1247internal_function
1248re_node_set_insert_last (re_node_set *set, Idx elem)
1249{
1250 /* Realloc if we need. */
1251 if (set->alloc == set->nelem)
1252 {
1253 Idx *new_elems;
1254 new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1255 if (BE (new_elems == NULL, 0))
1256 return false;
1257 set->elems = new_elems;
1258 }
1259
1260 /* Insert the new element. */
1261 set->elems[set->nelem++] = elem;
1262 return true;
1263}
1264
1265/* Compare two node sets SET1 and SET2.
1266 Return true if SET1 and SET2 are equivalent. */
1267
1268static bool
1269internal_function __attribute ((pure))
1270re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1271{
1272 Idx i;
1273 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1274 return false;
1275 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1276 if (set1->elems[i] != set2->elems[i])
1277 return false;
1278 return true;
1279}
1280
1281/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1282
1283static Idx
1284internal_function __attribute ((pure))
1285re_node_set_contains (const re_node_set *set, Idx elem)
1286{
1287 __re_size_t idx, right, mid;
1288 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1289 return 0;
1290
1291 /* Binary search the element. */
1292 idx = 0;
1293 right = set->nelem - 1;
1294 while (idx < right)
1295 {
1296 mid = (idx + right) / 2;
1297 if (set->elems[mid] < elem)
1298 idx = mid + 1;
1299 else
1300 right = mid;
1301 }
1302 return set->elems[idx] == elem ? idx + 1 : 0;
1303}
1304
1305static void
1306internal_function
1307re_node_set_remove_at (re_node_set *set, Idx idx)
1308{
1309 if (idx < 0 || idx >= set->nelem)
1310 return;
1311 --set->nelem;
1312 for (; idx < set->nelem; idx++)
1313 set->elems[idx] = set->elems[idx + 1];
1314}
1315
1316
1317/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1318 Or return REG_MISSING if an error occurred. */
1319
1320static Idx
1321internal_function
1322re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1323{
1324 int type = token.type;
1325 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1326 {
1327 Idx new_nodes_alloc = dfa->nodes_alloc;
1328 Idx *new_nexts, *new_indices;
1329 re_node_set *new_edests, *new_eclosures;
1330
1331 re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1332 &new_nodes_alloc);
1333 if (BE (new_nodes == NULL, 0))
1334 return REG_MISSING;
1335 dfa->nodes = new_nodes;
1336 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1337 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1338 new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1339 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1340 if (BE (new_nexts == NULL || new_indices == NULL
1341 || new_edests == NULL || new_eclosures == NULL, 0))
1342 return REG_MISSING;
1343 dfa->nexts = new_nexts;
1344 dfa->org_indices = new_indices;
1345 dfa->edests = new_edests;
1346 dfa->eclosures = new_eclosures;
1347 dfa->nodes_alloc = new_nodes_alloc;
1348 }
1349 dfa->nodes[dfa->nodes_len] = token;
1350 dfa->nodes[dfa->nodes_len].constraint = 0;
1351#ifdef RE_ENABLE_I18N
1352 dfa->nodes[dfa->nodes_len].accept_mb =
1353 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1354#endif
1355 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1356 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1357 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1358 return dfa->nodes_len++;
1359}
1360
1361static inline re_hashval_t
1362internal_function
1363calc_state_hash (const re_node_set *nodes, unsigned int context)
1364{
1365 re_hashval_t hash = nodes->nelem + context;
1366 Idx i;
1367 for (i = 0 ; i < nodes->nelem ; i++)
1368 hash += nodes->elems[i];
1369 return hash;
1370}
1371
1372/* Search for the state whose node_set is equivalent to NODES.
1373 Return the pointer to the state, if we found it in the DFA.
1374 Otherwise create the new one and return it. In case of an error
1375 return NULL and set the error code in ERR.
1376 Note: - We assume NULL as the invalid state, then it is possible that
1377 return value is NULL and ERR is REG_NOERROR.
1378 - We never return non-NULL value in case of any errors, it is for
1379 optimization. */
1380
1381static re_dfastate_t*
1382internal_function
1383re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1384{
1385 re_hashval_t hash;
1386 re_dfastate_t *new_state;
1387 struct re_state_table_entry *spot;
1388 Idx i;
1389#ifdef lint
1390 /* Suppress bogus uninitialized-variable warnings. */
1391 *err = REG_NOERROR;
1392#endif
1393 if (BE (nodes->nelem == 0, 0))
1394 {
1395 *err = REG_NOERROR;
1396 return NULL;
1397 }
1398 hash = calc_state_hash (nodes, 0);
1399 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1400
1401 for (i = 0 ; i < spot->num ; i++)
1402 {
1403 re_dfastate_t *state = spot->array[i];
1404 if (hash != state->hash)
1405 continue;
1406 if (re_node_set_compare (&state->nodes, nodes))
1407 return state;
1408 }
1409
1410 /* There are no appropriate state in the dfa, create the new one. */
1411 new_state = create_ci_newstate (dfa, nodes, hash);
1412 if (BE (new_state != NULL, 1))
1413 return new_state;
1414 else
1415 {
1416 *err = REG_ESPACE;
1417 return NULL;
1418 }
1419}
1420
1421/* Search for the state whose node_set is equivalent to NODES and
1422 whose context is equivalent to CONTEXT.
1423 Return the pointer to the state, if we found it in the DFA.
1424 Otherwise create the new one and return it. In case of an error
1425 return NULL and set the error code in ERR.
1426 Note: - We assume NULL as the invalid state, then it is possible that
1427 return value is NULL and ERR is REG_NOERROR.
1428 - We never return non-NULL value in case of any errors, it is for
1429 optimization. */
1430
1431static re_dfastate_t*
1432internal_function
1433re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1434 const re_node_set *nodes, unsigned int context)
1435{
1436 re_hashval_t hash;
1437 re_dfastate_t *new_state;
1438 struct re_state_table_entry *spot;
1439 Idx i;
1440#ifdef lint
1441 /* Suppress bogus uninitialized-variable warnings. */
1442 *err = REG_NOERROR;
1443#endif
1444 if (nodes->nelem == 0)
1445 {
1446 *err = REG_NOERROR;
1447 return NULL;
1448 }
1449 hash = calc_state_hash (nodes, context);
1450 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1451
1452 for (i = 0 ; i < spot->num ; i++)
1453 {
1454 re_dfastate_t *state = spot->array[i];
1455 if (state->hash == hash
1456 && state->context == context
1457 && re_node_set_compare (state->entrance_nodes, nodes))
1458 return state;
1459 }
1460 /* There are no appropriate state in `dfa', create the new one. */
1461 new_state = create_cd_newstate (dfa, nodes, context, hash);
1462 if (BE (new_state != NULL, 1))
1463 return new_state;
1464 else
1465 {
1466 *err = REG_ESPACE;
1467 return NULL;
1468 }
1469}
1470
1471/* Finish initialization of the new state NEWSTATE, and using its hash value
1472 HASH put in the appropriate bucket of DFA's state table. Return value
1473 indicates the error code if failed. */
1474
1475static reg_errcode_t
1476internal_function
1477register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1478{
1479 struct re_state_table_entry *spot;
1480 reg_errcode_t err;
1481 Idx i;
1482
1483 newstate->hash = hash;
1484 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1485 if (BE (err != REG_NOERROR, 0))
1486 return REG_ESPACE;
1487 for (i = 0; i < newstate->nodes.nelem; i++)
1488 {
1489 Idx elem = newstate->nodes.elems[i];
1490 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1491 {
1492 bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1493 if (BE (! ok, 0))
1494 return REG_ESPACE;
1495 }
1496 }
1497
1498 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1499 if (BE (spot->alloc <= spot->num, 0))
1500 {
1501 Idx new_alloc = spot->num;
1502 re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1503 &new_alloc);
1504 if (BE (new_array == NULL, 0))
1505 return REG_ESPACE;
1506 spot->array = new_array;
1507 spot->alloc = new_alloc;
1508 }
1509 spot->array[spot->num++] = newstate;
1510 return REG_NOERROR;
1511}
1512
1513/* Create the new state which is independ of contexts.
1514 Return the new state if succeeded, otherwise return NULL. */
1515
1516static re_dfastate_t *
1517internal_function
1518create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1519 re_hashval_t hash)
1520{
1521 Idx i;
1522 reg_errcode_t err;
1523 re_dfastate_t *newstate;
1524
1525 newstate = re_calloc (re_dfastate_t, 1);
1526 if (BE (newstate == NULL, 0))
1527 return NULL;
1528 err = re_node_set_init_copy (&newstate->nodes, nodes);
1529 if (BE (err != REG_NOERROR, 0))
1530 {
1531 re_free (newstate);
1532 return NULL;
1533 }
1534
1535 newstate->entrance_nodes = &newstate->nodes;
1536 for (i = 0 ; i < nodes->nelem ; i++)
1537 {
1538 re_token_t *node = dfa->nodes + nodes->elems[i];
1539 re_token_type_t type = node->type;
1540 if (type == CHARACTER && !node->constraint)
1541 continue;
1542#ifdef RE_ENABLE_I18N
1543 newstate->accept_mb |= node->accept_mb;
1544#endif /* RE_ENABLE_I18N */
1545
1546 /* If the state has the halt node, the state is a halt state. */
1547 if (type == END_OF_RE)
1548 newstate->halt = 1;
1549 else if (type == OP_BACK_REF)
1550 newstate->has_backref = 1;
1551 else if (type == ANCHOR || node->constraint)
1552 newstate->has_constraint = 1;
1553 }
1554 err = register_state (dfa, newstate, hash);
1555 if (BE (err != REG_NOERROR, 0))
1556 {
1557 free_state (newstate);
1558 newstate = NULL;
1559 }
1560 return newstate;
1561}
1562
1563/* Create the new state which is depend on the context CONTEXT.
1564 Return the new state if succeeded, otherwise return NULL. */
1565
1566static re_dfastate_t *
1567internal_function
1568create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1569 unsigned int context, re_hashval_t hash)
1570{
1571 Idx i, nctx_nodes = 0;
1572 reg_errcode_t err;
1573 re_dfastate_t *newstate;
1574
1575 newstate = re_calloc (re_dfastate_t, 1);
1576 if (BE (newstate == NULL, 0))
1577 return NULL;
1578 err = re_node_set_init_copy (&newstate->nodes, nodes);
1579 if (BE (err != REG_NOERROR, 0))
1580 {
1581 re_free (newstate);
1582 return NULL;
1583 }
1584
1585 newstate->context = context;
1586 newstate->entrance_nodes = &newstate->nodes;
1587
1588 for (i = 0 ; i < nodes->nelem ; i++)
1589 {
1590 unsigned int constraint = 0;
1591 re_token_t *node = dfa->nodes + nodes->elems[i];
1592 re_token_type_t type = node->type;
1593 if (node->constraint)
1594 constraint = node->constraint;
1595
1596 if (type == CHARACTER && !constraint)
1597 continue;
1598#ifdef RE_ENABLE_I18N
1599 newstate->accept_mb |= node->accept_mb;
1600#endif /* RE_ENABLE_I18N */
1601
1602 /* If the state has the halt node, the state is a halt state. */
1603 if (type == END_OF_RE)
1604 newstate->halt = 1;
1605 else if (type == OP_BACK_REF)
1606 newstate->has_backref = 1;
1607 else if (type == ANCHOR)
1608 constraint = node->opr.ctx_type;
1609
1610 if (constraint)
1611 {
1612 if (newstate->entrance_nodes == &newstate->nodes)
1613 {
1614 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1615 if (BE (newstate->entrance_nodes == NULL, 0))
1616 {
1617 free_state (newstate);
1618 return NULL;
1619 }
1620 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1621 nctx_nodes = 0;
1622 newstate->has_constraint = 1;
1623 }
1624
1625 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1626 {
1627 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1628 ++nctx_nodes;
1629 }
1630 }
1631 }
1632 err = register_state (dfa, newstate, hash);
1633 if (BE (err != REG_NOERROR, 0))
1634 {
1635 free_state (newstate);
1636 newstate = NULL;
1637 }
1638 return newstate;
1639}
1640
1641static void
1642internal_function
1643free_state (re_dfastate_t *state)
1644{
1645 re_node_set_free (&state->non_eps_nodes);
1646 re_node_set_free (&state->inveclosure);
1647 if (state->entrance_nodes != &state->nodes)
1648 {
1649 re_node_set_free (state->entrance_nodes);
1650 re_free (state->entrance_nodes);
1651 }
1652 re_node_set_free (&state->nodes);
1653 re_free (state->word_trtable);
1654 re_free (state->trtable);
1655 re_free (state);
1656}
diff --git a/lib/regex_internal.h b/lib/regex_internal.h
new file mode 100644
index 0000000..a36ae4c
--- /dev/null
+++ b/lib/regex_internal.h
@@ -0,0 +1,911 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20#ifndef _REGEX_INTERNAL_H
21#define _REGEX_INTERNAL_H 1
22
23#include <assert.h>
24#include <ctype.h>
25#include <stdbool.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29
30#ifndef _LIBC
31# include "strcase.h"
32#endif
33
34#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
35# include <langinfo.h>
36#endif
37#if defined HAVE_LOCALE_H || defined _LIBC
38# include <locale.h>
39#endif
40#if defined HAVE_WCHAR_H || defined _LIBC
41# include <wchar.h>
42#endif /* HAVE_WCHAR_H || _LIBC */
43#if defined HAVE_WCTYPE_H || defined _LIBC
44# include <wctype.h>
45#endif /* HAVE_WCTYPE_H || _LIBC */
46#if defined _LIBC
47# include <bits/libc-lock.h>
48#else
49# define __libc_lock_define(CLASS,NAME)
50# define __libc_lock_init(NAME) do { } while (0)
51# define __libc_lock_lock(NAME) do { } while (0)
52# define __libc_lock_unlock(NAME) do { } while (0)
53#endif
54
55/* In case that the system doesn't have isblank(). */
56#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
57# define isblank(ch) ((ch) == ' ' || (ch) == '\t')
58#endif
59
60#ifdef _LIBC
61# ifndef _RE_DEFINE_LOCALE_FUNCTIONS
62# define _RE_DEFINE_LOCALE_FUNCTIONS 1
63# include <locale/localeinfo.h>
64# include <locale/elem-hash.h>
65# include <locale/coll-lookup.h>
66# endif
67#endif
68
69/* This is for other GNU distributions with internationalized messages. */
70#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
71# include <libintl.h>
72# ifdef _LIBC
73# undef gettext
74# define gettext(msgid) \
75 INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)
76# endif
77#else
78# define gettext(msgid) (msgid)
79#endif
80
81#ifndef gettext_noop
82/* This define is so xgettext can find the internationalizable
83 strings. */
84# define gettext_noop(String) String
85#endif
86
87#if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
88# define RE_ENABLE_I18N
89#endif
90
/* BE (EXPR, VAL) yields EXPR.  With GCC 3 or later it additionally
   passes VAL to __builtin_expect as the value EXPR is expected to
   have.  */
#if !defined __GNUC__ || __GNUC__ < 3
# define BE(expr, val) (expr)
#else
# define BE(expr, val) __builtin_expect (expr, val)
#endif
96
97/* Number of single-byte characters. */
98#define SBC_MAX 256
99
100#define COLL_ELEM_LEN_MAX 8
101
102/* The character which represents newline. */
103#define NEWLINE_CHAR '\n'
104#define WIDE_NEWLINE_CHAR L'\n'
105
106/* Rename to standard API for using out of glibc. */
107#ifndef _LIBC
108# define __wctype wctype
109# define __iswctype iswctype
110# define __btowc btowc
111# ifndef __mempcpy
112# define __mempcpy mempcpy
113# endif
114# define __wcrtomb wcrtomb
115# define __regfree regfree
116# define attribute_hidden
117#endif /* not _LIBC */
118
119#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
120# define __attribute(arg) __attribute__ (arg)
121#else
122# define __attribute(arg)
123#endif
124
125extern const char __re_error_msgid[] attribute_hidden;
126extern const size_t __re_error_msgid_idx[] attribute_hidden;
127
128typedef __re_idx_t Idx;
129
130/* Special return value for failure to match. */
131#define REG_MISSING ((Idx) -1)
132
133/* Special return value for internal error. */
134#define REG_ERROR ((Idx) -2)
135
136/* Test whether N is a valid index, and is not one of the above. */
137#ifdef _REGEX_LARGE_OFFSETS
138# define REG_VALID_INDEX(n) ((Idx) (n) < REG_ERROR)
139#else
140# define REG_VALID_INDEX(n) (0 <= (n))
141#endif
142
143/* Test whether N is a valid nonzero index. */
144#ifdef _REGEX_LARGE_OFFSETS
145# define REG_VALID_NONZERO_INDEX(n) ((Idx) ((n) - 1) < (Idx) (REG_ERROR - 1))
146#else
147# define REG_VALID_NONZERO_INDEX(n) (0 < (n))
148#endif
149
150/* A hash value, suitable for computing hash tables. */
151typedef __re_size_t re_hashval_t;
152
153/* An integer used to represent a set of bits. It must be unsigned,
154 and must be at least as wide as unsigned int. */
155typedef unsigned long int bitset_word;
156
157/* Maximum value of a bitset word. It must be useful in preprocessor
158 contexts, and must be consistent with bitset_word. */
159#define BITSET_WORD_MAX ULONG_MAX
160
161/* Number of bits in a bitset word. Avoid greater-than-32-bit
162 integers and unconditional shifts by more than 31 bits, as they're
163 not portable. */
164#if BITSET_WORD_MAX == 0xffffffff
165# define BITSET_WORD_BITS 32
166#elif BITSET_WORD_MAX >> 31 >> 5 == 1
167# define BITSET_WORD_BITS 36
168#elif BITSET_WORD_MAX >> 31 >> 16 == 1
169# define BITSET_WORD_BITS 48
170#elif BITSET_WORD_MAX >> 31 >> 28 == 1
171# define BITSET_WORD_BITS 60
172#elif BITSET_WORD_MAX >> 31 >> 31 >> 1 == 1
173# define BITSET_WORD_BITS 64
174#elif BITSET_WORD_MAX >> 31 >> 31 >> 9 == 1
175# define BITSET_WORD_BITS 72
176#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 3 == 1
177# define BITSET_WORD_BITS 128
178#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 7 == 1
179# define BITSET_WORD_BITS 256
180#elif BITSET_WORD_MAX >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 31 >> 7 > 1
181# define BITSET_WORD_BITS 257 /* any value > SBC_MAX will do here */
182# if BITSET_WORD_BITS <= SBC_MAX
183# error "Invalid SBC_MAX"
184# endif
185#else
186# error "Add case for new bitset_word size"
187#endif
188
189/* Number of bitset words in a bitset. */
190#define BITSET_WORDS ((SBC_MAX + BITSET_WORD_BITS - 1) / BITSET_WORD_BITS)
191
192typedef bitset_word bitset[BITSET_WORDS];
193typedef bitset_word *re_bitset_ptr_t;
194typedef const bitset_word *re_const_bitset_ptr_t;
195
196#define PREV_WORD_CONSTRAINT 0x0001
197#define PREV_NOTWORD_CONSTRAINT 0x0002
198#define NEXT_WORD_CONSTRAINT 0x0004
199#define NEXT_NOTWORD_CONSTRAINT 0x0008
200#define PREV_NEWLINE_CONSTRAINT 0x0010
201#define NEXT_NEWLINE_CONSTRAINT 0x0020
202#define PREV_BEGBUF_CONSTRAINT 0x0040
203#define NEXT_ENDBUF_CONSTRAINT 0x0080
204#define WORD_DELIM_CONSTRAINT 0x0100
205#define NOT_WORD_DELIM_CONSTRAINT 0x0200
206
207typedef enum
208{
209 INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
210 WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
211 WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
212 INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
213 LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
214 LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
215 BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
216 BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
217 WORD_DELIM = WORD_DELIM_CONSTRAINT,
218 NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
219} re_context_type;
220
221typedef struct
222{
223 Idx alloc;
224 Idx nelem;
225 Idx *elems;
226} re_node_set;
227
228typedef enum
229{
230 NON_TYPE = 0,
231
232 /* Node type, These are used by token, node, tree. */
233 CHARACTER = 1,
234 END_OF_RE = 2,
235 SIMPLE_BRACKET = 3,
236 OP_BACK_REF = 4,
237 OP_PERIOD = 5,
238#ifdef RE_ENABLE_I18N
239 COMPLEX_BRACKET = 6,
240 OP_UTF8_PERIOD = 7,
241#endif /* RE_ENABLE_I18N */
242
243 /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
244 when the debugger shows values of this enum type. */
245#define EPSILON_BIT 8
246 OP_OPEN_SUBEXP = EPSILON_BIT | 0,
247 OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
248 OP_ALT = EPSILON_BIT | 2,
249 OP_DUP_ASTERISK = EPSILON_BIT | 3,
250 ANCHOR = EPSILON_BIT | 4,
251
252 /* Tree type, these are used only by tree. */
253 CONCAT = 16,
254 SUBEXP = 17,
255
256 /* Token type, these are used only by token. */
257 OP_DUP_PLUS = 18,
258 OP_DUP_QUESTION,
259 OP_OPEN_BRACKET,
260 OP_CLOSE_BRACKET,
261 OP_CHARSET_RANGE,
262 OP_OPEN_DUP_NUM,
263 OP_CLOSE_DUP_NUM,
264 OP_NON_MATCH_LIST,
265 OP_OPEN_COLL_ELEM,
266 OP_CLOSE_COLL_ELEM,
267 OP_OPEN_EQUIV_CLASS,
268 OP_CLOSE_EQUIV_CLASS,
269 OP_OPEN_CHAR_CLASS,
270 OP_CLOSE_CHAR_CLASS,
271 OP_WORD,
272 OP_NOTWORD,
273 OP_SPACE,
274 OP_NOTSPACE,
275 BACK_SLASH
276
277} re_token_type_t;
278
279#ifdef RE_ENABLE_I18N
280typedef struct
281{
282 /* Multibyte characters. */
283 wchar_t *mbchars;
284
285 /* Collating symbols. */
286# ifdef _LIBC
287 int32_t *coll_syms;
288# endif
289
290 /* Equivalence classes. */
291# ifdef _LIBC
292 int32_t *equiv_classes;
293# endif
294
295 /* Range expressions. */
296# ifdef _LIBC
297 uint32_t *range_starts;
298 uint32_t *range_ends;
299# else /* not _LIBC */
300 wchar_t *range_starts;
301 wchar_t *range_ends;
302# endif /* not _LIBC */
303
304 /* Character classes. */
305 wctype_t *char_classes;
306
307 /* If this character set is the non-matching list. */
308 unsigned int non_match : 1;
309
310 /* # of multibyte characters. */
311 Idx nmbchars;
312
313 /* # of collating symbols. */
314 Idx ncoll_syms;
315
316 /* # of equivalence classes. */
317 Idx nequiv_classes;
318
319 /* # of range expressions. */
320 Idx nranges;
321
322 /* # of character classes. */
323 Idx nchar_classes;
324} re_charset_t;
325#endif /* RE_ENABLE_I18N */
326
327typedef struct
328{
329 union
330 {
331 unsigned char c; /* for CHARACTER */
332 re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */
333#ifdef RE_ENABLE_I18N
334 re_charset_t *mbcset; /* for COMPLEX_BRACKET */
335#endif /* RE_ENABLE_I18N */
336 Idx idx; /* for BACK_REF */
337 re_context_type ctx_type; /* for ANCHOR */
338 } opr;
339#if __GNUC__ >= 2
340 re_token_type_t type : 8;
341#else
342 re_token_type_t type;
343#endif
344 unsigned int constraint : 10; /* context constraint */
345 unsigned int duplicated : 1;
346 unsigned int opt_subexp : 1;
347#ifdef RE_ENABLE_I18N
348 unsigned int accept_mb : 1;
349 /* These 2 bits can be moved into the union if needed (e.g. if running out
350 of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */
351 unsigned int mb_partial : 1;
352#endif
353 unsigned int word_char : 1;
354} re_token_t;
355
356#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
357
358struct re_string_t
359{
360 /* Indicate the raw buffer which is the original string passed as an
361 argument of regexec(), re_search(), etc. */
362 const unsigned char *raw_mbs;
363 /* Store the multibyte string. In case of "case insensitive mode" like
364 REG_ICASE, upper cases of the string are stored, otherwise MBS points
365 the same address that RAW_MBS points. */
366 unsigned char *mbs;
367#ifdef RE_ENABLE_I18N
368 /* Store the wide character string which is corresponding to MBS. */
369 wint_t *wcs;
370 Idx *offsets;
371 mbstate_t cur_state;
372#endif
373 /* Index in RAW_MBS. Each character mbs[i] corresponds to
374 raw_mbs[raw_mbs_idx + i]. */
375 Idx raw_mbs_idx;
376 /* The length of the valid characters in the buffers. */
377 Idx valid_len;
378 /* The corresponding number of bytes in raw_mbs array. */
379 Idx valid_raw_len;
380 /* The length of the buffers MBS and WCS. */
381 Idx bufs_len;
382 /* The index in MBS, which is updated by re_string_fetch_byte. */
383 Idx cur_idx;
384 /* length of RAW_MBS array. */
385 Idx raw_len;
386 /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN. */
387 Idx len;
388 /* End of the buffer may be shorter than its length in the cases such
389 as re_match_2, re_search_2. Then, we use STOP for end of the buffer
390 instead of LEN. */
391 Idx raw_stop;
392 /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS. */
393 Idx stop;
394
395 /* The context of mbs[0]. We store the context independently, since
396 the context of mbs[0] may be different from raw_mbs[0], which is
397 the beginning of the input string. */
398 unsigned int tip_context;
399 /* The translation passed as a part of an argument of re_compile_pattern. */
400 unsigned REG_TRANSLATE_TYPE trans;
401 /* Copy of re_dfa_t's word_char. */
402 re_const_bitset_ptr_t word_char;
403 /* true if REG_ICASE. */
404 unsigned char icase;
405 unsigned char is_utf8;
406 unsigned char map_notascii;
407 unsigned char mbs_allocated;
408 unsigned char offsets_needed;
409 unsigned char newline_anchor;
410 unsigned char word_ops_used;
411 int mb_cur_max;
412};
413typedef struct re_string_t re_string_t;
414
415
416struct re_dfa_t;
417typedef struct re_dfa_t re_dfa_t;
418
419#ifndef _LIBC
420# ifdef __i386__
421# define internal_function __attribute ((regparm (3), stdcall))
422# else
423# define internal_function
424# endif
425#endif
426
427static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
428 Idx new_buf_len)
429 internal_function;
430#ifdef RE_ENABLE_I18N
431static void build_wcs_buffer (re_string_t *pstr) internal_function;
432static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr)
433 internal_function;
434#endif /* RE_ENABLE_I18N */
435static void build_upper_buffer (re_string_t *pstr) internal_function;
436static void re_string_translate_buffer (re_string_t *pstr) internal_function;
437static unsigned int re_string_context_at (const re_string_t *input,
438 Idx idx, int eflags)
439 internal_function __attribute ((pure));
440
/* Accessor macros for re_string_t.  PSTR is a pointer to the string
   object.  Several of these expand PSTR (and IDX) more than once, so
   the arguments must not have side effects.  */

/* The byte OFFSET positions past the current index; the index itself
   is left unchanged.  OFFSET is parenthesized so that any expression
   may be passed.  */
#define re_string_peek_byte(pstr, offset) \
  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
/* The byte at the current index; the index is then incremented.  */
#define re_string_fetch_byte(pstr) \
  ((pstr)->mbs[(pstr)->cur_idx++])
/* Nonzero if IDX equals VALID_LEN or the wide character stored at IDX
   is not WEOF.  */
#define re_string_first_byte(pstr, idx) \
  ((idx) == (pstr)->valid_len || (pstr)->wcs[idx] != WEOF)
/* Nonzero if the wide character at IDX is not WEOF and position
   IDX + 1 either equals VALID_LEN or also holds something other than
   WEOF.  */
#define re_string_is_single_byte_char(pstr, idx) \
  ((pstr)->wcs[idx] != WEOF && ((pstr)->valid_len == (idx) + 1 \
				|| (pstr)->wcs[(idx) + 1] != WEOF))
/* Nonzero once the current index has reached STOP.  */
#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
/* Plain field accessors.  */
#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
#define re_string_get_buffer(pstr) ((pstr)->mbs)
#define re_string_length(pstr) ((pstr)->len)
#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
/* Advance the current index by IDX, or set it to IDX.  */
#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
457
458#include <alloca.h>
459
460#ifndef _LIBC
461# if HAVE_ALLOCA
462/* The OS usually guarantees only one guard page at the bottom of the stack,
463 and a page size can be as small as 4096 bytes. So we cannot safely
464 allocate anything larger than 4096 bytes. Also care for the possibility
465 of a few compiler-allocated temporary stack slots. */
466# define __libc_use_alloca(n) ((n) < 4032)
467# else
468/* alloca is implemented with malloc, so just use malloc. */
469# define __libc_use_alloca(n) 0
470# endif
471#endif
472
473#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
474#define re_xmalloc(t,n) ((t *) re_xnmalloc (n, sizeof (t)))
475#define re_calloc(t,n) ((t *) calloc (n, sizeof (t)))
476#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
477#define re_xrealloc(p,t,n) ((t *) re_xnrealloc (p, n, sizeof (t)))
478#define re_x2realloc(p,t,pn) ((t *) re_x2nrealloc (p, pn, sizeof (t)))
479#define re_free(p) free (p)
480
481#ifndef SIZE_MAX
482# define SIZE_MAX ((size_t) -1)
483#endif
484
/* Tell whether N objects of S bytes each would need more than SIZE_MAX
   bytes in total, i.e. whether N * S would overflow.  S must be
   nonzero.  */
static inline bool
re_alloc_oversized (size_t n, size_t s)
{
  size_t max_count = SIZE_MAX / s;
  return BE (n > max_count, 0);
}
492
/* Tell whether (2 * N + 1) objects of S bytes each would need more
   than SIZE_MAX bytes in total.  S must be nonzero.  */
static inline bool
re_x2alloc_oversized (size_t n, size_t s)
{
  size_t max_objects = SIZE_MAX / s;
  size_t max_n = (max_objects - 1) / 2;
  return BE (n > max_n, 0);
}
500
/* Allocate room for N objects of S bytes each.  Return NULL, without
   calling malloc, if N * S would overflow; otherwise return whatever
   malloc returns.  S must be nonzero.  */
static inline void *
re_xnmalloc (size_t n, size_t s)
{
  if (re_alloc_oversized (n, s))
    return NULL;
  return malloc (n * s);
}
508
/* Resize the block P to hold N objects of S bytes each.  Return NULL,
   without calling realloc, if N * S would overflow; otherwise return
   whatever realloc returns.  S must be nonzero.  */
static inline void *
re_xnrealloc (void *p, size_t n, size_t s)
{
  if (re_alloc_oversized (n, s))
    return NULL;
  return realloc (p, n * s);
}
516
/* Grow the block P from *PN objects to (2 * (*PN) + 1) objects of S
   bytes each.  On success store the new object count in *PN and
   return the new block.  Return NULL, leaving *PN untouched, if the
   new size would overflow or realloc fails.  S must be nonzero.  */
static inline void *
re_x2nrealloc (void *p, size_t *pn, size_t s)
{
  size_t grown;
  void *block;

  if (re_x2alloc_oversized (*pn, s))
    return NULL;

  /* The "+ 1" makes the count grow even when *PN is zero.  */
  grown = 2 * *pn + 1;
  block = realloc (p, grown * s);
  if (BE (block == NULL, 0))
    return NULL;

  *pn = grown;
  return block;
}
537
538struct bin_tree_t
539{
540 struct bin_tree_t *parent;
541 struct bin_tree_t *left;
542 struct bin_tree_t *right;
543 struct bin_tree_t *first;
544 struct bin_tree_t *next;
545
546 re_token_t token;
547
548 /* `node_idx' is the index in dfa->nodes, if `type' == 0.
549 Otherwise `type' indicates the type of this node. */
550 Idx node_idx;
551};
552typedef struct bin_tree_t bin_tree_t;
553
554#define BIN_TREE_STORAGE_SIZE \
555 ((1024 - sizeof (void *)) / sizeof (bin_tree_t))
556
557struct bin_tree_storage_t
558{
559 struct bin_tree_storage_t *next;
560 bin_tree_t data[BIN_TREE_STORAGE_SIZE];
561};
562typedef struct bin_tree_storage_t bin_tree_storage_t;
563
564#define CONTEXT_WORD 1
565#define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
566#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
567#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)
568
569#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
570#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
571#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
572#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
573#define IS_ORDINARY_CONTEXT(c) ((c) == 0)
574
575#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
576#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
577#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
578#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)
579
/* Nonzero if CONSTRAINT contains a PREV_* requirement that CONTEXT (a
   combination of CONTEXT_* flags) does not meet.  Every use of
   CONSTRAINT is parenthesized so that expressions such as `a | b' can
   be passed safely.  Both arguments may be evaluated more than
   once.  */
#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\
  || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))
585
/* Nonzero unless CONTEXT (a combination of CONTEXT_* flags) meets every
   NEXT_* requirement present in CONSTRAINT.  Written as the negation of
   "all requirements hold".  Both arguments may be evaluated more than
   once.  */
#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
 (!((!((constraint) & NEXT_WORD_CONSTRAINT) || IS_WORD_CONTEXT (context)) \
    && (!((constraint) & NEXT_NOTWORD_CONSTRAINT) || !IS_WORD_CONTEXT (context)) \
    && (!((constraint) & NEXT_NEWLINE_CONSTRAINT) || IS_NEWLINE_CONTEXT (context)) \
    && (!((constraint) & NEXT_ENDBUF_CONSTRAINT) || IS_ENDBUF_CONTEXT (context))))
591
/* A DFA state.  Instances are released with free_state, which frees
   the three embedded node sets, both transition tables and the state
   object itself.  */
struct re_dfastate_t
{
  re_hashval_t hash;
  re_node_set nodes;
  re_node_set non_eps_nodes;
  re_node_set inveclosure;
  /* Either points at NODES above or at a separately allocated set;
     free_state releases it only in the latter case.  */
  re_node_set *entrance_nodes;
  /* Both tables are released with re_free by free_state.  */
  struct re_dfastate_t **trtable, **word_trtable;
  unsigned int context : 4;
  /* Set when the state contains an END_OF_RE node.  */
  unsigned int halt : 1;
  /* If this state can accept `multi byte'.
     Note that we refer to multibyte characters, and multi character
     collating elements as `multi byte'. */
  unsigned int accept_mb : 1;
  /* If this state has backreference node(s). */
  unsigned int has_backref : 1;
  /* Set when a separate ENTRANCE_NODES copy was made because some node
     of the state carries a context constraint.  */
  unsigned int has_constraint : 1;
};
typedef struct re_dfastate_t re_dfastate_t;
611
612struct re_state_table_entry
613{
614 Idx num;
615 Idx alloc;
616 re_dfastate_t **array;
617};
618
619/* Array type used in re_sub_match_last_t and re_sub_match_top_t. */
620
621typedef struct
622{
623 Idx next_idx;
624 Idx alloc;
625 re_dfastate_t **array;
626} state_array_t;
627
628/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP. */
629
630typedef struct
631{
632 Idx node;
633 Idx str_idx; /* The position NODE match at. */
634 state_array_t path;
635} re_sub_match_last_t;
636
637/* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
638 And information about the node, whose type is OP_CLOSE_SUBEXP,
639 corresponding to NODE is stored in LASTS. */
640
641typedef struct
642{
643 Idx str_idx;
644 Idx node;
645 state_array_t *path;
646 Idx alasts; /* Allocation size of LASTS. */
647 Idx nlasts; /* The number of LASTS. */
648 re_sub_match_last_t **lasts;
649} re_sub_match_top_t;
650
651struct re_backref_cache_entry
652{
653 Idx node;
654 Idx str_idx;
655 Idx subexp_from;
656 Idx subexp_to;
657 char more;
658 char unused;
659 unsigned short int eps_reachable_subexps_map;
660};
661
662typedef struct
663{
664 /* The string object corresponding to the input string. */
665 re_string_t input;
666#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
667 re_dfa_t *const dfa;
668#else
669 re_dfa_t *dfa;
670#endif
671 /* EFLAGS of the argument of regexec. */
672 int eflags;
673 /* Where the matching ends. */
674 Idx match_last;
675 Idx last_node;
676 /* The state log used by the matcher. */
677 re_dfastate_t **state_log;
678 Idx state_log_top;
679 /* Back reference cache. */
680 Idx nbkref_ents;
681 Idx abkref_ents;
682 struct re_backref_cache_entry *bkref_ents;
683 int max_mb_elem_len;
684 Idx nsub_tops;
685 Idx asub_tops;
686 re_sub_match_top_t **sub_tops;
687} re_match_context_t;
688
689typedef struct
690{
691 re_dfastate_t **sifted_states;
692 re_dfastate_t **limited_states;
693 Idx last_node;
694 Idx last_str_idx;
695 re_node_set limits;
696} re_sift_context_t;
697
698struct re_fail_stack_ent_t
699{
700 Idx idx;
701 Idx node;
702 regmatch_t *regs;
703 re_node_set eps_via_nodes;
704};
705
706struct re_fail_stack_t
707{
708 Idx num;
709 Idx alloc;
710 struct re_fail_stack_ent_t *stack;
711};
712
713struct re_dfa_t
714{
715 re_token_t *nodes;
716 Idx nodes_alloc;
717 Idx nodes_len;
718 Idx *nexts;
719 Idx *org_indices;
720 re_node_set *edests;
721 re_node_set *eclosures;
722 re_node_set *inveclosures;
723 struct re_state_table_entry *state_table;
724 re_dfastate_t *init_state;
725 re_dfastate_t *init_state_word;
726 re_dfastate_t *init_state_nl;
727 re_dfastate_t *init_state_begbuf;
728 bin_tree_t *str_tree;
729 bin_tree_storage_t *str_tree_storage;
730 re_bitset_ptr_t sb_char;
731 int str_tree_storage_idx;
732
733 /* number of subexpressions `re_nsub' is in regex_t. */
734 re_hashval_t state_hash_mask;
735 Idx init_node;
736 Idx nbackref; /* The number of backreference in this dfa. */
737
738 /* Bitmap expressing which backreference is used. */
739 bitset_word used_bkref_map;
740 bitset_word completed_bkref_map;
741
742 unsigned int has_plural_match : 1;
743 /* If this dfa has "multibyte node", which is a backreference or
744 a node which can accept multibyte character or multi character
745 collating element. */
746 unsigned int has_mb_node : 1;
747 unsigned int is_utf8 : 1;
748 unsigned int map_notascii : 1;
749 unsigned int word_ops_used : 1;
750 int mb_cur_max;
751 bitset word_char;
752 reg_syntax_t syntax;
753 Idx *subexp_map;
754#ifdef DEBUG
755 char* re_str;
756#endif
757 __libc_lock_define (, lock)
758};
759
760#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
761#define re_node_set_remove(set,id) \
762 (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
763#define re_node_set_empty(p) ((p)->nelem = 0)
764#define re_node_set_free(set) re_free ((set)->elems)
765
766static void free_state (re_dfastate_t *state) internal_function;
767
768
769typedef enum
770{
771 SB_CHAR,
772 MB_CHAR,
773 EQUIV_CLASS,
774 COLL_SYM,
775 CHAR_CLASS
776} bracket_elem_type;
777
778typedef struct
779{
780 bracket_elem_type type;
781 union
782 {
783 unsigned char ch;
784 unsigned char *name;
785 wchar_t wch;
786 } opr;
787} bracket_elem_t;
788
789
790/* Inline functions for bitset operation. */
791
792static inline void
793bitset_set (bitset set, Idx i)
794{
795 set[i / BITSET_WORD_BITS] |= (bitset_word) 1 << i % BITSET_WORD_BITS;
796}
797
798static inline void
799bitset_clear (bitset set, Idx i)
800{
801 set[i / BITSET_WORD_BITS] &= ~ ((bitset_word) 1 << i % BITSET_WORD_BITS);
802}
803
804static inline bool
805bitset_contain (const bitset set, Idx i)
806{
807 return (set[i / BITSET_WORD_BITS] >> i % BITSET_WORD_BITS) & 1;
808}
809
810static inline void
811bitset_empty (bitset set)
812{
813 memset (set, 0, sizeof (bitset));
814}
815
816static inline void
817bitset_set_all (bitset set)
818{
819 memset (set, -1, sizeof (bitset_word) * (SBC_MAX / BITSET_WORD_BITS));
820 if (SBC_MAX % BITSET_WORD_BITS != 0)
821 set[BITSET_WORDS - 1] =
822 ((bitset_word) 1 << SBC_MAX % BITSET_WORD_BITS) - 1;
823}
824
825static inline void
826bitset_copy (bitset dest, const bitset src)
827{
828 memcpy (dest, src, sizeof (bitset));
829}
830
831static inline void
832bitset_not (bitset set)
833{
834 int i;
835 for (i = 0; i < SBC_MAX / BITSET_WORD_BITS; ++i)
836 set[i] = ~set[i];
837 if (SBC_MAX % BITSET_WORD_BITS != 0)
838 set[BITSET_WORDS - 1] =
839 ((((bitset_word) 1 << SBC_MAX % BITSET_WORD_BITS) - 1)
840 & ~set[BITSET_WORDS - 1]);
841}
842
843static inline void
844bitset_merge (bitset dest, const bitset src)
845{
846 int i;
847 for (i = 0; i < BITSET_WORDS; ++i)
848 dest[i] |= src[i];
849}
850
851static inline void
852bitset_mask (bitset dest, const bitset src)
853{
854 int i;
855 for (i = 0; i < BITSET_WORDS; ++i)
856 dest[i] &= src[i];
857}
858
859#if defined RE_ENABLE_I18N
860/* Inline functions for re_string. */
861static inline int
862internal_function __attribute ((pure))
863re_string_char_size_at (const re_string_t *pstr, Idx idx)
864{
865 int byte_idx;
866 if (pstr->mb_cur_max == 1)
867 return 1;
868 for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)
869 if (pstr->wcs[idx + byte_idx] != WEOF)
870 break;
871 return byte_idx;
872}
873
874static inline wint_t
875internal_function __attribute ((pure))
876re_string_wchar_at (const re_string_t *pstr, Idx idx)
877{
878 if (pstr->mb_cur_max == 1)
879 return (wint_t) pstr->mbs[idx];
880 return (wint_t) pstr->wcs[idx];
881}
882
/* Return a length, in bytes, for the element of PSTR->mbs starting at
   IDX.  Outside _LIBC, or when the LC_COLLATE rule count read below is
   zero, the result is always 1.  Otherwise it is the distance by which
   findidx advances a pointer that starts at PSTR->mbs + IDX.  */
static int
internal_function __attribute ((pure))
re_string_elem_size_at (const re_string_t *pstr, Idx idx)
{
#ifdef _LIBC
  const unsigned char *p, *extra;
  const int32_t *table, *indirect;
  int32_t tmp;
  /* NOTE(review): findidx is presumably defined by this header and
     presumably reads the locals declared above (table, indirect,
     extra), which would explain the mid-function include -- confirm
     before reordering anything here.  */
# include <locale/weight.h>
  uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);

  if (nrules != 0)
    {
      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
      extra = (const unsigned char *)
	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
						_NL_COLLATE_INDIRECTMB);
      p = pstr->mbs + idx;
      /* The value stored in TMP is not used afterwards in this
	 function; only the updated P matters for the result.  */
      tmp = findidx (&p);
      return p - pstr->mbs - idx;
    }
  else
#endif /* _LIBC */
    return 1;
}
909#endif /* RE_ENABLE_I18N */
910
911#endif /* _REGEX_INTERNAL_H */
diff --git a/lib/regexec.c b/lib/regexec.c
new file mode 100644
index 0000000..a85077c
--- /dev/null
+++ b/lib/regexec.c
@@ -0,0 +1,4333 @@
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n) internal_function;
22static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23static void match_ctx_free (re_match_context_t *cache) internal_function;
24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to)
26 internal_function;
27static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
28 internal_function;
29static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 Idx str_idx) internal_function;
31static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 Idx node, Idx str_idx)
33 internal_function;
34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, Idx last_node,
36 Idx last_str_idx)
37 internal_function;
38static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, Idx length,
40 Idx start, Idx last_start, Idx stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, Idx length1,
45 const char *string2, Idx length2,
46 Idx start, regoff_t range,
47 struct re_registers *regs,
48 Idx stop, bool ret_len) internal_function;
49static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, Idx length, Idx start,
51 regoff_t range, Idx stop,
52 struct re_registers *regs,
53 bool ret_len) internal_function;
54static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 Idx nregs, int regs_allocated) internal_function;
56static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 internal_function;
58static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
59 Idx *p_match_first)
60 internal_function;
61static Idx check_halt_state_context (const re_match_context_t *mctx,
62 const re_dfastate_t *state, Idx idx)
63 internal_function;
64static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
65 regmatch_t *prev_idx_match, Idx cur_node,
66 Idx cur_idx, Idx nmatch) internal_function;
67static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
68 Idx str_idx, Idx dest_node, Idx nregs,
69 regmatch_t *regs,
70 re_node_set *eps_via_nodes) internal_function;
71static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 bool fl_backtrack) internal_function;
75static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
76
77#ifdef RE_ENABLE_I18N
78static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 Idx node_idx, Idx str_idx, Idx max_str_idx) internal_function;
81#endif /* RE_ENABLE_I18N */
82static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
83 re_sift_context_t *sctx) internal_function;
84static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
85 re_sift_context_t *sctx, Idx str_idx,
86 re_node_set *cur_dest) internal_function;
87static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
88 re_sift_context_t *sctx,
89 Idx str_idx,
90 re_node_set *dest_nodes) internal_function;
91static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
92 re_node_set *dest_nodes,
93 const re_node_set *candidates) internal_function;
94static bool check_dst_limits (const re_match_context_t *mctx,
95 const re_node_set *limits,
96 Idx dst_node, Idx dst_idx, Idx src_node,
97 Idx src_idx) internal_function;
98static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
99 int boundaries, Idx subexp_idx,
100 Idx from_node, Idx bkref_idx) internal_function;
101static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
102 Idx limit, Idx subexp_idx,
103 Idx node, Idx str_idx,
104 Idx bkref_idx) internal_function;
105static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
106 re_node_set *dest_nodes,
107 const re_node_set *candidates,
108 re_node_set *limits,
109 struct re_backref_cache_entry *bkref_ents,
110 Idx str_idx) internal_function;
111static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
112 re_sift_context_t *sctx,
113 Idx str_idx, const re_node_set *candidates) internal_function;
114static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
115 re_dfastate_t **src, Idx num) internal_function;
116static re_dfastate_t *find_recover_state (reg_errcode_t *err,
117 re_match_context_t *mctx) internal_function;
118static re_dfastate_t *transit_state (reg_errcode_t *err,
119 re_match_context_t *mctx,
120 re_dfastate_t *state) internal_function;
121static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
122 re_match_context_t *mctx,
123 re_dfastate_t *next_state) internal_function;
124static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
125 re_node_set *cur_nodes,
126 Idx str_idx) internal_function;
127#if 0
128static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *pstate) internal_function;
131#endif
132#ifdef RE_ENABLE_I18N
133static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
134 re_dfastate_t *pstate) internal_function;
135#endif /* RE_ENABLE_I18N */
136static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
137 const re_node_set *nodes) internal_function;
138static reg_errcode_t get_subexp (re_match_context_t *mctx,
139 Idx bkref_node, Idx bkref_str_idx) internal_function;
140static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
141 const re_sub_match_top_t *sub_top,
142 re_sub_match_last_t *sub_last,
143 Idx bkref_node, Idx bkref_str) internal_function;
144static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
145 Idx subexp_idx, int type) internal_function;
146static reg_errcode_t check_arrival (re_match_context_t *mctx,
147 state_array_t *path, Idx top_node,
148 Idx top_str, Idx last_node, Idx last_str,
149 int type) internal_function;
150static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
151 Idx str_idx,
152 re_node_set *cur_nodes,
153 re_node_set *next_nodes) internal_function;
154static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
155 re_node_set *cur_nodes,
156 Idx ex_subexp, int type) internal_function;
157static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
158 re_node_set *dst_nodes,
159 Idx target, Idx ex_subexp,
160 int type) internal_function;
161static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
162 re_node_set *cur_nodes, Idx cur_str,
163 Idx subexp_num, int type) internal_function;
164static bool build_trtable (re_dfa_t *dfa,
165 re_dfastate_t *state) internal_function;
166#ifdef RE_ENABLE_I18N
167static int check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
168 const re_string_t *input, Idx idx) internal_function;
169# ifdef _LIBC
170static unsigned int find_collation_sequence_value (const unsigned char *mbs,
171 size_t name_len) internal_function;
172# endif /* _LIBC */
173#endif /* RE_ENABLE_I18N */
174static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
175 const re_dfastate_t *state,
176 re_node_set *states_node,
177 bitset *states_ch) internal_function;
178static bool check_node_accept (const re_match_context_t *mctx,
179 const re_token_t *node, Idx idx)
180 internal_function;
181static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
182
183/* Entry point for POSIX code. */
184
185/* regexec searches for a given pattern, specified by PREG, in the
186 string STRING.
187
188 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
189 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
190 least NMATCH elements, and we set them to the offsets of the
191 corresponding matched substrings.
192
193 EFLAGS specifies `execution flags' which affect matching: if
194 REG_NOTBOL is set, then ^ does not match at the beginning of the
195 string; if REG_NOTEOL is set, then $ does not match at the end.
196
197 We return 0 if we find a match and REG_NOMATCH if not. */
198
199int
200regexec (const regex_t *__restrict preg, const char *__restrict string,
201 size_t nmatch, regmatch_t pmatch[], int eflags)
202{
203 reg_errcode_t err;
204 Idx start, length;
205#ifdef _LIBC
206 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
207#endif
208
209 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
210 return REG_BADPAT;
211
212 if (eflags & REG_STARTEND)
213 {
214 start = pmatch[0].rm_so;
215 length = pmatch[0].rm_eo;
216 }
217 else
218 {
219 start = 0;
220 length = strlen (string);
221 }
222
223 __libc_lock_lock (dfa->lock);
224 if (preg->re_no_sub)
225 err = re_search_internal (preg, string, length, start, length,
226 length, 0, NULL, eflags);
227 else
228 err = re_search_internal (preg, string, length, start, length,
229 length, nmatch, pmatch, eflags);
230 __libc_lock_unlock (dfa->lock);
231 return err != REG_NOERROR;
232}
233
234#ifdef _LIBC
235# include <shlib-compat.h>
236versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
237
238# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
239__typeof__ (__regexec) __compat_regexec;
240
241int
242attribute_compat_text_section
243__compat_regexec (const regex_t *__restrict preg,
244 const char *__restrict string, size_t nmatch,
245 regmatch_t pmatch[], int eflags)
246{
247 return regexec (preg, string, nmatch, pmatch,
248 eflags & (REG_NOTBOL | REG_NOTEOL));
249}
250compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
251# endif
252#endif
253
254/* Entry points for GNU code. */
255
256/* re_match, re_search, re_match_2, re_search_2
257
258 The former two functions operate on STRING with length LENGTH,
259 while the latter two operate on the concatenation of STRING1 and STRING2
260 with lengths LENGTH1 and LENGTH2, respectively.
261
262 re_match() matches the compiled pattern in BUFP against the string,
263 starting at index START.
264
265 re_search() first tries matching at index START, then it tries to match
266 starting from index START + 1, and so on. The last start position tried
267 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
268 way as re_match().)
269
270 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
271 the first STOP characters of the concatenation of the strings should be
272 considered.
273
274 If REGS is not NULL, and BUFP->re_no_sub is not set, the offsets of the match
275 and all groups are stored in REGS. (For the "_2" variants, the offsets are
276 computed relative to the concatenation, not relative to the individual
277 strings.)
278
279 On success, re_match* functions return the length of the match, re_search*
280 return the position of the start of the match. Return value -1 means no
281 match was found and -2 indicates an internal error. */
282
283regoff_t
284re_match (struct re_pattern_buffer *bufp, const char *string,
285 Idx length, Idx start, struct re_registers *regs)
286{
287 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
288}
289#ifdef _LIBC
290weak_alias (__re_match, re_match)
291#endif
292
293regoff_t
294re_search (struct re_pattern_buffer *bufp, const char *string,
295 Idx length, Idx start, regoff_t range, struct re_registers *regs)
296{
297 return re_search_stub (bufp, string, length, start, range, length, regs,
298 false);
299}
300#ifdef _LIBC
301weak_alias (__re_search, re_search)
302#endif
303
304regoff_t
305re_match_2 (struct re_pattern_buffer *bufp,
306 const char *string1, Idx length1,
307 const char *string2, Idx length2,
308 Idx start, struct re_registers *regs, Idx stop)
309{
310 return re_search_2_stub (bufp, string1, length1, string2, length2,
311 start, 0, regs, stop, true);
312}
313#ifdef _LIBC
314weak_alias (__re_match_2, re_match_2)
315#endif
316
317regoff_t
318re_search_2 (struct re_pattern_buffer *bufp,
319 const char *string1, Idx length1,
320 const char *string2, Idx length2,
321 Idx start, regoff_t range, struct re_registers *regs, Idx stop)
322{
323 return re_search_2_stub (bufp, string1, length1, string2, length2,
324 start, range, regs, stop, false);
325}
326#ifdef _LIBC
327weak_alias (__re_search_2, re_search_2)
328#endif
329
330static regoff_t
331internal_function
332re_search_2_stub (struct re_pattern_buffer *bufp,
333 const char *string1, Idx length1,
334 const char *string2, Idx length2,
335 Idx start, regoff_t range, struct re_registers *regs,
336 Idx stop, bool ret_len)
337{
338 const char *str;
339 regoff_t rval;
340 Idx len = length1 + length2;
341 char *s = NULL;
342
343 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
344 return -2;
345
346 /* Concatenate the strings. */
347 if (length2 > 0)
348 if (length1 > 0)
349 {
350 s = re_malloc (char, len);
351
352 if (BE (s == NULL, 0))
353 return -2;
354 memcpy (s, string1, length1);
355 memcpy (s + length1, string2, length2);
356 str = s;
357 }
358 else
359 str = string2;
360 else
361 str = string1;
362
363 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
364 ret_len);
365 re_free (s);
366 return rval;
367}
368
369/* The parameters have the same meaning as those of re_search.
370 Additional parameters:
371 If RET_LEN is true the length of the match is returned (re_match style);
372 otherwise the position of the match is returned. */
373
374static regoff_t
375internal_function
376re_search_stub (struct re_pattern_buffer *bufp,
377 const char *string, Idx length,
378 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
379 bool ret_len)
380{
381 reg_errcode_t result;
382 regmatch_t *pmatch;
383 Idx nregs;
384 regoff_t rval;
385 int eflags = 0;
386#ifdef _LIBC
387 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
388#endif
389 Idx last_start = start + range;
390
391 /* Check for out-of-range. */
392 if (BE (start < 0 || start > length, 0))
393 return -1;
394 if (sizeof start < sizeof range)
395 {
396 regoff_t length_offset = length;
397 regoff_t start_offset = start;
398 if (BE (length_offset - start_offset < range, 0))
399 last_start = length;
400 else if (BE (range < - start_offset, 0))
401 last_start = 0;
402 }
403 else
404 {
405 if (BE ((last_start < start) != (range < 0), 0))
406 {
407 /* Overflow occurred when computing last_start; substitute
408 the extreme value. */
409 last_start = range < 0 ? 0 : length;
410 }
411 else
412 {
413 if (BE (length < last_start, 0))
414 last_start = length;
415 else if (BE (last_start < 0, 0))
416 last_start = 0;
417 }
418 }
419
420 __libc_lock_lock (dfa->lock);
421
422 eflags |= (bufp->re_not_bol) ? REG_NOTBOL : 0;
423 eflags |= (bufp->re_not_eol) ? REG_NOTEOL : 0;
424
425 /* Compile fastmap if we haven't yet. */
426 if (start < last_start && bufp->re_fastmap != NULL
427 && !bufp->re_fastmap_accurate)
428 re_compile_fastmap (bufp);
429
430 if (BE (bufp->re_no_sub, 0))
431 regs = NULL;
432
433 /* We need at least 1 register. */
434 if (regs == NULL)
435 nregs = 1;
436 else if (BE (bufp->re_regs_allocated == REG_FIXED
437 && regs->rm_num_regs <= bufp->re_nsub, 0))
438 {
439 nregs = regs->rm_num_regs;
440 if (BE (nregs < 1, 0))
441 {
442 /* Nothing can be copied to regs. */
443 regs = NULL;
444 nregs = 1;
445 }
446 }
447 else
448 nregs = bufp->re_nsub + 1;
449 pmatch = re_xmalloc (regmatch_t, nregs);
450 if (BE (pmatch == NULL, 0))
451 {
452 rval = -2;
453 goto out;
454 }
455
456 result = re_search_internal (bufp, string, length, start, last_start, stop,
457 nregs, pmatch, eflags);
458
459 rval = 0;
460
461 /* I hope we needn't fill ther regs with -1's when no match was found. */
462 if (result != REG_NOERROR)
463 rval = -1;
464 else if (regs != NULL)
465 {
466 /* If caller wants register contents data back, copy them. */
467 bufp->re_regs_allocated = re_copy_regs (regs, pmatch, nregs,
468 bufp->re_regs_allocated);
469 if (BE (bufp->re_regs_allocated == REG_UNALLOCATED, 0))
470 rval = -2;
471 }
472
473 if (BE (rval == 0, 1))
474 {
475 if (ret_len)
476 {
477 assert (pmatch[0].rm_so == start);
478 rval = pmatch[0].rm_eo - start;
479 }
480 else
481 rval = pmatch[0].rm_so;
482 }
483 re_free (pmatch);
484 out:
485 __libc_lock_unlock (dfa->lock);
486 return rval;
487}
488
489static unsigned
490internal_function
491re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
492 int regs_allocated)
493{
494 int rval = REG_REALLOCATE;
495 Idx i;
496 Idx need_regs = nregs + 1;
497 /* We need one extra element beyond `rm_num_regs' for the `-1' marker GNU code
498 uses. */
499
500 /* Have the register data arrays been allocated? */
501 if (regs_allocated == REG_UNALLOCATED)
502 { /* No. So allocate them with malloc. */
503 regs->rm_start = re_xmalloc (regoff_t, need_regs);
504 regs->rm_end = re_malloc (regoff_t, need_regs);
505 if (BE (regs->rm_start == NULL, 0) || BE (regs->rm_end == NULL, 0))
506 return REG_UNALLOCATED;
507 regs->rm_num_regs = need_regs;
508 }
509 else if (regs_allocated == REG_REALLOCATE)
510 { /* Yes. If we need more elements than were already
511 allocated, reallocate them. If we need fewer, just
512 leave it alone. */
513 if (BE (need_regs > regs->rm_num_regs, 0))
514 {
515 regoff_t *new_start =
516 re_xrealloc (regs->rm_start, regoff_t, need_regs);
517 regoff_t *new_end = re_realloc (regs->rm_end, regoff_t, need_regs);
518 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
519 return REG_UNALLOCATED;
520 regs->rm_start = new_start;
521 regs->rm_end = new_end;
522 regs->rm_num_regs = need_regs;
523 }
524 }
525 else
526 {
527 assert (regs_allocated == REG_FIXED);
528 /* This function may not be called with REG_FIXED and nregs too big. */
529 assert (regs->rm_num_regs >= nregs);
530 rval = REG_FIXED;
531 }
532
533 /* Copy the regs. */
534 for (i = 0; i < nregs; ++i)
535 {
536 regs->rm_start[i] = pmatch[i].rm_so;
537 regs->rm_end[i] = pmatch[i].rm_eo;
538 }
539 for ( ; i < regs->rm_num_regs; ++i)
540 regs->rm_start[i] = regs->rm_end[i] = -1;
541
542 return rval;
543}
544
545/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
546 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
547 this memory for recording register information. STARTS and ENDS
548 must be allocated using the malloc library routine, and must each
549 be at least NUM_REGS * sizeof (regoff_t) bytes long.
550
551 If NUM_REGS == 0, then subsequent matches should allocate their own
552 register data.
553
554 Unless this function is called, the first search or match using
555 PATTERN_BUFFER will allocate its own register data, without
556 freeing the old data. */
557
558void
559re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
560 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
561{
562 if (num_regs)
563 {
564 bufp->re_regs_allocated = REG_REALLOCATE;
565 regs->rm_num_regs = num_regs;
566 regs->rm_start = starts;
567 regs->rm_end = ends;
568 }
569 else
570 {
571 bufp->re_regs_allocated = REG_UNALLOCATED;
572 regs->rm_num_regs = 0;
573 regs->rm_start = regs->rm_end = NULL;
574 }
575}
576#ifdef _LIBC
577weak_alias (__re_set_registers, re_set_registers)
578#endif
579
580/* Entry points compatible with 4.2 BSD regex library. We don't define
581 them unless specifically requested. */
582
583#if defined _REGEX_RE_COMP || defined _LIBC
584int
585# ifdef _LIBC
586weak_function
587# endif
588re_exec (const char *s)
589{
590 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
591}
592#endif /* _REGEX_RE_COMP */
593
594/* Internal entry point. */
595
596/* Searches for a compiled pattern PREG in the string STRING, whose
597 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
598 meaning as with regexec. LAST_START is START + RANGE, where
599 START and RANGE have the same meaning as with re_search.
600 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
601 otherwise return the error code.
602 Note: We assume front end functions already check ranges.
603 (0 <= LAST_START && LAST_START <= LENGTH) */
604
/* Overview of the steps below:
   1. Return REG_NOMATCH at once if the pattern was never compiled.
   2. Set up the input buffer and the match context for STRING.
   3. Walk MATCH_FIRST from START towards LAST_START (in either direction),
      using the fastmap, when one is usable, to skip positions.
   4. At each candidate, slide the buffer so the candidate is index 0 and
      run check_matching; for patterns that need it, validate the result
      with prune_impossible_nodes.
   5. On success fill PMATCH with offsets relative to STRING.
   Every path after step 1 leaves through the free_return label.  */
605static reg_errcode_t
606internal_function
607re_search_internal (const regex_t *preg,
608 const char *string, Idx length,
609 Idx start, Idx last_start, Idx stop,
610 size_t nmatch, regmatch_t pmatch[],
611 int eflags)
612{
613 reg_errcode_t err;
614 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
615 Idx left_lim, right_lim;
616 int incr;
617 bool fl_longest_match;
618 int match_kind;
619 Idx match_first, match_last = REG_MISSING;
620 Idx extra_nmatch;
621 bool sb;
622 int ch;
623#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
624 re_match_context_t mctx = { .dfa = dfa };
625#else
626 re_match_context_t mctx;
627#endif
 /* The fastmap is used only if it exists and is accurate, the search
    covers more than one start position, and the pattern cannot match
    the empty string. */
628 char *fastmap = ((preg->re_fastmap != NULL && preg->re_fastmap_accurate
629 && start != last_start && !preg->re_can_be_null)
630 ? preg->re_fastmap : NULL);
631 unsigned REG_TRANSLATE_TYPE t =
632 (unsigned REG_TRANSLATE_TYPE) preg->re_translate;
633
634#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
635 memset (&mctx, '\0', sizeof (re_match_context_t));
636 mctx.dfa = dfa;
637#endif
638
 /* PMATCH slots beyond the pattern's re_nsub + 1 registers are split off
    here; they are set to -1 once a match has been found. */
639 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
640 nmatch -= extra_nmatch;
641
642 /* Check if the DFA hasn't been compiled. */
643 if (BE (preg->re_used == 0 || dfa->init_state == NULL
644 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
645 || dfa->init_state_begbuf == NULL, 0))
646 return REG_NOMATCH;
647
648#ifdef DEBUG
649 /* We assume front-end functions already check them. */
650 assert (0 <= last_start && last_start <= length);
651#endif
652
653 /* If initial states with non-begbuf contexts have no elements,
654 the regex must be anchored. If preg->re_newline_anchor is set,
655 we'll never use init_state_nl, so do not check it. */
656 if (dfa->init_state->nodes.nelem == 0
657 && dfa->init_state_word->nodes.nelem == 0
658 && (dfa->init_state_nl->nodes.nelem == 0
659 || !preg->re_newline_anchor))
660 {
661 if (start != 0 && last_start != 0)
662 return REG_NOMATCH;
663 start = last_start = 0;
664 }
665
666 /* We must check the longest matching, if nmatch > 0. */
667 fl_longest_match = (nmatch != 0 || dfa->nbackref);
668
669 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
670 preg->re_translate,
671 preg->re_syntax & REG_IGNORE_CASE, dfa);
672 if (BE (err != REG_NOERROR, 0))
673 goto free_return;
674 mctx.input.stop = stop;
675 mctx.input.raw_stop = stop;
676 mctx.input.newline_anchor = preg->re_newline_anchor;
677
678 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
679 if (BE (err != REG_NOERROR, 0))
680 goto free_return;
681
682 /* We will log all the DFA states through which the dfa pass,
683 if nmatch > 1, or this dfa has "multibyte node", which is a
684 back-reference or a node which can accept multibyte character or
685 multi character collating element. */
686 if (nmatch > 1 || dfa->has_mb_node)
687 {
688 mctx.state_log = re_xmalloc (re_dfastate_t *, mctx.input.bufs_len + 1);
689 if (BE (mctx.state_log == NULL, 0))
690 {
691 err = REG_ESPACE;
692 goto free_return;
693 }
694 }
695 else
696 mctx.state_log = NULL;
697
698 match_first = start;
699 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
700 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
701
702 /* Check incrementally whether or not the input string matches. */
703 incr = (last_start < start) ? -1 : 1;
704 left_lim = (last_start < start) ? last_start : start;
705 right_lim = (last_start < start) ? start : last_start;
706 sb = dfa->mb_cur_max == 1;
 /* Encode the scanning strategy for the switch below. 8 means there is
    no fastmap. Otherwise: 4 is added when the DFA is single-byte, or
    when neither the REG_IGNORE_CASE syntax bit nor a translate table is
    in effect; 2 is added for a forward scan (start <= last_start);
    1 is added when a translate table is present. */
707 match_kind =
708 (fastmap
709 ? ((sb || !(preg->re_syntax & REG_IGNORE_CASE || t) ? 4 : 0)
710 | (start <= last_start ? 2 : 0)
711 | (t != NULL ? 1 : 0))
712 : 8);
713
714 for (;; match_first += incr)
715 {
 /* ERR is the value returned if the candidate range is exhausted. */
716 err = REG_NOMATCH;
717 if (match_first < left_lim || right_lim < match_first)
718 goto free_return;
719
720 /* Advance as rapidly as possible through the string, until we
721 find a plausible place to start matching. This may be done
722 with varying efficiency, so there are various possibilities:
723 only the most common of them are specialized, in order to
724 save on code size. We use a switch statement for speed. */
725 switch (match_kind)
726 {
727 case 8:
728 /* No fastmap. */
729 break;
730
731 case 7:
732 /* Fastmap with single-byte translation, match forward. */
733 while (BE (match_first < right_lim, 1)
734 && !fastmap[t[(unsigned char) string[match_first]]])
735 ++match_first;
736 goto forward_match_found_start_or_reached_end;
737
738 case 6:
739 /* Fastmap without translation, match forward. */
740 while (BE (match_first < right_lim, 1)
741 && !fastmap[(unsigned char) string[match_first]])
742 ++match_first;
743
744 forward_match_found_start_or_reached_end:
 /* The loops above never test the byte at RIGHT_LIM itself; do it
    here, treating a position at or past LENGTH as byte 0. */
745 if (BE (match_first == right_lim, 0))
746 {
747 ch = match_first >= length
748 ? 0 : (unsigned char) string[match_first];
749 if (!fastmap[t ? t[ch] : ch])
750 goto free_return;
751 }
752 break;
753
754 case 4:
755 case 5:
756 /* Fastmap without multi-byte translation, match backwards. */
757 while (match_first >= left_lim)
758 {
759 ch = match_first >= length
760 ? 0 : (unsigned char) string[match_first];
761 if (fastmap[t ? t[ch] : ch])
762 break;
763 --match_first;
764 }
765 if (match_first < left_lim)
766 goto free_return;
767 break;
768
769 default:
770 /* In this case, we can't determine easily the current byte,
771 since it might be a component byte of a multibyte
772 character. Then we use the constructed buffer instead. */
773 for (;;)
774 {
775 /* If MATCH_FIRST is out of the valid range, reconstruct the
776 buffers. */
777 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
778 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
779 {
780 err = re_string_reconstruct (&mctx.input, match_first,
781 eflags);
782 if (BE (err != REG_NOERROR, 0))
783 goto free_return;
784
785 offset = match_first - mctx.input.raw_mbs_idx;
786 }
787 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
788 Note that MATCH_FIRST must not be smaller than 0. */
789 ch = (match_first >= length
790 ? 0 : re_string_byte_at (&mctx.input, offset));
791 if (fastmap[ch])
792 break;
793 match_first += incr;
794 if (match_first < left_lim || match_first > right_lim)
795 {
796 err = REG_NOMATCH;
797 goto free_return;
798 }
799 }
800 break;
801 }
802
803 /* Reconstruct the buffers so that the matcher can assume that
804 the matching starts from the beginning of the buffer. */
805 err = re_string_reconstruct (&mctx.input, match_first, eflags);
806 if (BE (err != REG_NOERROR, 0))
807 goto free_return;
808
809#ifdef RE_ENABLE_I18N
810 /* Don't consider this char as a possible match start if it is part,
811 yet isn't the head, of a multibyte character. */
812 if (!sb && !re_string_first_byte (&mctx.input, 0))
813 continue;
814#endif
815
816 /* It seems to be appropriate one, then use the matcher. */
817 /* We assume that the matching starts from 0. */
818 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
819 match_last = check_matching (&mctx, fl_longest_match,
820 start <= last_start ? &match_first : NULL);
821 if (match_last != REG_MISSING)
822 {
823 if (BE (match_last == REG_ERROR, 0))
824 {
825 err = REG_ESPACE;
826 goto free_return;
827 }
828 else
829 {
830 mctx.match_last = match_last;
831 if ((!preg->re_no_sub && nmatch > 1) || dfa->nbackref)
832 {
833 re_dfastate_t *pstate = mctx.state_log[match_last];
834 mctx.last_node = check_halt_state_context (&mctx, pstate,
835 match_last);
836 }
837 if ((!preg->re_no_sub && nmatch > 1 && dfa->has_plural_match)
838 || dfa->nbackref)
839 {
840 err = prune_impossible_nodes (&mctx);
841 if (err == REG_NOERROR)
842 break;
843 if (BE (err != REG_NOMATCH, 0))
844 goto free_return;
 /* Pruning rejected this candidate; try the next start. */
845 match_last = REG_MISSING;
846 }
847 else
848 break; /* We found a match. */
849 }
850 }
851
852 match_ctx_clean (&mctx);
853 }
854
855#ifdef DEBUG
856 assert (match_last != REG_MISSING);
857 assert (err == REG_NOERROR);
858#endif
859
860 /* Set pmatch[] if we need. */
861 if (nmatch > 0)
862 {
863 Idx reg_idx;
864
865 /* Initialize registers. */
866 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
867 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
868
869 /* Set the points where matching start/end. */
870 pmatch[0].rm_so = 0;
871 pmatch[0].rm_eo = mctx.match_last;
872 /* FIXME: This function should fail if mctx.match_last exceeds
873 the maximum possible regoff_t value. We need a new error
874 code REG_OVERFLOW. */
875
876 if (!preg->re_no_sub && nmatch > 1)
877 {
878 err = set_regs (preg, &mctx, nmatch, pmatch,
879 dfa->has_plural_match && dfa->nbackref > 0);
880 if (BE (err != REG_NOERROR, 0))
881 goto free_return;
882 }
883
884 /* At last, add the offset to each register, since we slid
885 the buffers so that we could assume that the matching starts
886 from 0. */
887 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
888 if (pmatch[reg_idx].rm_so != -1)
889 {
890#ifdef RE_ENABLE_I18N
891 if (BE (mctx.input.offsets_needed != 0, 0))
892 {
893 pmatch[reg_idx].rm_so =
894 (pmatch[reg_idx].rm_so == mctx.input.valid_len
895 ? mctx.input.valid_raw_len
896 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
897 pmatch[reg_idx].rm_eo =
898 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
899 ? mctx.input.valid_raw_len
900 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
901 }
902#else
903 assert (mctx.input.offsets_needed == 0);
904#endif
905 pmatch[reg_idx].rm_so += match_first;
906 pmatch[reg_idx].rm_eo += match_first;
907 }
 /* Mark the caller's surplus slots as unused. */
908 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
909 {
910 pmatch[nmatch + reg_idx].rm_so = -1;
911 pmatch[nmatch + reg_idx].rm_eo = -1;
912 }
913
 /* Where subexp_map sends a sub-expression to a different index, copy
    that other register's offsets into this one. */
914 if (dfa->subexp_map)
915 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
916 if (dfa->subexp_map[reg_idx] != reg_idx)
917 {
918 pmatch[reg_idx + 1].rm_so
919 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
920 pmatch[reg_idx + 1].rm_eo
921 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
922 }
923 }
924
 /* Common exit for success and failure; ERR carries the result. */
925 free_return:
926 re_free (mctx.state_log);
927 if (dfa->nbackref)
928 match_ctx_free (&mctx);
929 re_string_destruct (&mctx.input);
930 return err;
931}
932
933static reg_errcode_t
934internal_function
935prune_impossible_nodes (re_match_context_t *mctx)
936{
937 re_dfa_t *const dfa = mctx->dfa;
938 Idx halt_node, match_last;
939 reg_errcode_t ret;
940 re_dfastate_t **sifted_states;
941 re_dfastate_t **lim_states = NULL;
942 re_sift_context_t sctx;
943#ifdef DEBUG
944 assert (mctx->state_log != NULL);
945#endif
946 match_last = mctx->match_last;
947 halt_node = mctx->last_node;
948 sifted_states = re_xmalloc (re_dfastate_t *, match_last + 1);
949 if (BE (sifted_states == NULL, 0))
950 {
951 ret = REG_ESPACE;
952 goto free_return;
953 }
954 if (dfa->nbackref)
955 {
956 lim_states = re_xmalloc (re_dfastate_t *, match_last + 1);
957 if (BE (lim_states == NULL, 0))
958 {
959 ret = REG_ESPACE;
960 goto free_return;
961 }
962 while (1)
963 {
964 memset (lim_states, '\0',
965 sizeof (re_dfastate_t *) * (match_last + 1));
966 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
967 match_last);
968 ret = sift_states_backward (mctx, &sctx);
969 re_node_set_free (&sctx.limits);
970 if (BE (ret != REG_NOERROR, 0))
971 goto free_return;
972 if (sifted_states[0] != NULL || lim_states[0] != NULL)
973 break;
974 do
975 {
976 --match_last;
977 if (! REG_VALID_INDEX (match_last))
978 {
979 ret = REG_NOMATCH;
980 goto free_return;
981 }
982 } while (mctx->state_log[match_last] == NULL
983 || !mctx->state_log[match_last]->halt);
984 halt_node = check_halt_state_context (mctx,
985 mctx->state_log[match_last],
986 match_last);
987 }
988 ret = merge_state_array (dfa, sifted_states, lim_states,
989 match_last + 1);
990 re_free (lim_states);
991 lim_states = NULL;
992 if (BE (ret != REG_NOERROR, 0))
993 goto free_return;
994 }
995 else
996 {
997 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
998 ret = sift_states_backward (mctx, &sctx);
999 re_node_set_free (&sctx.limits);
1000 if (BE (ret != REG_NOERROR, 0))
1001 goto free_return;
1002 }
1003 re_free (mctx->state_log);
1004 mctx->state_log = sifted_states;
1005 sifted_states = NULL;
1006 mctx->last_node = halt_node;
1007 mctx->match_last = match_last;
1008 ret = REG_NOERROR;
1009 free_return:
1010 re_free (sifted_states);
1011 re_free (lim_states);
1012 return ret;
1013}
1014
1015/* Acquire an initial state and return it.
1016 We must select appropriate initial state depending on the context,
1017 since initial states may have constraints like "\<", "^", etc.. */
1018
1019static inline re_dfastate_t *
1020__attribute ((always_inline)) internal_function
1021acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1022 Idx idx)
1023{
1024 re_dfa_t *const dfa = mctx->dfa;
1025 if (dfa->init_state->has_constraint)
1026 {
1027 unsigned int context;
1028 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1029 if (IS_WORD_CONTEXT (context))
1030 return dfa->init_state_word;
1031 else if (IS_ORDINARY_CONTEXT (context))
1032 return dfa->init_state;
1033 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1034 return dfa->init_state_begbuf;
1035 else if (IS_NEWLINE_CONTEXT (context))
1036 return dfa->init_state_nl;
1037 else if (IS_BEGBUF_CONTEXT (context))
1038 {
1039 /* It is relatively rare case, then calculate on demand. */
1040 return re_acquire_state_context (err, dfa,
1041 dfa->init_state->entrance_nodes,
1042 context);
1043 }
1044 else
1045 /* Must not happen? */
1046 return dfa->init_state;
1047 }
1048 else
1049 return dfa->init_state;
1050}
1051
1052/* Check whether the regular expression match input string INPUT or not,
1053 and return the index where the matching end. Return REG_MISSING if
1054 there is no match, and return REG_ERROR in case of an error.
1055 FL_LONGEST_MATCH means we want the POSIX longest matching.
1056 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1057 next place where we may want to try matching.
1058 Note that the matcher assume that the maching starts from the current
1059 index of the buffer. */
1060
1061static Idx
1062internal_function
1063check_matching (re_match_context_t *mctx, bool fl_longest_match,
1064 Idx *p_match_first)
1065{
1066 re_dfa_t *const dfa = mctx->dfa;
1067 reg_errcode_t err;
1068 Idx match = 0;
1069 Idx match_last = REG_MISSING;
1070 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1071 re_dfastate_t *cur_state;
1072 bool at_init_state = p_match_first != NULL;
1073 Idx next_start_idx = cur_str_idx;
1074
1075 err = REG_NOERROR;
1076 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1077 /* An initial state must not be NULL (invalid). */
1078 if (BE (cur_state == NULL, 0))
1079 {
1080 assert (err == REG_ESPACE);
1081 return REG_ERROR;
1082 }
1083
1084 if (mctx->state_log != NULL)
1085 {
1086 mctx->state_log[cur_str_idx] = cur_state;
1087
1088 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1089 later. E.g. Processing back references. */
1090 if (BE (dfa->nbackref, 0))
1091 {
1092 at_init_state = false;
1093 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1094 if (BE (err != REG_NOERROR, 0))
1095 return err;
1096
1097 if (cur_state->has_backref)
1098 {
1099 err = transit_state_bkref (mctx, &cur_state->nodes);
1100 if (BE (err != REG_NOERROR, 0))
1101 return err;
1102 }
1103 }
1104 }
1105
1106 /* If the RE accepts NULL string. */
1107 if (BE (cur_state->halt, 0))
1108 {
1109 if (!cur_state->has_constraint
1110 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1111 {
1112 if (!fl_longest_match)
1113 return cur_str_idx;
1114 else
1115 {
1116 match_last = cur_str_idx;
1117 match = 1;
1118 }
1119 }
1120 }
1121
1122 while (!re_string_eoi (&mctx->input))
1123 {
1124 re_dfastate_t *old_state = cur_state;
1125 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1126
1127 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1128 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1129 && mctx->input.valid_len < mctx->input.len))
1130 {
1131 err = extend_buffers (mctx);
1132 if (BE (err != REG_NOERROR, 0))
1133 {
1134 assert (err == REG_ESPACE);
1135 return REG_ERROR;
1136 }
1137 }
1138
1139 cur_state = transit_state (&err, mctx, cur_state);
1140 if (mctx->state_log != NULL)
1141 cur_state = merge_state_with_log (&err, mctx, cur_state);
1142
1143 if (cur_state == NULL)
1144 {
1145 /* Reached the invalid state or an error. Try to recover a valid
1146 state using the state log, if available and if we have not
1147 already found a valid (even if not the longest) match. */
1148 if (BE (err != REG_NOERROR, 0))
1149 return REG_ERROR;
1150
1151 if (mctx->state_log == NULL
1152 || (match && !fl_longest_match)
1153 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1154 break;
1155 }
1156
1157 if (BE (at_init_state, 0))
1158 {
1159 if (old_state == cur_state)
1160 next_start_idx = next_char_idx;
1161 else
1162 at_init_state = false;
1163 }
1164
1165 if (cur_state->halt)
1166 {
1167 /* Reached a halt state.
1168 Check the halt state can satisfy the current context. */
1169 if (!cur_state->has_constraint
1170 || check_halt_state_context (mctx, cur_state,
1171 re_string_cur_idx (&mctx->input)))
1172 {
1173 /* We found an appropriate halt state. */
1174 match_last = re_string_cur_idx (&mctx->input);
1175 match = 1;
1176
1177 /* We found a match, do not modify match_first below. */
1178 p_match_first = NULL;
1179 if (!fl_longest_match)
1180 break;
1181 }
1182 }
1183 }
1184
1185 if (p_match_first)
1186 *p_match_first += next_start_idx;
1187
1188 return match_last;
1189}
1190
1191/* Check NODE match the current context. */
1192
1193static bool
1194internal_function
1195check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1196{
1197 re_token_type_t type = dfa->nodes[node].type;
1198 unsigned int constraint = dfa->nodes[node].constraint;
1199 if (type != END_OF_RE)
1200 return false;
1201 if (!constraint)
1202 return true;
1203 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1204 return false;
1205 return true;
1206}
1207
1208/* Check the halt state STATE match the current context.
1209 Return 0 if not match, if the node, STATE has, is a halt node and
1210 match the context, return the node. */
1211
1212static Idx
1213internal_function
1214check_halt_state_context (const re_match_context_t *mctx,
1215 const re_dfastate_t *state, Idx idx)
1216{
1217 Idx i;
1218 unsigned int context;
1219#ifdef DEBUG
1220 assert (state->halt);
1221#endif
1222 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1223 for (i = 0; i < state->nodes.nelem; ++i)
1224 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1225 return state->nodes.elems[i];
1226 return 0;
1227}
1228
1229/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1230 corresponding to the DFA).
1231 Return the destination node, and update EPS_VIA_NODES;
1232 return REG_MISSING in case of errors. */
1233
1234static Idx
1235internal_function
1236proceed_next_node (const re_match_context_t *mctx,
1237 Idx nregs, regmatch_t *regs, Idx *pidx, Idx node,
1238 re_node_set *eps_via_nodes, struct re_fail_stack_t *fs)
1239{
1240 re_dfa_t *const dfa = mctx->dfa;
1241 Idx i;
1242 bool ok;
1243 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1244 {
1245 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1246 re_node_set *edests = &dfa->edests[node];
1247 Idx dest_node;
1248 ok = re_node_set_insert (eps_via_nodes, node);
1249 if (BE (! ok, 0))
1250 return REG_ERROR;
1251 /* Pick up a valid destination, or return REG_MISSING if none
1252 is found. */
1253 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1254 {
1255 Idx candidate = edests->elems[i];
1256 if (!re_node_set_contains (cur_nodes, candidate))
1257 continue;
1258 if (dest_node == REG_MISSING)
1259 dest_node = candidate;
1260
1261 else
1262 {
1263 /* In order to avoid infinite loop like "(a*)*", return the second
1264 epsilon-transition if the first was already considered. */
1265 if (re_node_set_contains (eps_via_nodes, dest_node))
1266 return candidate;
1267
1268 /* Otherwise, push the second epsilon-transition on the fail stack. */
1269 else if (fs != NULL
1270 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1271 eps_via_nodes))
1272 return REG_ERROR;
1273
1274 /* We know we are going to exit. */
1275 break;
1276 }
1277 }
1278 return dest_node;
1279 }
1280 else
1281 {
1282 Idx naccepted = 0;
1283 re_token_type_t type = dfa->nodes[node].type;
1284
1285#ifdef RE_ENABLE_I18N
1286 if (dfa->nodes[node].accept_mb)
1287 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1288 else
1289#endif /* RE_ENABLE_I18N */
1290 if (type == OP_BACK_REF)
1291 {
1292 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1293 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1294 if (fs != NULL)
1295 {
1296 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1297 return REG_MISSING;
1298 else if (naccepted)
1299 {
1300 char *buf = (char *) re_string_get_buffer (&mctx->input);
1301 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1302 naccepted) != 0)
1303 return REG_MISSING;
1304 }
1305 }
1306
1307 if (naccepted == 0)
1308 {
1309 Idx dest_node;
1310 ok = re_node_set_insert (eps_via_nodes, node);
1311 if (BE (! ok, 0))
1312 return REG_ERROR;
1313 dest_node = dfa->edests[node].elems[0];
1314 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1315 dest_node))
1316 return dest_node;
1317 }
1318 }
1319
1320 if (naccepted != 0
1321 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1322 {
1323 Idx dest_node = dfa->nexts[node];
1324 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1325 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1326 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1327 dest_node)))
1328 return REG_MISSING;
1329 re_node_set_empty (eps_via_nodes);
1330 return dest_node;
1331 }
1332 }
1333 return REG_MISSING;
1334}
1335
1336static reg_errcode_t
1337internal_function
1338push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1339 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1340{
1341 reg_errcode_t err;
1342 Idx num = fs->num++;
1343 if (fs->num == fs->alloc)
1344 {
1345 struct re_fail_stack_ent_t *new_array =
1346 re_x2realloc (fs->stack, struct re_fail_stack_ent_t, &fs->alloc);
1347 if (new_array == NULL)
1348 return REG_ESPACE;
1349 fs->stack = new_array;
1350 }
1351 fs->stack[num].idx = str_idx;
1352 fs->stack[num].node = dest_node;
1353 fs->stack[num].regs = re_xmalloc (regmatch_t, nregs);
1354 if (fs->stack[num].regs == NULL)
1355 return REG_ESPACE;
1356 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1357 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1358 return err;
1359}
1360
1361static Idx
1362internal_function
1363pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx,
1364 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1365{
1366 Idx num = --fs->num;
1367 assert (REG_VALID_INDEX (num));
1368 *pidx = fs->stack[num].idx;
1369 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1370 re_node_set_free (eps_via_nodes);
1371 re_free (fs->stack[num].regs);
1372 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1373 return fs->stack[num].node;
1374}
1375
1376/* Set the positions where the subexpressions are starts/ends to registers
1377 PMATCH.
1378 Note: We assume that pmatch[0] is already set, and
1379 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1380
1381static reg_errcode_t
1382internal_function
1383set_regs (const regex_t *preg, const re_match_context_t *mctx,
1384 size_t nmatch, regmatch_t *pmatch, bool fl_backtrack)
1385{
1386 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1387 Idx idx, cur_node;
1388 re_node_set eps_via_nodes;
1389 struct re_fail_stack_t *fs;
1390 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1391 regmatch_t *prev_idx_match;
1392 bool prev_idx_match_malloced = false;
1393
1394#ifdef DEBUG
1395 assert (nmatch > 1);
1396 assert (mctx->state_log != NULL);
1397#endif
1398 if (fl_backtrack)
1399 {
1400 fs = &fs_body;
1401 fs->stack = re_xmalloc (struct re_fail_stack_ent_t, fs->alloc);
1402 if (fs->stack == NULL)
1403 return REG_ESPACE;
1404 }
1405 else
1406 fs = NULL;
1407
1408 cur_node = dfa->init_node;
1409 re_node_set_init_empty (&eps_via_nodes);
1410
1411 if (re_alloc_oversized (nmatch, sizeof (regmatch_t)))
1412 {
1413 free_fail_stack_return (fs);
1414 return REG_ESPACE;
1415 }
1416 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1417 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1418 else
1419 {
1420 prev_idx_match = re_malloc (regmatch_t, nmatch);
1421 if (prev_idx_match == NULL)
1422 {
1423 free_fail_stack_return (fs);
1424 return REG_ESPACE;
1425 }
1426 prev_idx_match_malloced = true;
1427 }
1428 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1429
1430 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1431 {
1432 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1433
1434 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1435 {
1436 Idx reg_idx;
1437 if (fs)
1438 {
1439 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1440 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1441 break;
1442 if (reg_idx == nmatch)
1443 {
1444 re_node_set_free (&eps_via_nodes);
1445 if (prev_idx_match_malloced)
1446 re_free (prev_idx_match);
1447 return free_fail_stack_return (fs);
1448 }
1449 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1450 &eps_via_nodes);
1451 }
1452 else
1453 {
1454 re_node_set_free (&eps_via_nodes);
1455 if (prev_idx_match_malloced)
1456 re_free (prev_idx_match);
1457 return REG_NOERROR;
1458 }
1459 }
1460
1461 /* Proceed to next node. */
1462 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1463 &eps_via_nodes, fs);
1464
1465 if (BE (! REG_VALID_INDEX (cur_node), 0))
1466 {
1467 if (BE (cur_node == REG_ERROR, 0))
1468 {
1469 re_node_set_free (&eps_via_nodes);
1470 if (prev_idx_match_malloced)
1471 re_free (prev_idx_match);
1472 free_fail_stack_return (fs);
1473 return REG_ESPACE;
1474 }
1475 if (fs)
1476 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1477 &eps_via_nodes);
1478 else
1479 {
1480 re_node_set_free (&eps_via_nodes);
1481 if (prev_idx_match_malloced)
1482 re_free (prev_idx_match);
1483 return REG_NOMATCH;
1484 }
1485 }
1486 }
1487 re_node_set_free (&eps_via_nodes);
1488 if (prev_idx_match_malloced)
1489 re_free (prev_idx_match);
1490 return free_fail_stack_return (fs);
1491}
1492
1493static reg_errcode_t
1494internal_function
1495free_fail_stack_return (struct re_fail_stack_t *fs)
1496{
1497 if (fs)
1498 {
1499 Idx fs_idx;
1500 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1501 {
1502 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1503 re_free (fs->stack[fs_idx].regs);
1504 }
1505 re_free (fs->stack);
1506 }
1507 return REG_NOERROR;
1508}
1509
1510static void
1511internal_function
1512update_regs (re_dfa_t *dfa, regmatch_t *pmatch, regmatch_t *prev_idx_match,
1513 Idx cur_node, Idx cur_idx, Idx nmatch)
1514{
1515 int type = dfa->nodes[cur_node].type;
1516 if (type == OP_OPEN_SUBEXP)
1517 {
1518 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1519
1520 /* We are at the first node of this sub expression. */
1521 if (reg_num < nmatch)
1522 {
1523 pmatch[reg_num].rm_so = cur_idx;
1524 pmatch[reg_num].rm_eo = -1;
1525 }
1526 }
1527 else if (type == OP_CLOSE_SUBEXP)
1528 {
1529 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1530 if (reg_num < nmatch)
1531 {
1532 /* We are at the last node of this sub expression. */
1533 if (pmatch[reg_num].rm_so < cur_idx)
1534 {
1535 pmatch[reg_num].rm_eo = cur_idx;
1536 /* This is a non-empty match or we are not inside an optional
1537 subexpression. Accept this right away. */
1538 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1539 }
1540 else
1541 {
1542 if (dfa->nodes[cur_node].opt_subexp
1543 && prev_idx_match[reg_num].rm_so != -1)
1544 /* We transited through an empty match for an optional
1545 subexpression, like (a?)*, and this is not the subexp's
1546 first match. Copy back the old content of the registers
1547 so that matches of an inner subexpression are undone as
1548 well, like in ((a?))*. */
1549 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1550 else
1551 /* We completed a subexpression, but it may be part of
1552 an optional one, so do not update PREV_IDX_MATCH. */
1553 pmatch[reg_num].rm_eo = cur_idx;
1554 }
1555 }
1556 }
1557}
1558
/* This function checks the STATE_LOG from SCTX->last_str_idx down to 0
   and sifts the nodes in each state according to the following rules.
   The updated state log is written to SCTX->sifted_states.

   Rules: We throw away the node `a' in STATE_LOG[STR_IDX] if...
     1. When STR_IDX == MATCH_LAST (the last index in the state_log):
	If `a' isn't the LAST_NODE and `a' can't epsilon-transit to
	the LAST_NODE, we throw away the node `a'.
     2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
	string `s' and transits to `b':
	i. If `b' isn't in STATE_LOG[STR_IDX+strlen('s')], we throw
	   away the node `a'.
	ii. If `b' is in STATE_LOG[STR_IDX+strlen('s')] but `b' is
	    thrown away, we throw away the node `a'.
     3. When 0 <= STR_IDX < MATCH_LAST and `a' epsilon-transits to `b':
	i. If `b' isn't in STATE_LOG[STR_IDX], we throw away the
	   node `a'.
	ii. If `b' is in STATE_LOG[STR_IDX] but `b' is thrown away,
	    we throw away the node `a'.  */
1578
1579#define STATE_NODE_CONTAINS(state,node) \
1580 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1581
1582static reg_errcode_t
1583internal_function
1584sift_states_backward (re_match_context_t *mctx, re_sift_context_t *sctx)
1585{
1586 reg_errcode_t err;
1587 int null_cnt = 0;
1588 Idx str_idx = sctx->last_str_idx;
1589 re_node_set cur_dest;
1590
1591#ifdef DEBUG
1592 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1593#endif
1594
1595 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1596 transit to the last_node and the last_node itself. */
1597 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1598 if (BE (err != REG_NOERROR, 0))
1599 return err;
1600 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1601 if (BE (err != REG_NOERROR, 0))
1602 goto free_return;
1603
1604 /* Then check each states in the state_log. */
1605 while (str_idx > 0)
1606 {
1607 /* Update counters. */
1608 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1609 if (null_cnt > mctx->max_mb_elem_len)
1610 {
1611 memset (sctx->sifted_states, '\0',
1612 sizeof (re_dfastate_t *) * str_idx);
1613 re_node_set_free (&cur_dest);
1614 return REG_NOERROR;
1615 }
1616 re_node_set_empty (&cur_dest);
1617 --str_idx;
1618
1619 if (mctx->state_log[str_idx])
1620 {
1621 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1622 if (BE (err != REG_NOERROR, 0))
1623 goto free_return;
1624 }
1625
1626 /* Add all the nodes which satisfy the following conditions:
1627 - It can epsilon transit to a node in CUR_DEST.
1628 - It is in CUR_SRC.
1629 And update state_log. */
1630 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1631 if (BE (err != REG_NOERROR, 0))
1632 goto free_return;
1633 }
1634 err = REG_NOERROR;
1635 free_return:
1636 re_node_set_free (&cur_dest);
1637 return err;
1638}
1639
1640static reg_errcode_t
1641internal_function
1642build_sifted_states (re_match_context_t *mctx, re_sift_context_t *sctx,
1643 Idx str_idx, re_node_set *cur_dest)
1644{
1645 re_dfa_t *const dfa = mctx->dfa;
1646 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1647 Idx i;
1648
1649 /* Then build the next sifted state.
1650 We build the next sifted state on `cur_dest', and update
1651 `sifted_states[str_idx]' with `cur_dest'.
1652 Note:
1653 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1654 `cur_src' points the node_set of the old `state_log[str_idx]'
1655 (with the epsilon nodes pre-filtered out). */
1656 for (i = 0; i < cur_src->nelem; i++)
1657 {
1658 Idx prev_node = cur_src->elems[i];
1659 int naccepted = 0;
1660 bool ok;
1661
1662#ifdef DEBUG
1663 re_token_type_t type = dfa->nodes[prev_node].type;
1664 assert (!IS_EPSILON_NODE (type));
1665#endif
1666#ifdef RE_ENABLE_I18N
1667 /* If the node may accept `multi byte'. */
1668 if (dfa->nodes[prev_node].accept_mb)
1669 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1670 str_idx, sctx->last_str_idx);
1671#endif /* RE_ENABLE_I18N */
1672
1673 /* We don't check backreferences here.
1674 See update_cur_sifted_state(). */
1675 if (!naccepted
1676 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1677 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1678 dfa->nexts[prev_node]))
1679 naccepted = 1;
1680
1681 if (naccepted == 0)
1682 continue;
1683
1684 if (sctx->limits.nelem)
1685 {
1686 Idx to_idx = str_idx + naccepted;
1687 if (check_dst_limits (mctx, &sctx->limits,
1688 dfa->nexts[prev_node], to_idx,
1689 prev_node, str_idx))
1690 continue;
1691 }
1692 ok = re_node_set_insert (cur_dest, prev_node);
1693 if (BE (! ok, 0))
1694 return REG_ESPACE;
1695 }
1696
1697 return REG_NOERROR;
1698}
1699
1700/* Helper functions. */
1701
1702static reg_errcode_t
1703internal_function
1704clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1705{
1706 Idx top = mctx->state_log_top;
1707
1708 if (next_state_log_idx >= mctx->input.bufs_len
1709 || (next_state_log_idx >= mctx->input.valid_len
1710 && mctx->input.valid_len < mctx->input.len))
1711 {
1712 reg_errcode_t err;
1713 err = extend_buffers (mctx);
1714 if (BE (err != REG_NOERROR, 0))
1715 return err;
1716 }
1717
1718 if (top < next_state_log_idx)
1719 {
1720 memset (mctx->state_log + top + 1, '\0',
1721 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1722 mctx->state_log_top = next_state_log_idx;
1723 }
1724 return REG_NOERROR;
1725}
1726
1727static reg_errcode_t
1728internal_function
1729merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src,
1730 Idx num)
1731{
1732 Idx st_idx;
1733 reg_errcode_t err;
1734 for (st_idx = 0; st_idx < num; ++st_idx)
1735 {
1736 if (dst[st_idx] == NULL)
1737 dst[st_idx] = src[st_idx];
1738 else if (src[st_idx] != NULL)
1739 {
1740 re_node_set merged_set;
1741 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1742 &src[st_idx]->nodes);
1743 if (BE (err != REG_NOERROR, 0))
1744 return err;
1745 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1746 re_node_set_free (&merged_set);
1747 if (BE (err != REG_NOERROR, 0))
1748 return err;
1749 }
1750 }
1751 return REG_NOERROR;
1752}
1753
1754static reg_errcode_t
1755internal_function
1756update_cur_sifted_state (re_match_context_t *mctx, re_sift_context_t *sctx,
1757 Idx str_idx, re_node_set *dest_nodes)
1758{
1759 re_dfa_t *const dfa = mctx->dfa;
1760 reg_errcode_t err;
1761 const re_node_set *candidates;
1762 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1763 : &mctx->state_log[str_idx]->nodes);
1764
1765 if (dest_nodes->nelem == 0)
1766 sctx->sifted_states[str_idx] = NULL;
1767 else
1768 {
1769 if (candidates)
1770 {
1771 /* At first, add the nodes which can epsilon transit to a node in
1772 DEST_NODE. */
1773 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1774 if (BE (err != REG_NOERROR, 0))
1775 return err;
1776
1777 /* Then, check the limitations in the current sift_context. */
1778 if (sctx->limits.nelem)
1779 {
1780 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1781 mctx->bkref_ents, str_idx);
1782 if (BE (err != REG_NOERROR, 0))
1783 return err;
1784 }
1785 }
1786
1787 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1788 if (BE (err != REG_NOERROR, 0))
1789 return err;
1790 }
1791
1792 if (candidates && mctx->state_log[str_idx]->has_backref)
1793 {
1794 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1795 if (BE (err != REG_NOERROR, 0))
1796 return err;
1797 }
1798 return REG_NOERROR;
1799}
1800
1801static reg_errcode_t
1802internal_function
1803add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes,
1804 const re_node_set *candidates)
1805{
1806 reg_errcode_t err = REG_NOERROR;
1807 Idx i;
1808
1809 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1810 if (BE (err != REG_NOERROR, 0))
1811 return err;
1812
1813 if (!state->inveclosure.alloc)
1814 {
1815 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1816 if (BE (err != REG_NOERROR, 0))
1817 return REG_ESPACE;
1818 for (i = 0; i < dest_nodes->nelem; i++)
1819 re_node_set_merge (&state->inveclosure,
1820 dfa->inveclosures + dest_nodes->elems[i]);
1821 }
1822 return re_node_set_add_intersect (dest_nodes, candidates,
1823 &state->inveclosure);
1824}
1825
1826static reg_errcode_t
1827internal_function
1828sub_epsilon_src_nodes (re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1829 const re_node_set *candidates)
1830{
1831 Idx ecl_idx;
1832 reg_errcode_t err;
1833 re_node_set *inv_eclosure = dfa->inveclosures + node;
1834 re_node_set except_nodes;
1835 re_node_set_init_empty (&except_nodes);
1836 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1837 {
1838 Idx cur_node = inv_eclosure->elems[ecl_idx];
1839 if (cur_node == node)
1840 continue;
1841 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1842 {
1843 Idx edst1 = dfa->edests[cur_node].elems[0];
1844 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1845 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1846 if ((!re_node_set_contains (inv_eclosure, edst1)
1847 && re_node_set_contains (dest_nodes, edst1))
1848 || (REG_VALID_NONZERO_INDEX (edst2)
1849 && !re_node_set_contains (inv_eclosure, edst2)
1850 && re_node_set_contains (dest_nodes, edst2)))
1851 {
1852 err = re_node_set_add_intersect (&except_nodes, candidates,
1853 dfa->inveclosures + cur_node);
1854 if (BE (err != REG_NOERROR, 0))
1855 {
1856 re_node_set_free (&except_nodes);
1857 return err;
1858 }
1859 }
1860 }
1861 }
1862 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1863 {
1864 Idx cur_node = inv_eclosure->elems[ecl_idx];
1865 if (!re_node_set_contains (&except_nodes, cur_node))
1866 {
1867 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1868 re_node_set_remove_at (dest_nodes, idx);
1869 }
1870 }
1871 re_node_set_free (&except_nodes);
1872 return REG_NOERROR;
1873}
1874
1875static bool
1876internal_function
1877check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1878 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1879{
1880 re_dfa_t *const dfa = mctx->dfa;
1881 Idx lim_idx, src_pos, dst_pos;
1882
1883 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1884 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1885 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1886 {
1887 Idx subexp_idx;
1888 struct re_backref_cache_entry *ent;
1889 ent = mctx->bkref_ents + limits->elems[lim_idx];
1890 subexp_idx = dfa->nodes[ent->node].opr.idx;
1891
1892 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1893 subexp_idx, dst_node, dst_idx,
1894 dst_bkref_idx);
1895 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1896 subexp_idx, src_node, src_idx,
1897 src_bkref_idx);
1898
1899 /* In case of:
1900 <src> <dst> ( <subexp> )
1901 ( <subexp> ) <src> <dst>
1902 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1903 if (src_pos == dst_pos)
1904 continue; /* This is unrelated limitation. */
1905 else
1906 return true;
1907 }
1908 return false;
1909}
1910
1911static int
1912internal_function
1913check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1914 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1915{
1916 re_dfa_t *const dfa = mctx->dfa;
1917 re_node_set *eclosures = dfa->eclosures + from_node;
1918 Idx node_idx;
1919
1920 /* Else, we are on the boundary: examine the nodes on the epsilon
1921 closure. */
1922 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1923 {
1924 Idx node = eclosures->elems[node_idx];
1925 switch (dfa->nodes[node].type)
1926 {
1927 case OP_BACK_REF:
1928 if (bkref_idx != REG_MISSING)
1929 {
1930 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1931 do
1932 {
1933 Idx dst;
1934 int cpos;
1935
1936 if (ent->node != node)
1937 continue;
1938
1939 if (subexp_idx < BITSET_WORD_BITS
1940 && !(ent->eps_reachable_subexps_map
1941 & ((bitset_word) 1 << subexp_idx)))
1942 continue;
1943
1944 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1945 OP_CLOSE_SUBEXP cases below. But, if the
1946 destination node is the same node as the source
1947 node, don't recurse because it would cause an
1948 infinite loop: a regex that exhibits this behavior
1949 is ()\1*\1* */
1950 dst = dfa->edests[node].elems[0];
1951 if (dst == from_node)
1952 {
1953 if (boundaries & 1)
1954 return -1;
1955 else /* if (boundaries & 2) */
1956 return 0;
1957 }
1958
1959 cpos =
1960 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1961 dst, bkref_idx);
1962 if (cpos == -1 /* && (boundaries & 1) */)
1963 return -1;
1964 if (cpos == 0 && (boundaries & 2))
1965 return 0;
1966
1967 if (subexp_idx < BITSET_WORD_BITS)
1968 ent->eps_reachable_subexps_map &=
1969 ~ ((bitset_word) 1 << subexp_idx);
1970 }
1971 while (ent++->more);
1972 }
1973 break;
1974
1975 case OP_OPEN_SUBEXP:
1976 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1977 return -1;
1978 break;
1979
1980 case OP_CLOSE_SUBEXP:
1981 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1982 return 0;
1983 break;
1984
1985 default:
1986 break;
1987 }
1988 }
1989
1990 return (boundaries & 2) ? 1 : 0;
1991}
1992
1993static int
1994internal_function
1995check_dst_limits_calc_pos (const re_match_context_t *mctx,
1996 Idx limit, Idx subexp_idx,
1997 Idx from_node, Idx str_idx, Idx bkref_idx)
1998{
1999 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2000 int boundaries;
2001
2002 /* If we are outside the range of the subexpression, return -1 or 1. */
2003 if (str_idx < lim->subexp_from)
2004 return -1;
2005
2006 if (lim->subexp_to < str_idx)
2007 return 1;
2008
2009 /* If we are within the subexpression, return 0. */
2010 boundaries = (str_idx == lim->subexp_from);
2011 boundaries |= (str_idx == lim->subexp_to) << 1;
2012 if (boundaries == 0)
2013 return 0;
2014
2015 /* Else, examine epsilon closure. */
2016 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2017 from_node, bkref_idx);
2018}
2019
2020/* Check the limitations of sub expressions LIMITS, and remove the nodes
2021 which are against limitations from DEST_NODES. */
2022
2023static reg_errcode_t
2024internal_function
2025check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes,
2026 const re_node_set *candidates, re_node_set *limits,
2027 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2028{
2029 reg_errcode_t err;
2030 Idx node_idx, lim_idx;
2031
2032 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2033 {
2034 Idx subexp_idx;
2035 struct re_backref_cache_entry *ent;
2036 ent = bkref_ents + limits->elems[lim_idx];
2037
2038 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2039 continue; /* This is unrelated limitation. */
2040
2041 subexp_idx = dfa->nodes[ent->node].opr.idx;
2042 if (ent->subexp_to == str_idx)
2043 {
2044 Idx ops_node = REG_MISSING;
2045 Idx cls_node = REG_MISSING;
2046 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2047 {
2048 Idx node = dest_nodes->elems[node_idx];
2049 re_token_type_t type = dfa->nodes[node].type;
2050 if (type == OP_OPEN_SUBEXP
2051 && subexp_idx == dfa->nodes[node].opr.idx)
2052 ops_node = node;
2053 else if (type == OP_CLOSE_SUBEXP
2054 && subexp_idx == dfa->nodes[node].opr.idx)
2055 cls_node = node;
2056 }
2057
2058 /* Check the limitation of the open subexpression. */
2059 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2060 if (REG_VALID_INDEX (ops_node))
2061 {
2062 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2063 candidates);
2064 if (BE (err != REG_NOERROR, 0))
2065 return err;
2066 }
2067
2068 /* Check the limitation of the close subexpression. */
2069 if (REG_VALID_INDEX (cls_node))
2070 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2071 {
2072 Idx node = dest_nodes->elems[node_idx];
2073 if (!re_node_set_contains (dfa->inveclosures + node,
2074 cls_node)
2075 && !re_node_set_contains (dfa->eclosures + node,
2076 cls_node))
2077 {
2078 /* It is against this limitation.
2079 Remove it form the current sifted state. */
2080 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2081 candidates);
2082 if (BE (err != REG_NOERROR, 0))
2083 return err;
2084 --node_idx;
2085 }
2086 }
2087 }
2088 else /* (ent->subexp_to != str_idx) */
2089 {
2090 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2091 {
2092 Idx node = dest_nodes->elems[node_idx];
2093 re_token_type_t type = dfa->nodes[node].type;
2094 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2095 {
2096 if (subexp_idx != dfa->nodes[node].opr.idx)
2097 continue;
2098 /* It is against this limitation.
2099 Remove it form the current sifted state. */
2100 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2101 candidates);
2102 if (BE (err != REG_NOERROR, 0))
2103 return err;
2104 }
2105 }
2106 }
2107 }
2108 return REG_NOERROR;
2109}
2110
2111static reg_errcode_t
2112internal_function
2113sift_states_bkref (re_match_context_t *mctx, re_sift_context_t *sctx,
2114 Idx str_idx, const re_node_set *candidates)
2115{
2116 re_dfa_t *const dfa = mctx->dfa;
2117 reg_errcode_t err;
2118 Idx node_idx, node;
2119 re_sift_context_t local_sctx;
2120 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2121
2122 if (first_idx == REG_MISSING)
2123 return REG_NOERROR;
2124
2125 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2126
2127 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2128 {
2129 Idx enabled_idx;
2130 re_token_type_t type;
2131 struct re_backref_cache_entry *entry;
2132 node = candidates->elems[node_idx];
2133 type = dfa->nodes[node].type;
2134 /* Avoid infinite loop for the REs like "()\1+". */
2135 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2136 continue;
2137 if (type != OP_BACK_REF)
2138 continue;
2139
2140 entry = mctx->bkref_ents + first_idx;
2141 enabled_idx = first_idx;
2142 do
2143 {
2144 bool ok;
2145 Idx subexp_len, to_idx, dst_node;
2146 re_dfastate_t *cur_state;
2147
2148 if (entry->node != node)
2149 continue;
2150 subexp_len = entry->subexp_to - entry->subexp_from;
2151 to_idx = str_idx + subexp_len;
2152 dst_node = (subexp_len ? dfa->nexts[node]
2153 : dfa->edests[node].elems[0]);
2154
2155 if (to_idx > sctx->last_str_idx
2156 || sctx->sifted_states[to_idx] == NULL
2157 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2158 || check_dst_limits (mctx, &sctx->limits, node,
2159 str_idx, dst_node, to_idx))
2160 continue;
2161
2162 if (local_sctx.sifted_states == NULL)
2163 {
2164 local_sctx = *sctx;
2165 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2166 if (BE (err != REG_NOERROR, 0))
2167 goto free_return;
2168 }
2169 local_sctx.last_node = node;
2170 local_sctx.last_str_idx = str_idx;
2171 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2172 if (BE (! ok, 0))
2173 {
2174 err = REG_ESPACE;
2175 goto free_return;
2176 }
2177 cur_state = local_sctx.sifted_states[str_idx];
2178 err = sift_states_backward (mctx, &local_sctx);
2179 if (BE (err != REG_NOERROR, 0))
2180 goto free_return;
2181 if (sctx->limited_states != NULL)
2182 {
2183 err = merge_state_array (dfa, sctx->limited_states,
2184 local_sctx.sifted_states,
2185 str_idx + 1);
2186 if (BE (err != REG_NOERROR, 0))
2187 goto free_return;
2188 }
2189 local_sctx.sifted_states[str_idx] = cur_state;
2190 re_node_set_remove (&local_sctx.limits, enabled_idx);
2191
2192 /* mctx->bkref_ents may have changed, reload the pointer. */
2193 entry = mctx->bkref_ents + enabled_idx;
2194 }
2195 while (enabled_idx++, entry++->more);
2196 }
2197 err = REG_NOERROR;
2198 free_return:
2199 if (local_sctx.sifted_states != NULL)
2200 {
2201 re_node_set_free (&local_sctx.limits);
2202 }
2203
2204 return err;
2205}
2206
2207
2208#ifdef RE_ENABLE_I18N
2209static int
2210internal_function
2211sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2212 Idx node_idx, Idx str_idx, Idx max_str_idx)
2213{
2214 re_dfa_t *const dfa = mctx->dfa;
2215 int naccepted;
2216 /* Check the node can accept `multi byte'. */
2217 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2218 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2219 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2220 dfa->nexts[node_idx]))
2221 /* The node can't accept the `multi byte', or the
2222 destination was already thrown away, then the node
2223 could't accept the current input `multi byte'. */
2224 naccepted = 0;
2225 /* Otherwise, it is sure that the node could accept
2226 `naccepted' bytes input. */
2227 return naccepted;
2228}
2229#endif /* RE_ENABLE_I18N */
2230
2231
2232/* Functions for state transition. */
2233
2234/* Return the next state to which the current state STATE will transit by
2235 accepting the current input byte, and update STATE_LOG if necessary.
2236 If STATE can accept a multibyte char/collating element/back reference
2237 update the destination of STATE_LOG. */
2238
2239static re_dfastate_t *
2240internal_function
2241transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2242 re_dfastate_t *state)
2243{
2244 re_dfastate_t **trtable;
2245 unsigned char ch;
2246
2247#ifdef RE_ENABLE_I18N
2248 /* If the current state can accept multibyte. */
2249 if (BE (state->accept_mb, 0))
2250 {
2251 *err = transit_state_mb (mctx, state);
2252 if (BE (*err != REG_NOERROR, 0))
2253 return NULL;
2254 }
2255#endif /* RE_ENABLE_I18N */
2256
2257 /* Then decide the next state with the single byte. */
2258#if 0
2259 if (0)
2260 /* don't use transition table */
2261 return transit_state_sb (err, mctx, state);
2262#endif
2263
2264 /* Use transition table */
2265 ch = re_string_fetch_byte (&mctx->input);
2266 for (;;)
2267 {
2268 trtable = state->trtable;
2269 if (BE (trtable != NULL, 1))
2270 return trtable[ch];
2271
2272 trtable = state->word_trtable;
2273 if (BE (trtable != NULL, 1))
2274 {
2275 unsigned int context;
2276 context
2277 = re_string_context_at (&mctx->input,
2278 re_string_cur_idx (&mctx->input) - 1,
2279 mctx->eflags);
2280 if (IS_WORD_CONTEXT (context))
2281 return trtable[ch + SBC_MAX];
2282 else
2283 return trtable[ch];
2284 }
2285
2286 if (!build_trtable (mctx->dfa, state))
2287 {
2288 *err = REG_ESPACE;
2289 return NULL;
2290 }
2291
2292 /* Retry, we now have a transition table. */
2293 }
2294}
2295
2296/* Update the state_log if we need */
2297re_dfastate_t *
2298internal_function
2299merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2300 re_dfastate_t *next_state)
2301{
2302 re_dfa_t *const dfa = mctx->dfa;
2303 Idx cur_idx = re_string_cur_idx (&mctx->input);
2304
2305 if (cur_idx > mctx->state_log_top)
2306 {
2307 mctx->state_log[cur_idx] = next_state;
2308 mctx->state_log_top = cur_idx;
2309 }
2310 else if (mctx->state_log[cur_idx] == 0)
2311 {
2312 mctx->state_log[cur_idx] = next_state;
2313 }
2314 else
2315 {
2316 re_dfastate_t *pstate;
2317 unsigned int context;
2318 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2319 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2320 the destination of a multibyte char/collating element/
2321 back reference. Then the next state is the union set of
2322 these destinations and the results of the transition table. */
2323 pstate = mctx->state_log[cur_idx];
2324 log_nodes = pstate->entrance_nodes;
2325 if (next_state != NULL)
2326 {
2327 table_nodes = next_state->entrance_nodes;
2328 *err = re_node_set_init_union (&next_nodes, table_nodes,
2329 log_nodes);
2330 if (BE (*err != REG_NOERROR, 0))
2331 return NULL;
2332 }
2333 else
2334 next_nodes = *log_nodes;
2335 /* Note: We already add the nodes of the initial state,
2336 then we don't need to add them here. */
2337
2338 context = re_string_context_at (&mctx->input,
2339 re_string_cur_idx (&mctx->input) - 1,
2340 mctx->eflags);
2341 next_state = mctx->state_log[cur_idx]
2342 = re_acquire_state_context (err, dfa, &next_nodes, context);
2343 /* We don't need to check errors here, since the return value of
2344 this function is next_state and ERR is already set. */
2345
2346 if (table_nodes != NULL)
2347 re_node_set_free (&next_nodes);
2348 }
2349
2350 if (BE (dfa->nbackref, 0) && next_state != NULL)
2351 {
2352 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2353 later. We must check them here, since the back references in the
2354 next state might use them. */
2355 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2356 cur_idx);
2357 if (BE (*err != REG_NOERROR, 0))
2358 return NULL;
2359
2360 /* If the next state has back references. */
2361 if (next_state->has_backref)
2362 {
2363 *err = transit_state_bkref (mctx, &next_state->nodes);
2364 if (BE (*err != REG_NOERROR, 0))
2365 return NULL;
2366 next_state = mctx->state_log[cur_idx];
2367 }
2368 }
2369
2370 return next_state;
2371}
2372
2373/* Skip bytes in the input that correspond to part of a
2374 multi-byte match, then look in the log for a state
2375 from which to restart matching. */
2376static re_dfastate_t *
2377internal_function
2378find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2379{
2380 re_dfastate_t *cur_state = NULL;
2381 do
2382 {
2383 Idx max = mctx->state_log_top;
2384 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2385
2386 do
2387 {
2388 if (++cur_str_idx > max)
2389 return NULL;
2390 re_string_skip_bytes (&mctx->input, 1);
2391 }
2392 while (mctx->state_log[cur_str_idx] == NULL);
2393
2394 cur_state = merge_state_with_log (err, mctx, NULL);
2395 }
2396 while (*err == REG_NOERROR && cur_state == NULL);
2397 return cur_state;
2398}
2399
2400/* Helper functions for transit_state. */
2401
2402/* From the node set CUR_NODES, pick up the nodes whose types are
2403 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2404 expression. And register them to use them later for evaluating the
2405 correspoding back references. */
2406
2407static reg_errcode_t
2408internal_function
2409check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2410 Idx str_idx)
2411{
2412 re_dfa_t *const dfa = mctx->dfa;
2413 Idx node_idx;
2414 reg_errcode_t err;
2415
2416 /* TODO: This isn't efficient.
2417 Because there might be more than one nodes whose types are
2418 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2419 nodes.
2420 E.g. RE: (a){2} */
2421 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2422 {
2423 Idx node = cur_nodes->elems[node_idx];
2424 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2425 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2426 && (dfa->used_bkref_map
2427 & ((bitset_word) 1 << dfa->nodes[node].opr.idx)))
2428 {
2429 err = match_ctx_add_subtop (mctx, node, str_idx);
2430 if (BE (err != REG_NOERROR, 0))
2431 return err;
2432 }
2433 }
2434 return REG_NOERROR;
2435}
2436
2437#if 0
2438/* Return the next state to which the current state STATE will transit by
2439 accepting the current input byte. */
2440
2441static re_dfastate_t *
2442transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2443 re_dfastate_t *state)
2444{
2445 re_dfa_t *const dfa = mctx->dfa;
2446 re_node_set next_nodes;
2447 re_dfastate_t *next_state;
2448 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2449 unsigned int context;
2450
2451 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2452 if (BE (*err != REG_NOERROR, 0))
2453 return NULL;
2454 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2455 {
2456 Idx cur_node = state->nodes.elems[node_cnt];
2457 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2458 {
2459 *err = re_node_set_merge (&next_nodes,
2460 dfa->eclosures + dfa->nexts[cur_node]);
2461 if (BE (*err != REG_NOERROR, 0))
2462 {
2463 re_node_set_free (&next_nodes);
2464 return NULL;
2465 }
2466 }
2467 }
2468 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2469 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2470 /* We don't need to check errors here, since the return value of
2471 this function is next_state and ERR is already set. */
2472
2473 re_node_set_free (&next_nodes);
2474 re_string_skip_bytes (&mctx->input, 1);
2475 return next_state;
2476}
2477#endif
2478
2479#ifdef RE_ENABLE_I18N
2480static reg_errcode_t
2481internal_function
2482transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2483{
2484 re_dfa_t *const dfa = mctx->dfa;
2485 reg_errcode_t err;
2486 Idx i;
2487
2488 for (i = 0; i < pstate->nodes.nelem; ++i)
2489 {
2490 re_node_set dest_nodes, *new_nodes;
2491 Idx cur_node_idx = pstate->nodes.elems[i];
2492 int naccepted;
2493 Idx dest_idx;
2494 unsigned int context;
2495 re_dfastate_t *dest_state;
2496
2497 if (!dfa->nodes[cur_node_idx].accept_mb)
2498 continue;
2499
2500 if (dfa->nodes[cur_node_idx].constraint)
2501 {
2502 context = re_string_context_at (&mctx->input,
2503 re_string_cur_idx (&mctx->input),
2504 mctx->eflags);
2505 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2506 context))
2507 continue;
2508 }
2509
2510 /* How many bytes the node can accept? */
2511 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2512 re_string_cur_idx (&mctx->input));
2513 if (naccepted == 0)
2514 continue;
2515
2516 /* The node can accepts `naccepted' bytes. */
2517 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2518 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2519 : mctx->max_mb_elem_len);
2520 err = clean_state_log_if_needed (mctx, dest_idx);
2521 if (BE (err != REG_NOERROR, 0))
2522 return err;
2523#ifdef DEBUG
2524 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2525#endif
2526 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2527
2528 dest_state = mctx->state_log[dest_idx];
2529 if (dest_state == NULL)
2530 dest_nodes = *new_nodes;
2531 else
2532 {
2533 err = re_node_set_init_union (&dest_nodes,
2534 dest_state->entrance_nodes, new_nodes);
2535 if (BE (err != REG_NOERROR, 0))
2536 return err;
2537 }
2538 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2539 mctx->state_log[dest_idx]
2540 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2541 if (dest_state != NULL)
2542 re_node_set_free (&dest_nodes);
2543 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2544 return err;
2545 }
2546 return REG_NOERROR;
2547}
2548#endif /* RE_ENABLE_I18N */
2549
2550static reg_errcode_t
2551internal_function
2552transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2553{
2554 re_dfa_t *const dfa = mctx->dfa;
2555 reg_errcode_t err;
2556 Idx i;
2557 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2558
2559 for (i = 0; i < nodes->nelem; ++i)
2560 {
2561 Idx dest_str_idx, prev_nelem, bkc_idx;
2562 Idx node_idx = nodes->elems[i];
2563 unsigned int context;
2564 const re_token_t *node = dfa->nodes + node_idx;
2565 re_node_set *new_dest_nodes;
2566
2567 /* Check whether `node' is a backreference or not. */
2568 if (node->type != OP_BACK_REF)
2569 continue;
2570
2571 if (node->constraint)
2572 {
2573 context = re_string_context_at (&mctx->input, cur_str_idx,
2574 mctx->eflags);
2575 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2576 continue;
2577 }
2578
2579 /* `node' is a backreference.
2580 Check the substring which the substring matched. */
2581 bkc_idx = mctx->nbkref_ents;
2582 err = get_subexp (mctx, node_idx, cur_str_idx);
2583 if (BE (err != REG_NOERROR, 0))
2584 goto free_return;
2585
2586 /* And add the epsilon closures (which is `new_dest_nodes') of
2587 the backreference to appropriate state_log. */
2588#ifdef DEBUG
2589 assert (dfa->nexts[node_idx] != REG_MISSING);
2590#endif
2591 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2592 {
2593 Idx subexp_len;
2594 re_dfastate_t *dest_state;
2595 struct re_backref_cache_entry *bkref_ent;
2596 bkref_ent = mctx->bkref_ents + bkc_idx;
2597 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2598 continue;
2599 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2600 new_dest_nodes = (subexp_len == 0
2601 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2602 : dfa->eclosures + dfa->nexts[node_idx]);
2603 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2604 - bkref_ent->subexp_from);
2605 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2606 mctx->eflags);
2607 dest_state = mctx->state_log[dest_str_idx];
2608 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2609 : mctx->state_log[cur_str_idx]->nodes.nelem);
2610 /* Add `new_dest_node' to state_log. */
2611 if (dest_state == NULL)
2612 {
2613 mctx->state_log[dest_str_idx]
2614 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2615 context);
2616 if (BE (mctx->state_log[dest_str_idx] == NULL
2617 && err != REG_NOERROR, 0))
2618 goto free_return;
2619 }
2620 else
2621 {
2622 re_node_set dest_nodes;
2623 err = re_node_set_init_union (&dest_nodes,
2624 dest_state->entrance_nodes,
2625 new_dest_nodes);
2626 if (BE (err != REG_NOERROR, 0))
2627 {
2628 re_node_set_free (&dest_nodes);
2629 goto free_return;
2630 }
2631 mctx->state_log[dest_str_idx]
2632 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2633 re_node_set_free (&dest_nodes);
2634 if (BE (mctx->state_log[dest_str_idx] == NULL
2635 && err != REG_NOERROR, 0))
2636 goto free_return;
2637 }
2638 /* We need to check recursively if the backreference can epsilon
2639 transit. */
2640 if (subexp_len == 0
2641 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2642 {
2643 err = check_subexp_matching_top (mctx, new_dest_nodes,
2644 cur_str_idx);
2645 if (BE (err != REG_NOERROR, 0))
2646 goto free_return;
2647 err = transit_state_bkref (mctx, new_dest_nodes);
2648 if (BE (err != REG_NOERROR, 0))
2649 goto free_return;
2650 }
2651 }
2652 }
2653 err = REG_NOERROR;
2654 free_return:
2655 return err;
2656}
2657
2658/* Enumerate all the candidates which the backreference BKREF_NODE can match
2659 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2660 Note that we might collect inappropriate candidates here.
2661 However, the cost of checking them strictly here is too high, then we
2662 delay these checking for prune_impossible_nodes(). */
2663
2664static reg_errcode_t
2665internal_function
2666get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2667{
2668 re_dfa_t *const dfa = mctx->dfa;
2669 Idx subexp_num, sub_top_idx;
2670 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2671 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2672 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2673 if (cache_idx != REG_MISSING)
2674 {
2675 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2676 do
2677 if (entry->node == bkref_node)
2678 return REG_NOERROR; /* We already checked it. */
2679 while (entry++->more);
2680 }
2681
2682 subexp_num = dfa->nodes[bkref_node].opr.idx;
2683
2684 /* For each sub expression */
2685 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2686 {
2687 reg_errcode_t err;
2688 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2689 re_sub_match_last_t *sub_last;
2690 Idx sub_last_idx, sl_str, bkref_str_off;
2691
2692 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2693 continue; /* It isn't related. */
2694
2695 sl_str = sub_top->str_idx;
2696 bkref_str_off = bkref_str_idx;
2697 /* At first, check the last node of sub expressions we already
2698 evaluated. */
2699 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2700 {
2701 regoff_t sl_str_diff;
2702 sub_last = sub_top->lasts[sub_last_idx];
2703 sl_str_diff = sub_last->str_idx - sl_str;
2704 /* The matched string by the sub expression match with the substring
2705 at the back reference? */
2706 if (sl_str_diff > 0)
2707 {
2708 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2709 {
2710 /* Not enough chars for a successful match. */
2711 if (bkref_str_off + sl_str_diff > mctx->input.len)
2712 break;
2713
2714 err = clean_state_log_if_needed (mctx,
2715 bkref_str_off
2716 + sl_str_diff);
2717 if (BE (err != REG_NOERROR, 0))
2718 return err;
2719 buf = (const char *) re_string_get_buffer (&mctx->input);
2720 }
2721 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2722 break; /* We don't need to search this sub expression any more. */
2723 }
2724 bkref_str_off += sl_str_diff;
2725 sl_str += sl_str_diff;
2726 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2727 bkref_str_idx);
2728
2729 /* Reload buf, since the preceding call might have reallocated
2730 the buffer. */
2731 buf = (const char *) re_string_get_buffer (&mctx->input);
2732
2733 if (err == REG_NOMATCH)
2734 continue;
2735 if (BE (err != REG_NOERROR, 0))
2736 return err;
2737 }
2738
2739 if (sub_last_idx < sub_top->nlasts)
2740 continue;
2741 if (sub_last_idx > 0)
2742 ++sl_str;
2743 /* Then, search for the other last nodes of the sub expression. */
2744 for (; sl_str <= bkref_str_idx; ++sl_str)
2745 {
2746 Idx cls_node;
2747 regoff_t sl_str_off;
2748 const re_node_set *nodes;
2749 sl_str_off = sl_str - sub_top->str_idx;
2750 /* The matched string by the sub expression match with the substring
2751 at the back reference? */
2752 if (sl_str_off > 0)
2753 {
2754 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2755 {
2756 /* If we are at the end of the input, we cannot match. */
2757 if (bkref_str_off >= mctx->input.len)
2758 break;
2759
2760 err = extend_buffers (mctx);
2761 if (BE (err != REG_NOERROR, 0))
2762 return err;
2763
2764 buf = (const char *) re_string_get_buffer (&mctx->input);
2765 }
2766 if (buf [bkref_str_off++] != buf[sl_str - 1])
2767 break; /* We don't need to search this sub expression
2768 any more. */
2769 }
2770 if (mctx->state_log[sl_str] == NULL)
2771 continue;
2772 /* Does this state have a ')' of the sub expression? */
2773 nodes = &mctx->state_log[sl_str]->nodes;
2774 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2775 if (cls_node == REG_MISSING)
2776 continue; /* No. */
2777 if (sub_top->path == NULL)
2778 {
2779 sub_top->path = re_calloc (state_array_t,
2780 sl_str - sub_top->str_idx + 1);
2781 if (sub_top->path == NULL)
2782 return REG_ESPACE;
2783 }
2784 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2785 in the current context? */
2786 err = check_arrival (mctx, sub_top->path, sub_top->node,
2787 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2788 if (err == REG_NOMATCH)
2789 continue;
2790 if (BE (err != REG_NOERROR, 0))
2791 return err;
2792 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2793 if (BE (sub_last == NULL, 0))
2794 return REG_ESPACE;
2795 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2796 bkref_str_idx);
2797 if (err == REG_NOMATCH)
2798 continue;
2799 }
2800 }
2801 return REG_NOERROR;
2802}
2803
2804/* Helper functions for get_subexp(). */
2805
2806/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2807 If it can arrive, register the sub expression expressed with SUB_TOP
2808 and SUB_LAST. */
2809
2810static reg_errcode_t
2811internal_function
2812get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2813 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2814{
2815 reg_errcode_t err;
2816 Idx to_idx;
2817 /* Can the subexpression arrive the back reference? */
2818 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2819 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2820 if (err != REG_NOERROR)
2821 return err;
2822 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2823 sub_last->str_idx);
2824 if (BE (err != REG_NOERROR, 0))
2825 return err;
2826 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2827 return clean_state_log_if_needed (mctx, to_idx);
2828}
2829
2830/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2831 Search '(' if FL_OPEN, or search ')' otherwise.
2832 TODO: This function isn't efficient...
2833 Because there might be more than one nodes whose types are
2834 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2835 nodes.
2836 E.g. RE: (a){2} */
2837
2838static Idx
2839internal_function
2840find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2841 Idx subexp_idx, int type)
2842{
2843 Idx cls_idx;
2844 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2845 {
2846 Idx cls_node = nodes->elems[cls_idx];
2847 const re_token_t *node = dfa->nodes + cls_node;
2848 if (node->type == type
2849 && node->opr.idx == subexp_idx)
2850 return cls_node;
2851 }
2852 return REG_MISSING;
2853}
2854
2855/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2856 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2857 heavily reused.
2858 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2859
2860static reg_errcode_t
2861internal_function
2862check_arrival (re_match_context_t *mctx, state_array_t *path,
2863 Idx top_node, Idx top_str, Idx last_node, Idx last_str,
2864 int type)
2865{
2866 re_dfa_t *const dfa = mctx->dfa;
2867 reg_errcode_t err;
2868 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2869 re_dfastate_t *cur_state = NULL;
2870 re_node_set *cur_nodes, next_nodes;
2871 re_dfastate_t **backup_state_log;
2872 unsigned int context;
2873
2874 subexp_num = dfa->nodes[top_node].opr.idx;
2875 /* Extend the buffer if we need. */
2876 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2877 {
2878 re_dfastate_t **new_array;
2879 Idx old_alloc = path->alloc;
2880 Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2881 if (BE (new_alloc < old_alloc, 0))
2882 return REG_ESPACE;
2883 new_array = re_xrealloc (path->array, re_dfastate_t *, new_alloc);
2884 if (BE (new_array == NULL, 0))
2885 return REG_ESPACE;
2886 path->array = new_array;
2887 path->alloc = new_alloc;
2888 memset (new_array + old_alloc, '\0',
2889 sizeof (re_dfastate_t *) * (new_alloc - old_alloc));
2890 }
2891
2892 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2893
2894 /* Temporary modify MCTX. */
2895 backup_state_log = mctx->state_log;
2896 backup_cur_idx = mctx->input.cur_idx;
2897 mctx->state_log = path->array;
2898 mctx->input.cur_idx = str_idx;
2899
2900 /* Setup initial node set. */
2901 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2902 if (str_idx == top_str)
2903 {
2904 err = re_node_set_init_1 (&next_nodes, top_node);
2905 if (BE (err != REG_NOERROR, 0))
2906 return err;
2907 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2908 if (BE (err != REG_NOERROR, 0))
2909 {
2910 re_node_set_free (&next_nodes);
2911 return err;
2912 }
2913 }
2914 else
2915 {
2916 cur_state = mctx->state_log[str_idx];
2917 if (cur_state && cur_state->has_backref)
2918 {
2919 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2920 if (BE ( err != REG_NOERROR, 0))
2921 return err;
2922 }
2923 else
2924 re_node_set_init_empty (&next_nodes);
2925 }
2926 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2927 {
2928 if (next_nodes.nelem)
2929 {
2930 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2931 subexp_num, type);
2932 if (BE ( err != REG_NOERROR, 0))
2933 {
2934 re_node_set_free (&next_nodes);
2935 return err;
2936 }
2937 }
2938 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2939 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2940 {
2941 re_node_set_free (&next_nodes);
2942 return err;
2943 }
2944 mctx->state_log[str_idx] = cur_state;
2945 }
2946
2947 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2948 {
2949 re_node_set_empty (&next_nodes);
2950 if (mctx->state_log[str_idx + 1])
2951 {
2952 err = re_node_set_merge (&next_nodes,
2953 &mctx->state_log[str_idx + 1]->nodes);
2954 if (BE (err != REG_NOERROR, 0))
2955 {
2956 re_node_set_free (&next_nodes);
2957 return err;
2958 }
2959 }
2960 if (cur_state)
2961 {
2962 err = check_arrival_add_next_nodes (mctx, str_idx,
2963 &cur_state->non_eps_nodes, &next_nodes);
2964 if (BE (err != REG_NOERROR, 0))
2965 {
2966 re_node_set_free (&next_nodes);
2967 return err;
2968 }
2969 }
2970 ++str_idx;
2971 if (next_nodes.nelem)
2972 {
2973 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2974 if (BE (err != REG_NOERROR, 0))
2975 {
2976 re_node_set_free (&next_nodes);
2977 return err;
2978 }
2979 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2980 subexp_num, type);
2981 if (BE ( err != REG_NOERROR, 0))
2982 {
2983 re_node_set_free (&next_nodes);
2984 return err;
2985 }
2986 }
2987 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2988 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2989 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2990 {
2991 re_node_set_free (&next_nodes);
2992 return err;
2993 }
2994 mctx->state_log[str_idx] = cur_state;
2995 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2996 }
2997 re_node_set_free (&next_nodes);
2998 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2999 : &mctx->state_log[last_str]->nodes);
3000 path->next_idx = str_idx;
3001
3002 /* Fix MCTX. */
3003 mctx->state_log = backup_state_log;
3004 mctx->input.cur_idx = backup_cur_idx;
3005
3006 /* Then check the current node set has the node LAST_NODE. */
3007 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3008 return REG_NOERROR;
3009
3010 return REG_NOMATCH;
3011}
3012
3013/* Helper functions for check_arrival. */
3014
3015/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3016 to NEXT_NODES.
3017 TODO: This function is similar to the functions transit_state*(),
3018 however this function has many additional works.
3019 Can't we unify them? */
3020
3021static reg_errcode_t
3022internal_function
3023check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3024 re_node_set *cur_nodes,
3025 re_node_set *next_nodes)
3026{
3027 re_dfa_t *const dfa = mctx->dfa;
3028 bool ok;
3029 Idx cur_idx;
3030 reg_errcode_t err;
3031 re_node_set union_set;
3032 re_node_set_init_empty (&union_set);
3033 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3034 {
3035 int naccepted = 0;
3036 Idx cur_node = cur_nodes->elems[cur_idx];
3037#ifdef DEBUG
3038 re_token_type_t type = dfa->nodes[cur_node].type;
3039 assert (!IS_EPSILON_NODE (type));
3040#endif
3041#ifdef RE_ENABLE_I18N
3042 /* If the node may accept `multi byte'. */
3043 if (dfa->nodes[cur_node].accept_mb)
3044 {
3045 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3046 str_idx);
3047 if (naccepted > 1)
3048 {
3049 re_dfastate_t *dest_state;
3050 Idx next_node = dfa->nexts[cur_node];
3051 Idx next_idx = str_idx + naccepted;
3052 dest_state = mctx->state_log[next_idx];
3053 re_node_set_empty (&union_set);
3054 if (dest_state)
3055 {
3056 err = re_node_set_merge (&union_set, &dest_state->nodes);
3057 if (BE (err != REG_NOERROR, 0))
3058 {
3059 re_node_set_free (&union_set);
3060 return err;
3061 }
3062 }
3063 ok = re_node_set_insert (&union_set, next_node);
3064 if (BE (! ok, 0))
3065 {
3066 re_node_set_free (&union_set);
3067 return REG_ESPACE;
3068 }
3069 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3070 &union_set);
3071 if (BE (mctx->state_log[next_idx] == NULL
3072 && err != REG_NOERROR, 0))
3073 {
3074 re_node_set_free (&union_set);
3075 return err;
3076 }
3077 }
3078 }
3079#endif /* RE_ENABLE_I18N */
3080 if (naccepted
3081 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3082 {
3083 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3084 if (BE (! ok, 0))
3085 {
3086 re_node_set_free (&union_set);
3087 return REG_ESPACE;
3088 }
3089 }
3090 }
3091 re_node_set_free (&union_set);
3092 return REG_NOERROR;
3093}
3094
3095/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3096 CUR_NODES, however exclude the nodes which are:
3097 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3098 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3099*/
3100
3101static reg_errcode_t
3102internal_function
3103check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes,
3104 Idx ex_subexp, int type)
3105{
3106 reg_errcode_t err;
3107 Idx idx, outside_node;
3108 re_node_set new_nodes;
3109#ifdef DEBUG
3110 assert (cur_nodes->nelem);
3111#endif
3112 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3113 if (BE (err != REG_NOERROR, 0))
3114 return err;
3115 /* Create a new node set NEW_NODES with the nodes which are epsilon
3116 closures of the node in CUR_NODES. */
3117
3118 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3119 {
3120 Idx cur_node = cur_nodes->elems[idx];
3121 re_node_set *eclosure = dfa->eclosures + cur_node;
3122 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3123 if (outside_node == REG_MISSING)
3124 {
3125 /* There are no problematic nodes, just merge them. */
3126 err = re_node_set_merge (&new_nodes, eclosure);
3127 if (BE (err != REG_NOERROR, 0))
3128 {
3129 re_node_set_free (&new_nodes);
3130 return err;
3131 }
3132 }
3133 else
3134 {
3135 /* There are problematic nodes, re-calculate incrementally. */
3136 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3137 ex_subexp, type);
3138 if (BE (err != REG_NOERROR, 0))
3139 {
3140 re_node_set_free (&new_nodes);
3141 return err;
3142 }
3143 }
3144 }
3145 re_node_set_free (cur_nodes);
3146 *cur_nodes = new_nodes;
3147 return REG_NOERROR;
3148}
3149
3150/* Helper function for check_arrival_expand_ecl.
3151 Check incrementally the epsilon closure of TARGET, and if it isn't
3152 problematic append it to DST_NODES. */
3153
3154static reg_errcode_t
3155internal_function
3156check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes,
3157 Idx target, Idx ex_subexp, int type)
3158{
3159 Idx cur_node;
3160 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3161 {
3162 bool ok;
3163
3164 if (dfa->nodes[cur_node].type == type
3165 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3166 {
3167 if (type == OP_CLOSE_SUBEXP)
3168 {
3169 ok = re_node_set_insert (dst_nodes, cur_node);
3170 if (BE (! ok, 0))
3171 return REG_ESPACE;
3172 }
3173 break;
3174 }
3175 ok = re_node_set_insert (dst_nodes, cur_node);
3176 if (BE (! ok, 0))
3177 return REG_ESPACE;
3178 if (dfa->edests[cur_node].nelem == 0)
3179 break;
3180 if (dfa->edests[cur_node].nelem == 2)
3181 {
3182 reg_errcode_t ret =
3183 check_arrival_expand_ecl_sub (dfa, dst_nodes,
3184 dfa->edests[cur_node].elems[1],
3185 ex_subexp, type);
3186 if (BE (ret != REG_NOERROR, 0))
3187 return ret;
3188 }
3189 cur_node = dfa->edests[cur_node].elems[0];
3190 }
3191 return REG_NOERROR;
3192}
3193
3194
3195/* For all the back references in the current state, calculate the
3196 destination of the back references by the appropriate entry
3197 in MCTX->BKREF_ENTS. */
3198
3199static reg_errcode_t
3200internal_function
3201expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3202 Idx cur_str, Idx subexp_num, int type)
3203{
3204 re_dfa_t *const dfa = mctx->dfa;
3205 reg_errcode_t err;
3206 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3207 struct re_backref_cache_entry *ent;
3208
3209 if (cache_idx_start == REG_MISSING)
3210 return REG_NOERROR;
3211
3212 restart:
3213 ent = mctx->bkref_ents + cache_idx_start;
3214 do
3215 {
3216 Idx to_idx, next_node;
3217
3218 /* Is this entry ENT is appropriate? */
3219 if (!re_node_set_contains (cur_nodes, ent->node))
3220 continue; /* No. */
3221
3222 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3223 /* Calculate the destination of the back reference, and append it
3224 to MCTX->STATE_LOG. */
3225 if (to_idx == cur_str)
3226 {
3227 /* The backreference did epsilon transit, we must re-check all the
3228 node in the current state. */
3229 re_node_set new_dests;
3230 reg_errcode_t err2, err3;
3231 next_node = dfa->edests[ent->node].elems[0];
3232 if (re_node_set_contains (cur_nodes, next_node))
3233 continue;
3234 err = re_node_set_init_1 (&new_dests, next_node);
3235 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3236 err3 = re_node_set_merge (cur_nodes, &new_dests);
3237 re_node_set_free (&new_dests);
3238 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3239 || err3 != REG_NOERROR, 0))
3240 {
3241 err = (err != REG_NOERROR ? err
3242 : (err2 != REG_NOERROR ? err2 : err3));
3243 return err;
3244 }
3245 /* TODO: It is still inefficient... */
3246 goto restart;
3247 }
3248 else
3249 {
3250 re_node_set union_set;
3251 next_node = dfa->nexts[ent->node];
3252 if (mctx->state_log[to_idx])
3253 {
3254 bool ok;
3255 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3256 next_node))
3257 continue;
3258 err = re_node_set_init_copy (&union_set,
3259 &mctx->state_log[to_idx]->nodes);
3260 ok = re_node_set_insert (&union_set, next_node);
3261 if (BE (err != REG_NOERROR || ! ok, 0))
3262 {
3263 re_node_set_free (&union_set);
3264 err = err != REG_NOERROR ? err : REG_ESPACE;
3265 return err;
3266 }
3267 }
3268 else
3269 {
3270 err = re_node_set_init_1 (&union_set, next_node);
3271 if (BE (err != REG_NOERROR, 0))
3272 return err;
3273 }
3274 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3275 re_node_set_free (&union_set);
3276 if (BE (mctx->state_log[to_idx] == NULL
3277 && err != REG_NOERROR, 0))
3278 return err;
3279 }
3280 }
3281 while (ent++->more);
3282 return REG_NOERROR;
3283}
3284
3285/* Build transition table for the state.
3286 Return true if successful. */
3287
3288static bool
3289internal_function
3290build_trtable (re_dfa_t *dfa, re_dfastate_t *state)
3291{
3292 reg_errcode_t err;
3293 Idx i, j;
3294 int ch;
3295 bool need_word_trtable = false;
3296 bitset_word elem, mask;
3297 bool dests_node_malloced = false, dest_states_malloced = false;
3298 Idx ndests; /* Number of the destination states from `state'. */
3299 re_dfastate_t **trtable;
3300 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3301 re_node_set follows, *dests_node;
3302 bitset *dests_ch;
3303 bitset acceptable;
3304
3305 struct dests_alloc
3306 {
3307 re_node_set dests_node[SBC_MAX];
3308 bitset dests_ch[SBC_MAX];
3309 } *dests_alloc;
3310
3311 /* We build DFA states which corresponds to the destination nodes
3312 from `state'. `dests_node[i]' represents the nodes which i-th
3313 destination state contains, and `dests_ch[i]' represents the
3314 characters which i-th destination state accepts. */
3315 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3316 dests_alloc = (struct dests_alloc *) alloca (sizeof dests_alloc[0]);
3317 else
3318 {
3319 dests_alloc = re_malloc (struct dests_alloc, 1);
3320 if (BE (dests_alloc == NULL, 0))
3321 return false;
3322 dests_node_malloced = true;
3323 }
3324 dests_node = dests_alloc->dests_node;
3325 dests_ch = dests_alloc->dests_ch;
3326
3327 /* Initialize transiton table. */
3328 state->word_trtable = state->trtable = NULL;
3329
3330 /* At first, group all nodes belonging to `state' into several
3331 destinations. */
3332 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3333 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3334 {
3335 if (dests_node_malloced)
3336 free (dests_alloc);
3337 if (ndests == 0)
3338 {
3339 state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3340 return true;
3341 }
3342 return false;
3343 }
3344
3345 err = re_node_set_alloc (&follows, ndests + 1);
3346 if (BE (err != REG_NOERROR, 0))
3347 goto out_free;
3348
3349 /* Avoid arithmetic overflow in size calculation. */
3350 if (BE (((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)
3351 / (3 * sizeof (re_dfastate_t *)))
3352 < ndests, 0))
3353 goto out_free;
3354
3355 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3356 + ndests * 3 * sizeof (re_dfastate_t *)))
3357 dest_states = (re_dfastate_t **)
3358 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3359 else
3360 {
3361 dest_states = (re_dfastate_t **)
3362 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3363 if (BE (dest_states == NULL, 0))
3364 {
3365out_free:
3366 if (dest_states_malloced)
3367 free (dest_states);
3368 re_node_set_free (&follows);
3369 for (i = 0; i < ndests; ++i)
3370 re_node_set_free (dests_node + i);
3371 if (dests_node_malloced)
3372 free (dests_alloc);
3373 return false;
3374 }
3375 dest_states_malloced = true;
3376 }
3377 dest_states_word = dest_states + ndests;
3378 dest_states_nl = dest_states_word + ndests;
3379 bitset_empty (acceptable);
3380
3381 /* Then build the states for all destinations. */
3382 for (i = 0; i < ndests; ++i)
3383 {
3384 Idx next_node;
3385 re_node_set_empty (&follows);
3386 /* Merge the follows of this destination states. */
3387 for (j = 0; j < dests_node[i].nelem; ++j)
3388 {
3389 next_node = dfa->nexts[dests_node[i].elems[j]];
3390 if (next_node != REG_MISSING)
3391 {
3392 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3393 if (BE (err != REG_NOERROR, 0))
3394 goto out_free;
3395 }
3396 }
3397 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3398 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3399 goto out_free;
3400 /* If the new state has context constraint,
3401 build appropriate states for these contexts. */
3402 if (dest_states[i]->has_constraint)
3403 {
3404 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3405 CONTEXT_WORD);
3406 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3407 goto out_free;
3408
3409 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3410 need_word_trtable = true;
3411
3412 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3413 CONTEXT_NEWLINE);
3414 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3415 goto out_free;
3416 }
3417 else
3418 {
3419 dest_states_word[i] = dest_states[i];
3420 dest_states_nl[i] = dest_states[i];
3421 }
3422 bitset_merge (acceptable, dests_ch[i]);
3423 }
3424
3425 if (!BE (need_word_trtable, 0))
3426 {
3427 /* We don't care about whether the following character is a word
3428 character, or we are in a single-byte character set so we can
3429 discern by looking at the character code: allocate a
3430 256-entry transition table. */
3431 trtable = state->trtable = re_calloc (re_dfastate_t *, SBC_MAX);
3432 if (BE (trtable == NULL, 0))
3433 goto out_free;
3434
3435 /* For all characters ch...: */
3436 for (i = 0; i < BITSET_WORDS; ++i)
3437 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3438 elem;
3439 mask <<= 1, elem >>= 1, ++ch)
3440 if (BE (elem & 1, 0))
3441 {
3442 /* There must be exactly one destination which accepts
3443 character ch. See group_nodes_into_DFAstates. */
3444 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3445 ;
3446
3447 /* j-th destination accepts the word character ch. */
3448 if (dfa->word_char[i] & mask)
3449 trtable[ch] = dest_states_word[j];
3450 else
3451 trtable[ch] = dest_states[j];
3452 }
3453 }
3454 else
3455 {
3456 /* We care about whether the following character is a word
3457 character, and we are in a multi-byte character set: discern
3458 by looking at the character code: build two 256-entry
3459 transition tables, one starting at trtable[0] and one
3460 starting at trtable[SBC_MAX]. */
3461 trtable = state->word_trtable = re_calloc (re_dfastate_t *, 2 * SBC_MAX);
3462 if (BE (trtable == NULL, 0))
3463 goto out_free;
3464
3465 /* For all characters ch...: */
3466 for (i = 0; i < BITSET_WORDS; ++i)
3467 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3468 elem;
3469 mask <<= 1, elem >>= 1, ++ch)
3470 if (BE (elem & 1, 0))
3471 {
3472 /* There must be exactly one destination which accepts
3473 character ch. See group_nodes_into_DFAstates. */
3474 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3475 ;
3476
3477 /* j-th destination accepts the word character ch. */
3478 trtable[ch] = dest_states[j];
3479 trtable[ch + SBC_MAX] = dest_states_word[j];
3480 }
3481 }
3482
3483 /* new line */
3484 if (bitset_contain (acceptable, NEWLINE_CHAR))
3485 {
3486 /* The current state accepts newline character. */
3487 for (j = 0; j < ndests; ++j)
3488 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3489 {
3490 /* k-th destination accepts newline character. */
3491 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3492 if (need_word_trtable)
3493 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3494 /* There must be only one destination which accepts
3495 newline. See group_nodes_into_DFAstates. */
3496 break;
3497 }
3498 }
3499
3500 if (dest_states_malloced)
3501 free (dest_states);
3502
3503 re_node_set_free (&follows);
3504 for (i = 0; i < ndests; ++i)
3505 re_node_set_free (dests_node + i);
3506
3507 if (dests_node_malloced)
3508 free (dests_alloc);
3509
3510 return true;
3511}
3512
3513/* Group all nodes belonging to STATE into several destinations.
3514 Then for all destinations, set the nodes belonging to the destination
3515 to DESTS_NODE[i] and set the characters accepted by the destination
3516 to DEST_CH[i]. This function return the number of destinations. */
3517
3518static Idx
3519internal_function
3520group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3521 re_node_set *dests_node, bitset *dests_ch)
3522{
3523 reg_errcode_t err;
3524 bool ok;
3525 Idx i, j, k;
3526 Idx ndests; /* Number of the destinations from `state'. */
3527 bitset accepts; /* Characters a node can accept. */
3528 const re_node_set *cur_nodes = &state->nodes;
3529 bitset_empty (accepts);
3530 ndests = 0;
3531
3532 /* For all the nodes belonging to `state', */
3533 for (i = 0; i < cur_nodes->nelem; ++i)
3534 {
3535 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3536 re_token_type_t type = node->type;
3537 unsigned int constraint = node->constraint;
3538
3539 /* Enumerate all single byte character this node can accept. */
3540 if (type == CHARACTER)
3541 bitset_set (accepts, node->opr.c);
3542 else if (type == SIMPLE_BRACKET)
3543 {
3544 bitset_merge (accepts, node->opr.sbcset);
3545 }
3546 else if (type == OP_PERIOD)
3547 {
3548#ifdef RE_ENABLE_I18N
3549 if (dfa->mb_cur_max > 1)
3550 bitset_merge (accepts, dfa->sb_char);
3551 else
3552#endif
3553 bitset_set_all (accepts);
3554 if (!(dfa->syntax & REG_DOT_NEWLINE))
3555 bitset_clear (accepts, '\n');
3556 if (dfa->syntax & REG_DOT_NOT_NULL)
3557 bitset_clear (accepts, '\0');
3558 }
3559#ifdef RE_ENABLE_I18N
3560 else if (type == OP_UTF8_PERIOD)
3561 {
3562 if (SBC_MAX / 2 % BITSET_WORD_BITS == 0)
3563 memset (accepts, -1, sizeof accepts / 2);
3564 else
3565 bitset_merge (accepts, utf8_sb_map);
3566 if (!(dfa->syntax & REG_DOT_NEWLINE))
3567 bitset_clear (accepts, '\n');
3568 if (dfa->syntax & REG_DOT_NOT_NULL)
3569 bitset_clear (accepts, '\0');
3570 }
3571#endif
3572 else
3573 continue;
3574
3575 /* Check the `accepts' and sift the characters which are not
3576 match it the context. */
3577 if (constraint)
3578 {
3579 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3580 {
3581 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3582 bitset_empty (accepts);
3583 if (accepts_newline)
3584 bitset_set (accepts, NEWLINE_CHAR);
3585 else
3586 continue;
3587 }
3588 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3589 {
3590 bitset_empty (accepts);
3591 continue;
3592 }
3593
3594 if (constraint & NEXT_WORD_CONSTRAINT)
3595 {
3596 bitset_word any_set = 0;
3597 if (type == CHARACTER && !node->word_char)
3598 {
3599 bitset_empty (accepts);
3600 continue;
3601 }
3602#ifdef RE_ENABLE_I18N
3603 if (dfa->mb_cur_max > 1)
3604 for (j = 0; j < BITSET_WORDS; ++j)
3605 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3606 else
3607#endif
3608 for (j = 0; j < BITSET_WORDS; ++j)
3609 any_set |= (accepts[j] &= dfa->word_char[j]);
3610 if (!any_set)
3611 continue;
3612 }
3613 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3614 {
3615 bitset_word any_set = 0;
3616 if (type == CHARACTER && node->word_char)
3617 {
3618 bitset_empty (accepts);
3619 continue;
3620 }
3621#ifdef RE_ENABLE_I18N
3622 if (dfa->mb_cur_max > 1)
3623 for (j = 0; j < BITSET_WORDS; ++j)
3624 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3625 else
3626#endif
3627 for (j = 0; j < BITSET_WORDS; ++j)
3628 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3629 if (!any_set)
3630 continue;
3631 }
3632 }
3633
3634 /* Then divide `accepts' into DFA states, or create a new
3635 state. Above, we make sure that accepts is not empty. */
3636 for (j = 0; j < ndests; ++j)
3637 {
3638 bitset intersec; /* Intersection sets, see below. */
3639 bitset remains;
3640 /* Flags, see below. */
3641 bitset_word has_intersec, not_subset, not_consumed;
3642
3643 /* Optimization, skip if this state doesn't accept the character. */
3644 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3645 continue;
3646
3647 /* Enumerate the intersection set of this state and `accepts'. */
3648 has_intersec = 0;
3649 for (k = 0; k < BITSET_WORDS; ++k)
3650 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3651 /* And skip if the intersection set is empty. */
3652 if (!has_intersec)
3653 continue;
3654
3655 /* Then check if this state is a subset of `accepts'. */
3656 not_subset = not_consumed = 0;
3657 for (k = 0; k < BITSET_WORDS; ++k)
3658 {
3659 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3660 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3661 }
3662
3663 /* If this state isn't a subset of `accepts', create a
3664 new group state, which has the `remains'. */
3665 if (not_subset)
3666 {
3667 bitset_copy (dests_ch[ndests], remains);
3668 bitset_copy (dests_ch[j], intersec);
3669 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3670 if (BE (err != REG_NOERROR, 0))
3671 goto error_return;
3672 ++ndests;
3673 }
3674
3675 /* Put the position in the current group. */
3676 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3677 if (BE (! ok, 0))
3678 goto error_return;
3679
3680 /* If all characters are consumed, go to next node. */
3681 if (!not_consumed)
3682 break;
3683 }
3684 /* Some characters remain, create a new group. */
3685 if (j == ndests)
3686 {
3687 bitset_copy (dests_ch[ndests], accepts);
3688 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3689 if (BE (err != REG_NOERROR, 0))
3690 goto error_return;
3691 ++ndests;
3692 bitset_empty (accepts);
3693 }
3694 }
3695 return ndests;
3696 error_return:
3697 for (j = 0; j < ndests; ++j)
3698 re_node_set_free (dests_node + j);
3699 return REG_MISSING;
3700}
3701
3702#ifdef RE_ENABLE_I18N
3703/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3704 Return the number of the bytes the node accepts.
3705 STR_IDX is the current index of the input string.
3706
3707 This function handles the nodes which can accept one character, or
3708 one collating element like '.', '[a-z]', opposite to the other nodes
3709 can only accept one byte. */
3710
3711static int
3712internal_function
3713check_node_accept_bytes (re_dfa_t *dfa, Idx node_idx,
3714 const re_string_t *input, Idx str_idx)
3715{
3716 const re_token_t *node = dfa->nodes + node_idx;
3717 int char_len, elem_len;
3718 Idx i;
3719
3720 if (BE (node->type == OP_UTF8_PERIOD, 0))
3721 {
3722 unsigned char c = re_string_byte_at (input, str_idx), d;
3723 if (BE (c < 0xc2, 1))
3724 return 0;
3725
3726 if (str_idx + 2 > input->len)
3727 return 0;
3728
3729 d = re_string_byte_at (input, str_idx + 1);
3730 if (c < 0xe0)
3731 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3732 else if (c < 0xf0)
3733 {
3734 char_len = 3;
3735 if (c == 0xe0 && d < 0xa0)
3736 return 0;
3737 }
3738 else if (c < 0xf8)
3739 {
3740 char_len = 4;
3741 if (c == 0xf0 && d < 0x90)
3742 return 0;
3743 }
3744 else if (c < 0xfc)
3745 {
3746 char_len = 5;
3747 if (c == 0xf8 && d < 0x88)
3748 return 0;
3749 }
3750 else if (c < 0xfe)
3751 {
3752 char_len = 6;
3753 if (c == 0xfc && d < 0x84)
3754 return 0;
3755 }
3756 else
3757 return 0;
3758
3759 if (str_idx + char_len > input->len)
3760 return 0;
3761
3762 for (i = 1; i < char_len; ++i)
3763 {
3764 d = re_string_byte_at (input, str_idx + i);
3765 if (d < 0x80 || d > 0xbf)
3766 return 0;
3767 }
3768 return char_len;
3769 }
3770
3771 char_len = re_string_char_size_at (input, str_idx);
3772 if (node->type == OP_PERIOD)
3773 {
3774 if (char_len <= 1)
3775 return 0;
3776 /* FIXME: I don't think this if is needed, as both '\n'
3777 and '\0' are char_len == 1. */
3778 /* '.' accepts any one character except the following two cases. */
3779 if ((!(dfa->syntax & REG_DOT_NEWLINE) &&
3780 re_string_byte_at (input, str_idx) == '\n') ||
3781 ((dfa->syntax & REG_DOT_NOT_NULL) &&
3782 re_string_byte_at (input, str_idx) == '\0'))
3783 return 0;
3784 return char_len;
3785 }
3786
3787 elem_len = re_string_elem_size_at (input, str_idx);
3788 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3789 return 0;
3790
3791 if (node->type == COMPLEX_BRACKET)
3792 {
3793 const re_charset_t *cset = node->opr.mbcset;
3794# ifdef _LIBC
3795 const unsigned char *pin
3796 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3797 Idx j;
3798 uint32_t nrules;
3799# endif /* _LIBC */
3800 int match_len = 0;
3801 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3802 ? re_string_wchar_at (input, str_idx) : 0);
3803
3804 /* match with multibyte character? */
3805 for (i = 0; i < cset->nmbchars; ++i)
3806 if (wc == cset->mbchars[i])
3807 {
3808 match_len = char_len;
3809 goto check_node_accept_bytes_match;
3810 }
3811 /* match with character_class? */
3812 for (i = 0; i < cset->nchar_classes; ++i)
3813 {
3814 wctype_t wt = cset->char_classes[i];
3815 if (__iswctype (wc, wt))
3816 {
3817 match_len = char_len;
3818 goto check_node_accept_bytes_match;
3819 }
3820 }
3821
3822# ifdef _LIBC
3823 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3824 if (nrules != 0)
3825 {
3826 unsigned int in_collseq = 0;
3827 const int32_t *table, *indirect;
3828 const unsigned char *weights, *extra;
3829 const char *collseqwc;
3830 int32_t idx;
3831 /* This #include defines a local function! */
3832# include <locale/weight.h>
3833
3834 /* match with collating_symbol? */
3835 if (cset->ncoll_syms)
3836 extra = (const unsigned char *)
3837 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3838 for (i = 0; i < cset->ncoll_syms; ++i)
3839 {
3840 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3841 /* Compare the length of input collating element and
3842 the length of current collating element. */
3843 if (*coll_sym != elem_len)
3844 continue;
3845 /* Compare each bytes. */
3846 for (j = 0; j < *coll_sym; j++)
3847 if (pin[j] != coll_sym[1 + j])
3848 break;
3849 if (j == *coll_sym)
3850 {
3851 /* Match if every bytes is equal. */
3852 match_len = j;
3853 goto check_node_accept_bytes_match;
3854 }
3855 }
3856
3857 if (cset->nranges)
3858 {
3859 if (elem_len <= char_len)
3860 {
3861 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3862 in_collseq = __collseq_table_lookup (collseqwc, wc);
3863 }
3864 else
3865 in_collseq = find_collation_sequence_value (pin, elem_len);
3866 }
3867 /* match with range expression? */
3868 for (i = 0; i < cset->nranges; ++i)
3869 if (cset->range_starts[i] <= in_collseq
3870 && in_collseq <= cset->range_ends[i])
3871 {
3872 match_len = elem_len;
3873 goto check_node_accept_bytes_match;
3874 }
3875
3876 /* match with equivalence_class? */
3877 if (cset->nequiv_classes)
3878 {
3879 const unsigned char *cp = pin;
3880 table = (const int32_t *)
3881 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3882 weights = (const unsigned char *)
3883 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3884 extra = (const unsigned char *)
3885 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3886 indirect = (const int32_t *)
3887 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3888 idx = findidx (&cp);
3889 if (idx > 0)
3890 for (i = 0; i < cset->nequiv_classes; ++i)
3891 {
3892 int32_t equiv_class_idx = cset->equiv_classes[i];
3893 size_t weight_len = weights[idx];
3894 if (weight_len == weights[equiv_class_idx])
3895 {
3896 Idx cnt = 0;
3897 while (cnt <= weight_len
3898 && (weights[equiv_class_idx + 1 + cnt]
3899 == weights[idx + 1 + cnt]))
3900 ++cnt;
3901 if (cnt > weight_len)
3902 {
3903 match_len = elem_len;
3904 goto check_node_accept_bytes_match;
3905 }
3906 }
3907 }
3908 }
3909 }
3910 else
3911# endif /* _LIBC */
3912 {
3913 /* match with range expression? */
3914#if __GNUC__ >= 2
3915 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3916#else
3917 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3918 cmp_buf[2] = wc;
3919#endif
3920 for (i = 0; i < cset->nranges; ++i)
3921 {
3922 cmp_buf[0] = cset->range_starts[i];
3923 cmp_buf[4] = cset->range_ends[i];
3924 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3925 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3926 {
3927 match_len = char_len;
3928 goto check_node_accept_bytes_match;
3929 }
3930 }
3931 }
3932 check_node_accept_bytes_match:
3933 if (!cset->non_match)
3934 return match_len;
3935 else
3936 {
3937 if (match_len > 0)
3938 return 0;
3939 else
3940 return (elem_len > char_len) ? elem_len : char_len;
3941 }
3942 }
3943 return 0;
3944}
3945
3946# ifdef _LIBC
3947static unsigned int
3948find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3949{
3950 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3951 if (nrules == 0)
3952 {
3953 if (mbs_len == 1)
3954 {
3955 /* No valid character. Match it as a single byte character. */
3956 const unsigned char *collseq = (const unsigned char *)
3957 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3958 return collseq[mbs[0]];
3959 }
3960 return UINT_MAX;
3961 }
3962 else
3963 {
3964 int32_t idx;
3965 const unsigned char *extra = (const unsigned char *)
3966 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3967 int32_t extrasize = (const unsigned char *)
3968 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3969
3970 for (idx = 0; idx < extrasize;)
3971 {
3972 int mbs_cnt;
3973 bool found = false;
3974 int32_t elem_mbs_len;
3975 /* Skip the name of collating element name. */
3976 idx = idx + extra[idx] + 1;
3977 elem_mbs_len = extra[idx++];
3978 if (mbs_len == elem_mbs_len)
3979 {
3980 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3981 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3982 break;
3983 if (mbs_cnt == elem_mbs_len)
3984 /* Found the entry. */
3985 found = true;
3986 }
3987 /* Skip the byte sequence of the collating element. */
3988 idx += elem_mbs_len;
3989 /* Adjust for the alignment. */
3990 idx = (idx + 3) & ~3;
3991 /* Skip the collation sequence value. */
3992 idx += sizeof (uint32_t);
3993 /* Skip the wide char sequence of the collating element. */
3994 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3995 /* If we found the entry, return the sequence value. */
3996 if (found)
3997 return *(uint32_t *) (extra + idx);
3998 /* Skip the collation sequence value. */
3999 idx += sizeof (uint32_t);
4000 }
4001 return UINT_MAX;
4002 }
4003}
4004# endif /* _LIBC */
4005#endif /* RE_ENABLE_I18N */
4006
4007/* Check whether the node accepts the byte which is IDX-th
4008 byte of the INPUT. */
4009
4010static bool
4011internal_function
4012check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4013 Idx idx)
4014{
4015 unsigned char ch;
4016 ch = re_string_byte_at (&mctx->input, idx);
4017 switch (node->type)
4018 {
4019 case CHARACTER:
4020 if (node->opr.c != ch)
4021 return false;
4022 break;
4023
4024 case SIMPLE_BRACKET:
4025 if (!bitset_contain (node->opr.sbcset, ch))
4026 return false;
4027 break;
4028
4029#ifdef RE_ENABLE_I18N
4030 case OP_UTF8_PERIOD:
4031 if (ch >= 0x80)
4032 return false;
4033 /* FALLTHROUGH */
4034#endif
4035 case OP_PERIOD:
4036 if ((ch == '\n' && !(mctx->dfa->syntax & REG_DOT_NEWLINE))
4037 || (ch == '\0' && (mctx->dfa->syntax & REG_DOT_NOT_NULL)))
4038 return false;
4039 break;
4040
4041 default:
4042 return false;
4043 }
4044
4045 if (node->constraint)
4046 {
4047 /* The node has constraints. Check whether the current context
4048 satisfies the constraints. */
4049 unsigned int context = re_string_context_at (&mctx->input, idx,
4050 mctx->eflags);
4051 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4052 return false;
4053 }
4054
4055 return true;
4056}
4057
4058/* Extend the buffers, if the buffers have run out. */
4059
4060static reg_errcode_t
4061internal_function
4062extend_buffers (re_match_context_t *mctx)
4063{
4064 reg_errcode_t ret;
4065 re_string_t *pstr = &mctx->input;
4066
4067 /* Double the lengthes of the buffers. */
4068 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4069 if (BE (ret != REG_NOERROR, 0))
4070 return ret;
4071
4072 if (mctx->state_log != NULL)
4073 {
4074 /* And double the length of state_log. */
4075 /* XXX We have no indication of the size of this buffer. If this
4076 allocation fail we have no indication that the state_log array
4077 does not have the right size. */
4078 re_dfastate_t **new_array = re_xrealloc (mctx->state_log, re_dfastate_t *,
4079 pstr->bufs_len + 1);
4080 if (BE (new_array == NULL, 0))
4081 return REG_ESPACE;
4082 mctx->state_log = new_array;
4083 }
4084
4085 /* Then reconstruct the buffers. */
4086 if (pstr->icase)
4087 {
4088#ifdef RE_ENABLE_I18N
4089 if (pstr->mb_cur_max > 1)
4090 {
4091 ret = build_wcs_upper_buffer (pstr);
4092 if (BE (ret != REG_NOERROR, 0))
4093 return ret;
4094 }
4095 else
4096#endif /* RE_ENABLE_I18N */
4097 build_upper_buffer (pstr);
4098 }
4099 else
4100 {
4101#ifdef RE_ENABLE_I18N
4102 if (pstr->mb_cur_max > 1)
4103 build_wcs_buffer (pstr);
4104 else
4105#endif /* RE_ENABLE_I18N */
4106 {
4107 if (pstr->trans != NULL)
4108 re_string_translate_buffer (pstr);
4109 }
4110 }
4111 return REG_NOERROR;
4112}
4113
4114
4115/* Functions for matching context. */
4116
4117/* Initialize MCTX. */
4118
4119static reg_errcode_t
4120internal_function
4121match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4122{
4123 mctx->eflags = eflags;
4124 mctx->match_last = REG_MISSING;
4125 if (n > 0)
4126 {
4127 mctx->bkref_ents = re_xmalloc (struct re_backref_cache_entry, n);
4128 mctx->sub_tops = re_xmalloc (re_sub_match_top_t *, n);
4129 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4130 return REG_ESPACE;
4131 }
4132 /* Already zero-ed by the caller.
4133 else
4134 mctx->bkref_ents = NULL;
4135 mctx->nbkref_ents = 0;
4136 mctx->nsub_tops = 0; */
4137 mctx->abkref_ents = n;
4138 mctx->max_mb_elem_len = 1;
4139 mctx->asub_tops = n;
4140 return REG_NOERROR;
4141}
4142
4143/* Clean the entries which depend on the current input in MCTX.
4144 This function must be invoked when the matcher changes the start index
4145 of the input, or changes the input string. */
4146
4147static void
4148internal_function
4149match_ctx_clean (re_match_context_t *mctx)
4150{
4151 Idx st_idx;
4152 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4153 {
4154 Idx sl_idx;
4155 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4156 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4157 {
4158 re_sub_match_last_t *last = top->lasts[sl_idx];
4159 re_free (last->path.array);
4160 re_free (last);
4161 }
4162 re_free (top->lasts);
4163 if (top->path)
4164 {
4165 re_free (top->path->array);
4166 re_free (top->path);
4167 }
4168 free (top);
4169 }
4170
4171 mctx->nsub_tops = 0;
4172 mctx->nbkref_ents = 0;
4173}
4174
4175/* Free all the memory associated with MCTX. */
4176
4177static void
4178internal_function
4179match_ctx_free (re_match_context_t *mctx)
4180{
4181 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4182 match_ctx_clean (mctx);
4183 re_free (mctx->sub_tops);
4184 re_free (mctx->bkref_ents);
4185}
4186
4187/* Add a new backreference entry to MCTX.
4188 Note that we assume that caller never call this function with duplicate
4189 entry, and call with STR_IDX which isn't smaller than any existing entry.
4190*/
4191
4192static reg_errcode_t
4193internal_function
4194match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx,
4195 Idx from, Idx to)
4196{
4197 if (mctx->nbkref_ents >= mctx->abkref_ents)
4198 {
4199 struct re_backref_cache_entry* new_entry;
4200 new_entry = re_x2realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4201 &mctx->abkref_ents);
4202 if (BE (new_entry == NULL, 0))
4203 {
4204 re_free (mctx->bkref_ents);
4205 return REG_ESPACE;
4206 }
4207 mctx->bkref_ents = new_entry;
4208 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4209 (sizeof (struct re_backref_cache_entry)
4210 * (mctx->abkref_ents - mctx->nbkref_ents)));
4211 }
4212 if (mctx->nbkref_ents > 0
4213 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4214 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4215
4216 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4217 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4218 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4219 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4220
4221 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4222 If bit N is clear, means that this entry won't epsilon-transition to
4223 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4224 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4225 such node.
4226
4227 A backreference does not epsilon-transition unless it is empty, so set
4228 to all zeros if FROM != TO. */
4229 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4230 = (from == to ? -1 : 0);
4231
4232 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4233 if (mctx->max_mb_elem_len < to - from)
4234 mctx->max_mb_elem_len = to - from;
4235 return REG_NOERROR;
4236}
4237
4238/* Return the first entry with the same str_idx, or REG_MISSING if none is
4239 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4240
4241static Idx
4242internal_function
4243search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4244{
4245 Idx left, right, mid, last;
4246 last = right = mctx->nbkref_ents;
4247 for (left = 0; left < right;)
4248 {
4249 mid = (left + right) / 2;
4250 if (mctx->bkref_ents[mid].str_idx < str_idx)
4251 left = mid + 1;
4252 else
4253 right = mid;
4254 }
4255 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4256 return left;
4257 else
4258 return REG_MISSING;
4259}
4260
4261/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4262 at STR_IDX. */
4263
4264static reg_errcode_t
4265internal_function
4266match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4267{
4268#ifdef DEBUG
4269 assert (mctx->sub_tops != NULL);
4270 assert (mctx->asub_tops > 0);
4271#endif
4272 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4273 {
4274 Idx new_asub_tops = mctx->asub_tops;
4275 re_sub_match_top_t **new_array = re_x2realloc (mctx->sub_tops,
4276 re_sub_match_top_t *,
4277 &new_asub_tops);
4278 if (BE (new_array == NULL, 0))
4279 return REG_ESPACE;
4280 mctx->sub_tops = new_array;
4281 mctx->asub_tops = new_asub_tops;
4282 }
4283 mctx->sub_tops[mctx->nsub_tops] = re_calloc (re_sub_match_top_t, 1);
4284 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4285 return REG_ESPACE;
4286 mctx->sub_tops[mctx->nsub_tops]->node = node;
4287 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4288 return REG_NOERROR;
4289}
4290
4291/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4292 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4293
4294static re_sub_match_last_t *
4295internal_function
4296match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4297{
4298 re_sub_match_last_t *new_entry;
4299 if (BE (subtop->nlasts == subtop->alasts, 0))
4300 {
4301 Idx new_alasts = subtop->alasts;
4302 re_sub_match_last_t **new_array = re_x2realloc (subtop->lasts,
4303 re_sub_match_last_t *,
4304 &new_alasts);
4305 if (BE (new_array == NULL, 0))
4306 return NULL;
4307 subtop->lasts = new_array;
4308 subtop->alasts = new_alasts;
4309 }
4310 new_entry = re_calloc (re_sub_match_last_t, 1);
4311 if (BE (new_entry != NULL, 1))
4312 {
4313 subtop->lasts[subtop->nlasts] = new_entry;
4314 new_entry->node = node;
4315 new_entry->str_idx = str_idx;
4316 ++subtop->nlasts;
4317 }
4318 return new_entry;
4319}
4320
4321static void
4322internal_function
4323sift_ctx_init (re_sift_context_t *sctx,
4324 re_dfastate_t **sifted_sts,
4325 re_dfastate_t **limited_sts,
4326 Idx last_node, Idx last_str_idx)
4327{
4328 sctx->sifted_states = sifted_sts;
4329 sctx->limited_states = limited_sts;
4330 sctx->last_node = last_node;
4331 sctx->last_str_idx = last_str_idx;
4332 re_node_set_init_empty (&sctx->limits);
4333}
diff --git a/lib/strcase.h b/lib/strcase.h
new file mode 100644
index 0000000..e420798
--- /dev/null
+++ b/lib/strcase.h
@@ -0,0 +1,48 @@
1/* Case-insensitive string comparison functions.
2 Copyright (C) 1995-1996, 2001, 2003, 2005 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18#ifndef _STRCASE_H
19#define _STRCASE_H
20
21#include <stddef.h>
22
23
24#ifdef __cplusplus
25extern "C" {
26#endif
27
28
29/* Compare strings S1 and S2, ignoring case, returning less than, equal to or
30 greater than zero if S1 is lexicographically less than, equal to or greater
31 than S2.
32 Note: This function may, in multibyte locales, return 0 for strings of
33 different lengths! */
34extern int strcasecmp (const char *s1, const char *s2);
35
36/* Compare no more than N characters of strings S1 and S2, ignoring case,
37 returning less than, equal to or greater than zero if S1 is
38 lexicographically less than, equal to or greater than S2.
39 Note: This function cannot work correctly in multibyte locales. */
40extern int strncasecmp (const char *s1, const char *s2, size_t n);
41
42
43#ifdef __cplusplus
44}
45#endif
46
47
48#endif /* _STRCASE_H */