From 868beb15ae02c352005a2df8857f4ebb9bd758fd Mon Sep 17 00:00:00 2001 From: RincewindsHat <12514511+RincewindsHat@users.noreply.github.com> Date: Tue, 23 Aug 2022 19:42:51 +0200 Subject: Sync with the latest Gnulib code (d27c820595) --- gl/regcomp.c | 1364 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 615 insertions(+), 749 deletions(-) (limited to 'gl/regcomp.c') diff --git a/gl/regcomp.c b/gl/regcomp.c index f0b2e522..122c3de5 100644 --- a/gl/regcomp.c +++ b/gl/regcomp.c @@ -1,21 +1,25 @@ /* Extended regular expression matching and search library. - Copyright (C) 2002-2013 Free Software Foundation, Inc. + Copyright (C) 2002-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa . The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public + modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. + version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + Lesser General Public License for more details. - You should have received a copy of the GNU General Public + You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see - . */ + . 
*/ + +#ifdef _LIBC +# include <locale/weight.h> +#endif static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, size_t length, reg_syntax_t syntax); @@ -23,14 +27,10 @@ static void re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap); static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); -#ifdef RE_ENABLE_I18N static void free_charset (re_charset_t *cset); -#endif /* RE_ENABLE_I18N */ static void free_workarea_compile (regex_t *preg); static reg_errcode_t create_initial_state (re_dfa_t *dfa); -#ifdef RE_ENABLE_I18N static void optimize_utf8 (re_dfa_t *dfa); -#endif static reg_errcode_t analyze (regex_t *preg); static reg_errcode_t preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), @@ -55,7 +55,7 @@ static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); static Idx fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax); static int peek_token (re_token_t *token, re_string_t *input, - reg_syntax_t syntax) internal_function; + reg_syntax_t syntax); static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, reg_errcode_t *err); static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, @@ -85,7 +85,6 @@ static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token); -#ifdef RE_ENABLE_I18N static reg_errcode_t build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, Idx *equiv_class_alloc, @@ -96,14 +95,6 @@ static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, Idx *char_class_alloc, const char *class_name, reg_syntax_t syntax); -#else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_equiv_class (bitset_t sbcset, - const unsigned char *name); -static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, - bitset_t sbcset, - const char *class_name, - reg_syntax_t syntax); -#endif /* not RE_ENABLE_I18N */ static bin_tree_t 
*build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, const char *class_name, @@ -149,9 +140,9 @@ static const char __re_error_msgid[] = gettext_noop ("Invalid back reference") /* REG_ESUBREG */ "\0" #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") - gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ + gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */ "\0" -#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") +#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=") gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ "\0" #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") @@ -209,17 +200,9 @@ static const size_t __re_error_msgid_idx[] = Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields are set in BUFP on entry. */ -#ifdef _LIBC -const char * -re_compile_pattern (pattern, length, bufp) - const char *pattern; - size_t length; - struct re_pattern_buffer *bufp; -#else /* size_t might promote */ const char * re_compile_pattern (const char *pattern, size_t length, struct re_pattern_buffer *bufp) -#endif { reg_errcode_t ret; @@ -237,9 +220,7 @@ re_compile_pattern (const char *pattern, size_t length, return NULL; return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); } -#ifdef _LIBC weak_alias (__re_compile_pattern, re_compile_pattern) -#endif /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own @@ -257,21 +238,17 @@ reg_syntax_t re_syntax_options; defined in regex.h. We return the old syntax. 
*/ reg_syntax_t -re_set_syntax (syntax) - reg_syntax_t syntax; +re_set_syntax (reg_syntax_t syntax) { reg_syntax_t ret = re_syntax_options; re_syntax_options = syntax; return ret; } -#ifdef _LIBC weak_alias (__re_set_syntax, re_set_syntax) -#endif int -re_compile_fastmap (bufp) - struct re_pattern_buffer *bufp; +re_compile_fastmap (struct re_pattern_buffer *bufp) { re_dfa_t *dfa = bufp->buffer; char *fastmap = bufp->fastmap; @@ -287,12 +264,9 @@ re_compile_fastmap (bufp) bufp->fastmap_accurate = 1; return 0; } -#ifdef _LIBC weak_alias (__re_compile_fastmap, re_compile_fastmap) -#endif -static inline void -__attribute__ ((always_inline)) +static __always_inline void re_set_fastmap (char *fastmap, bool icase, int ch) { fastmap[ch] = 1; @@ -318,7 +292,6 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, if (type == CHARACTER) { re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); -#ifdef RE_ENABLE_I18N if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) { unsigned char buf[MB_LEN_MAX]; @@ -335,11 +308,10 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, memset (&state, '\0', sizeof (state)); if (__mbrtowc (&wc, (const char *) buf, p - buf, &state) == p - buf - && (__wcrtomb ((char *) buf, towlower (wc), &state) + && (__wcrtomb ((char *) buf, __towlower (wc), &state) != (size_t) -1)) re_set_fastmap (fastmap, false, buf[0]); } -#endif } else if (type == SIMPLE_BRACKET) { @@ -353,13 +325,12 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, re_set_fastmap (fastmap, icase, ch); } } -#ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) { re_charset_t *cset = dfa->nodes[node].opr.mbcset; Idx i; -# ifdef _LIBC +#ifdef _LIBC /* See if we have to try all bytes which start multiple collation elements. e.g. 
In da_DK, we want to catch 'a' since "aa" is a valid @@ -375,7 +346,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, if (table[i] < 0) re_set_fastmap (fastmap, icase, i); } -# endif /* _LIBC */ +#endif /* _LIBC */ /* See if we have to start the match at all multibyte characters, i.e. where we would not find an invalid sequence. This only @@ -383,9 +354,9 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, sets, the SIMPLE_BRACKET again suffices. */ if (dfa->mb_cur_max > 1 && (cset->nchar_classes || cset->non_match || cset->nranges -# ifdef _LIBC +#ifdef _LIBC || cset->nequiv_classes -# endif /* _LIBC */ +#endif /* _LIBC */ )) { unsigned char c = 0; @@ -411,19 +382,14 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, re_set_fastmap (fastmap, icase, *(unsigned char *) buf); if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) { - if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state) != (size_t) -1) re_set_fastmap (fastmap, false, *(unsigned char *) buf); } } } } -#endif /* RE_ENABLE_I18N */ - else if (type == OP_PERIOD -#ifdef RE_ENABLE_I18N - || type == OP_UTF8_PERIOD -#endif /* RE_ENABLE_I18N */ - || type == END_OF_RE) + else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE) { memset (fastmap, '\1', sizeof (char) * SBC_MAX); if (type == END_OF_RE) @@ -470,10 +436,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, the return codes and their meanings.) */ int -regcomp (preg, pattern, cflags) - regex_t *_Restrict_ preg; - const char *_Restrict_ pattern; - int cflags; +regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags) { reg_errcode_t ret; reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED @@ -485,7 +448,7 @@ regcomp (preg, pattern, cflags) /* Try to allocate space for the fastmap. 
*/ preg->fastmap = re_malloc (char, SBC_MAX); - if (BE (preg->fastmap == NULL, 0)) + if (__glibc_unlikely (preg->fastmap == NULL)) return REG_ESPACE; syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; @@ -511,7 +474,7 @@ regcomp (preg, pattern, cflags) ret = REG_EPAREN; /* We have already checked preg->fastmap != NULL. */ - if (BE (ret == REG_NOERROR, 1)) + if (__glibc_likely (ret == REG_NOERROR)) /* Compute the fastmap now, since regexec cannot modify the pattern buffer. This function never fails in this implementation. */ (void) re_compile_fastmap (preg); @@ -524,32 +487,21 @@ regcomp (preg, pattern, cflags) return (int) ret; } -#ifdef _LIBC +libc_hidden_def (__regcomp) weak_alias (__regcomp, regcomp) -#endif /* Returns a message corresponding to an error code, ERRCODE, returned from either regcomp or regexec. We don't use PREG here. */ -#ifdef _LIBC -size_t -regerror (errcode, preg, errbuf, errbuf_size) - int errcode; - const regex_t *_Restrict_ preg; - char *_Restrict_ errbuf; - size_t errbuf_size; -#else /* size_t might promote */ size_t -regerror (int errcode, const regex_t *_Restrict_ preg, - char *_Restrict_ errbuf, size_t errbuf_size) -#endif +regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf, + size_t errbuf_size) { const char *msg; size_t msg_size; + int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0]; - if (BE (errcode < 0 - || errcode >= (int) (sizeof (__re_error_msgid_idx) - / sizeof (__re_error_msgid_idx[0])), 0)) + if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes)) /* Only error codes returned by the rest of the code should be passed to this routine. If we are given anything else, or if other regex code generates an invalid error code, then the program has a bug. @@ -560,10 +512,10 @@ regerror (int errcode, const regex_t *_Restrict_ preg, msg_size = strlen (msg) + 1; /* Includes the null. 
*/ - if (BE (errbuf_size != 0, 1)) + if (__glibc_likely (errbuf_size != 0)) { size_t cpy_size = msg_size; - if (BE (msg_size > errbuf_size, 0)) + if (__glibc_unlikely (msg_size > errbuf_size)) { cpy_size = errbuf_size - 1; errbuf[cpy_size] = '\0'; @@ -573,12 +525,9 @@ regerror (int errcode, const regex_t *_Restrict_ preg, return msg_size; } -#ifdef _LIBC weak_alias (__regerror, regerror) -#endif -#ifdef RE_ENABLE_I18N /* This static array is used for the map to single-byte characters when UTF-8 is used. Otherwise we would allocate memory just to initialize it the same all the time. UTF-8 is the preferred encoding so this is @@ -586,25 +535,24 @@ weak_alias (__regerror, regerror) static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if defined __GNUC__ && !defined __STRICT_ANSI__ +#if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__ [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX -# else -# if 4 * BITSET_WORD_BITS < ASCII_CHARS -# error "bitset_word_t is narrower than 32 bits" -# elif 3 * BITSET_WORD_BITS < ASCII_CHARS +#else +# if 4 * BITSET_WORD_BITS < ASCII_CHARS +# error "bitset_word_t is narrower than 32 bits" +# elif 3 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 2 * BITSET_WORD_BITS < ASCII_CHARS +# elif 2 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, BITSET_WORD_MAX, -# elif 1 * BITSET_WORD_BITS < ASCII_CHARS +# elif 1 * BITSET_WORD_BITS < ASCII_CHARS BITSET_WORD_MAX, -# endif +# endif (BITSET_WORD_MAX >> (SBC_MAX % BITSET_WORD_BITS == 0 ? 
0 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) -# endif -}; #endif +}; static void @@ -642,10 +590,8 @@ free_dfa_content (re_dfa_t *dfa) re_free (entry->array); } re_free (dfa->state_table); -#ifdef RE_ENABLE_I18N if (dfa->sb_char != utf8_sb_map) re_free (dfa->sb_char); -#endif re_free (dfa->subexp_map); #ifdef DEBUG re_free (dfa->re_str); @@ -658,11 +604,10 @@ free_dfa_content (re_dfa_t *dfa) /* Free dynamically allocated space used by PREG. */ void -regfree (preg) - regex_t *preg; +regfree (regex_t *preg) { re_dfa_t *dfa = preg->buffer; - if (BE (dfa != NULL, 1)) + if (__glibc_likely (dfa != NULL)) { lock_fini (dfa->lock); free_dfa_content (dfa); @@ -676,9 +621,8 @@ regfree (preg) re_free (preg->translate); preg->translate = NULL; } -#ifdef _LIBC +libc_hidden_def (__regfree) weak_alias (__regfree, regfree) -#endif /* Entry points compatible with 4.2 BSD regex library. We don't define them unless specifically requested. */ @@ -695,8 +639,7 @@ char * regcomp/regexec above without link errors. */ weak_function # endif -re_comp (s) - const char *s; +re_comp (const char *s) { reg_errcode_t ret; char *fastmap; @@ -719,7 +662,7 @@ re_comp (s) if (re_comp_buf.fastmap == NULL) { - re_comp_buf.fastmap = (char *) malloc (SBC_MAX); + re_comp_buf.fastmap = re_malloc (char, SBC_MAX); if (re_comp_buf.fastmap == NULL) return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) REG_ESPACE]); @@ -772,7 +715,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, /* Initialize the dfa. */ dfa = preg->buffer; - if (BE (preg->allocated < sizeof (re_dfa_t), 0)) + if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t))) { /* If zero allocated, but buffer is non-null, try to realloc enough space. 
This loses if buffer's address is bogus, but @@ -787,9 +730,9 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, preg->used = sizeof (re_dfa_t); err = init_dfa (dfa, length); - if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0)) + if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0)) err = REG_ESPACE; - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) { free_dfa_content (dfa); preg->buffer = NULL; @@ -804,7 +747,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, err = re_string_construct (&regexp, pattern, length, preg->translate, (syntax & RE_ICASE) != 0, dfa); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) { re_compile_internal_free_return: free_workarea_compile (preg); @@ -819,19 +762,17 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, /* Parse the regular expression, and build a structure tree. */ preg->re_nsub = 0; dfa->str_tree = parse (&regexp, preg, syntax, &err); - if (BE (dfa->str_tree == NULL, 0)) + if (__glibc_unlikely (dfa->str_tree == NULL)) goto re_compile_internal_free_return; /* Analyze the tree and create the nfa. */ err = analyze (preg); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) goto re_compile_internal_free_return; -#ifdef RE_ENABLE_I18N /* If possible, do searching in single byte encoding to speed things up. */ if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) optimize_utf8 (dfa); -#endif /* Then create the initial state of the dfa. 
*/ err = create_initial_state (dfa); @@ -840,7 +781,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, free_workarea_compile (preg); re_string_destruct (&regexp); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) { lock_fini (dfa->lock); free_dfa_content (dfa); @@ -861,11 +802,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) #ifndef _LIBC const char *codeset_name; #endif -#ifdef RE_ENABLE_I18N size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); -#else - size_t max_i18n_object_size = 0; -#endif size_t max_object_size = MAX (sizeof (struct re_state_table_entry), MAX (sizeof (re_token_t), @@ -882,7 +819,8 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) calculation below, and for similar doubling calculations elsewhere. And it's <= rather than <, because some of the doubling calculations add 1 afterwards. */ - if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) + if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 + <= pat_len)) return REG_ESPACE; dfa->nodes_alloc = pat_len + 1; @@ -916,7 +854,6 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) dfa->map_notascii = 0; #endif -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) { if (dfa->is_utf8) @@ -926,7 +863,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) int i, j, ch; dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); - if (BE (dfa->sb_char == NULL, 0)) + if (__glibc_unlikely (dfa->sb_char == NULL)) return REG_ESPACE; /* Set the bits corresponding to single byte chars. 
*/ @@ -936,16 +873,15 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) wint_t wch = __btowc (ch); if (wch != WEOF) dfa->sb_char[i] |= (bitset_word_t) 1 << j; -# ifndef _LIBC +#ifndef _LIBC if (isascii (ch) && wch != ch) dfa->map_notascii = 1; -# endif +#endif } } } -#endif - if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) + if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL)) return REG_ESPACE; return REG_NOERROR; } @@ -955,14 +891,13 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) character used by some operators like "\<", "\>", etc. */ static void -internal_function init_word_char (re_dfa_t *dfa) { int i = 0; int j; int ch = 0; dfa->word_ops_used = 1; - if (BE (dfa->map_notascii == 0, 1)) + if (__glibc_likely (dfa->map_notascii == 0)) { bitset_word_t bits0 = 0x00000000; bitset_word_t bits1 = 0x03ff0000; @@ -970,6 +905,7 @@ init_word_char (re_dfa_t *dfa) bitset_word_t bits3 = 0x07fffffe; if (BITSET_WORD_BITS == 64) { + /* Pacify gcc -Woverflow on 32-bit platforms. */ dfa->word_char[0] = bits1 << 31 << 1 | bits0; dfa->word_char[1] = bits3 << 31 << 1 | bits2; i = 2; @@ -986,7 +922,7 @@ init_word_char (re_dfa_t *dfa) goto general_case; ch = 128; - if (BE (dfa->is_utf8, 1)) + if (__glibc_likely (dfa->is_utf8)) { memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); return; @@ -1033,7 +969,7 @@ create_initial_state (re_dfa_t *dfa) first = dfa->str_tree->first->node_idx; dfa->init_node = first; err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; /* The back-references which are in initial states can epsilon transit, @@ -1077,7 +1013,7 @@ create_initial_state (re_dfa_t *dfa) /* It must be the first time to invoke acquire_state. */ dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); /* We don't check ERR here, since the initial state must not be NULL. 
*/ - if (BE (dfa->init_state == NULL, 0)) + if (__glibc_unlikely (dfa->init_state == NULL)) return err; if (dfa->init_state->has_constraint) { @@ -1089,8 +1025,9 @@ create_initial_state (re_dfa_t *dfa) &init_nodes, CONTEXT_NEWLINE | CONTEXT_BEGBUF); - if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL - || dfa->init_state_begbuf == NULL, 0)) + if (__glibc_unlikely (dfa->init_state_word == NULL + || dfa->init_state_nl == NULL + || dfa->init_state_begbuf == NULL)) return err; } else @@ -1101,7 +1038,6 @@ create_initial_state (re_dfa_t *dfa) return REG_NOERROR; } -#ifdef RE_ENABLE_I18N /* If it is possible to do searching in single byte encoding instead of UTF-8 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change DFA nodes where needed. */ @@ -1181,7 +1117,6 @@ optimize_utf8 (re_dfa_t *dfa) dfa->is_utf8 = 0; dfa->has_mb_node = dfa->nbackref > 0 || has_period; } -#endif /* Analyze the structure tree, and calculate "first", "next", "edest", "eclosure", and "inveclosure". 
*/ @@ -1197,8 +1132,8 @@ analyze (regex_t *preg) dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); - if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL - || dfa->eclosures == NULL, 0)) + if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL + || dfa->edests == NULL || dfa->eclosures == NULL)) return REG_ESPACE; dfa->subexp_map = re_malloc (Idx, preg->re_nsub); @@ -1213,23 +1148,23 @@ analyze (regex_t *preg) break; if (i == preg->re_nsub) { - free (dfa->subexp_map); + re_free (dfa->subexp_map); dfa->subexp_map = NULL; } } ret = postorder (dfa->str_tree, lower_subexps, preg); - if (BE (ret != REG_NOERROR, 0)) + if (__glibc_unlikely (ret != REG_NOERROR)) return ret; ret = postorder (dfa->str_tree, calc_first, dfa); - if (BE (ret != REG_NOERROR, 0)) + if (__glibc_unlikely (ret != REG_NOERROR)) return ret; preorder (dfa->str_tree, calc_next, dfa); ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); - if (BE (ret != REG_NOERROR, 0)) + if (__glibc_unlikely (ret != REG_NOERROR)) return ret; ret = calc_eclosure (dfa); - if (BE (ret != REG_NOERROR, 0)) + if (__glibc_unlikely (ret != REG_NOERROR)) return ret; /* We only need this during the prune_impossible_nodes pass in regexec.c; @@ -1238,7 +1173,7 @@ analyze (regex_t *preg) || dfa->nbackref) { dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); - if (BE (dfa->inveclosures == NULL, 0)) + if (__glibc_unlikely (dfa->inveclosures == NULL)) return REG_ESPACE; ret = calc_inveclosure (dfa); } @@ -1268,7 +1203,7 @@ postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), do { reg_errcode_t err = fn (extra, node); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; if (node->parent == NULL) return REG_NOERROR; @@ -1290,7 +1225,7 @@ preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), for 
(node = root; ; ) { reg_errcode_t err = fn (extra, node); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; /* Go to the left node, or up and to the right. */ @@ -1391,7 +1326,8 @@ lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; tree = create_tree (dfa, op, tree1, CONCAT); - if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) + if (__glibc_unlikely (tree == NULL || tree1 == NULL + || op == NULL || cls == NULL)) { *err = REG_ESPACE; return NULL; @@ -1417,7 +1353,7 @@ calc_first (void *extra, bin_tree_t *node) { node->first = node; node->node_idx = re_dfa_add_node (dfa, node->token); - if (BE (node->node_idx == REG_MISSING, 0)) + if (__glibc_unlikely (node->node_idx == -1)) return REG_ESPACE; if (node->token.type == ANCHOR) dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; @@ -1462,7 +1398,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node) break; case END_OF_RE: - assert (node->next == NULL); + DEBUG_ASSERT (node->next == NULL); break; case OP_DUP_ASTERISK: @@ -1478,8 +1414,8 @@ link_nfa_nodes (void *extra, bin_tree_t *node) right = node->right->first->node_idx; else right = node->next->node_idx; - assert (REG_VALID_INDEX (left)); - assert (REG_VALID_INDEX (right)); + DEBUG_ASSERT (left > -1); + DEBUG_ASSERT (right > -1); err = re_node_set_init_2 (dfa->edests + idx, left, right); } break; @@ -1497,7 +1433,7 @@ link_nfa_nodes (void *extra, bin_tree_t *node) break; default: - assert (!IS_EPSILON_NODE (node->token.type)); + DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type)); dfa->nexts[idx] = node->next->node_idx; break; } @@ -1510,7 +1446,6 @@ link_nfa_nodes (void *extra, bin_tree_t *node) to their own constraint. 
*/ static reg_errcode_t -internal_function duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, Idx root_node, unsigned int init_constraint) { @@ -1529,11 +1464,11 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, org_dest = dfa->nexts[org_node]; re_node_set_empty (dfa->edests + clone_node); clone_dest = duplicate_node (dfa, org_dest, constraint); - if (BE (clone_dest == REG_MISSING, 0)) + if (__glibc_unlikely (clone_dest == -1)) return REG_ESPACE; dfa->nexts[clone_node] = dfa->nexts[org_node]; ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (! ok, 0)) + if (__glibc_unlikely (! ok)) return REG_ESPACE; } else if (dfa->edests[org_node].nelem == 0) @@ -1555,17 +1490,17 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, if (org_node == root_node && clone_node != org_node) { ok = re_node_set_insert (dfa->edests + clone_node, org_dest); - if (BE (! ok, 0)) + if (__glibc_unlikely (! ok)) return REG_ESPACE; break; } /* In case the node has another constraint, append it. */ constraint |= dfa->nodes[org_node].constraint; clone_dest = duplicate_node (dfa, org_dest, constraint); - if (BE (clone_dest == REG_MISSING, 0)) + if (__glibc_unlikely (clone_dest == -1)) return REG_ESPACE; ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (! ok, 0)) + if (__glibc_unlikely (! ok)) return REG_ESPACE; } else /* dfa->edests[org_node].nelem == 2 */ @@ -1576,19 +1511,19 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, re_node_set_empty (dfa->edests + clone_node); /* Search for a duplicated node which satisfies the constraint. */ clone_dest = search_duplicated_node (dfa, org_dest, constraint); - if (clone_dest == REG_MISSING) + if (clone_dest == -1) { /* There is no such duplicated node, create a new one. 
*/ reg_errcode_t err; clone_dest = duplicate_node (dfa, org_dest, constraint); - if (BE (clone_dest == REG_MISSING, 0)) + if (__glibc_unlikely (clone_dest == -1)) return REG_ESPACE; ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (! ok, 0)) + if (__glibc_unlikely (! ok)) return REG_ESPACE; err = duplicate_node_closure (dfa, org_dest, clone_dest, root_node, constraint); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; } else @@ -1596,16 +1531,16 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, /* There is a duplicated node which satisfies the constraint, use it to avoid infinite loop. */ ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (! ok, 0)) + if (__glibc_unlikely (! ok)) return REG_ESPACE; } org_dest = dfa->edests[org_node].elems[1]; clone_dest = duplicate_node (dfa, org_dest, constraint); - if (BE (clone_dest == REG_MISSING, 0)) + if (__glibc_unlikely (clone_dest == -1)) return REG_ESPACE; ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); - if (BE (! ok, 0)) + if (__glibc_unlikely (! ok)) return REG_ESPACE; } org_node = org_dest; @@ -1628,18 +1563,18 @@ search_duplicated_node (const re_dfa_t *dfa, Idx org_node, && constraint == dfa->nodes[idx].constraint) return idx; /* Found. */ } - return REG_MISSING; /* Not found. */ + return -1; /* Not found. */ } /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. - Return the index of the new node, or REG_MISSING if insufficient storage is + Return the index of the new node, or -1 if insufficient storage is available. 
*/ static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) { Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); - if (BE (dup_idx != REG_MISSING, 1)) + if (__glibc_likely (dup_idx != -1)) { dfa->nodes[dup_idx].constraint = constraint; dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; @@ -1665,7 +1600,7 @@ calc_inveclosure (re_dfa_t *dfa) for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) { ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); - if (BE (! ok, 0)) + if (__glibc_unlikely (! ok)) return REG_ESPACE; } } @@ -1680,9 +1615,7 @@ calc_eclosure (re_dfa_t *dfa) { Idx node_idx; bool incomplete; -#ifdef DEBUG - assert (dfa->nodes_len > 0); -#endif + DEBUG_ASSERT (dfa->nodes_len > 0); incomplete = false; /* For each nodes, calculate epsilon closure. */ for (node_idx = 0; ; ++node_idx) @@ -1697,16 +1630,14 @@ calc_eclosure (re_dfa_t *dfa) node_idx = 0; } -#ifdef DEBUG - assert (dfa->eclosures[node_idx].nelem != REG_MISSING); -#endif + DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1); /* If we have already calculated, skip it. */ if (dfa->eclosures[node_idx].nelem != 0) continue; /* Calculate epsilon closure of 'node_idx'. */ err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; if (dfa->eclosures[node_idx].nelem == 0) @@ -1726,15 +1657,17 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) reg_errcode_t err; Idx i; re_node_set eclosure; - bool ok; bool incomplete = false; err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; + /* An epsilon closure includes itself. */ + eclosure.elems[eclosure.nelem++] = node; + /* This indicates that we are calculating this node now. We reference this value to avoid infinite loop. 
*/ - dfa->eclosures[node].nelem = REG_MISSING; + dfa->eclosures[node].nelem = -1; /* If the current node has constraints, duplicate all nodes since they must inherit the constraints. */ @@ -1744,7 +1677,7 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) { err = duplicate_node_closure (dfa, node, node, node, dfa->nodes[node].constraint); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; } @@ -1756,7 +1689,7 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) Idx edest = dfa->edests[node].elems[i]; /* If calculating the epsilon closure of 'edest' is in progress, return intermediate result. */ - if (dfa->eclosures[edest].nelem == REG_MISSING) + if (dfa->eclosures[edest].nelem == -1) { incomplete = true; continue; @@ -1766,14 +1699,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) if (dfa->eclosures[edest].nelem == 0) { err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; } else eclosure_elem = dfa->eclosures[edest]; /* Merge the epsilon closure of 'edest'. */ err = re_node_set_merge (&eclosure, &eclosure_elem); - if (BE (err != REG_NOERROR, 0)) + if (__glibc_unlikely (err != REG_NOERROR)) return err; /* If the epsilon closure of 'edest' is incomplete, the epsilon closure of this node is also incomplete. */ @@ -1784,10 +1717,6 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) } } - /* An epsilon closure includes itself. */ - ok = re_node_set_insert (&eclosure, node); - if (BE (! ok, 0)) - return REG_ESPACE; if (incomplete && !root) dfa->eclosures[node].nelem = 0; else @@ -1802,7 +1731,6 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) We must not use this function inside bracket expressions. 
*/ static void -internal_function fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) { re_string_skip_bytes (input, peek_token (result, input, syntax)); @@ -1812,7 +1740,6 @@ fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) We must not use this function inside bracket expressions. */ static int -internal_function peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) { unsigned char c; @@ -1827,16 +1754,14 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) token->opr.c = c; token->word_char = 0; -#ifdef RE_ENABLE_I18N token->mb_partial = 0; - if (input->mb_cur_max > 1 && - !re_string_first_byte (input, re_string_cur_idx (input))) + if (input->mb_cur_max > 1 + && !re_string_first_byte (input, re_string_cur_idx (input))) { token->type = CHARACTER; token->mb_partial = 1; return 1; } -#endif if (c == '\\') { unsigned char c2; @@ -1849,7 +1774,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) c2 = re_string_peek_byte_case (input, 1); token->opr.c = c2; token->type = CHARACTER; -#ifdef RE_ENABLE_I18N if (input->mb_cur_max > 1) { wint_t wc = re_string_wchar_at (input, @@ -1857,7 +1781,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; } else -#endif token->word_char = IS_WORD_CHAR (c2) != 0; switch (c2) @@ -1963,14 +1886,12 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) } token->type = CHARACTER; -#ifdef RE_ENABLE_I18N if (input->mb_cur_max > 1) { wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; } else -#endif token->word_char = IS_WORD_CHAR (token->opr.c); switch (c) @@ -2017,8 +1938,8 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) token->type = OP_PERIOD; break; case '^': - if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && - re_string_cur_idx 
(input) != 0) + if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) + && re_string_cur_idx (input) != 0) { char prev = re_string_peek_byte (input, -1); if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') @@ -2028,8 +1949,8 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) token->opr.ctx_type = LINE_FIRST; break; case '$': - if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && - re_string_cur_idx (input) + 1 != re_string_length (input)) + if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) + && re_string_cur_idx (input) + 1 != re_string_length (input)) { re_token_t next; re_string_skip_bytes (input, 1); @@ -2051,7 +1972,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) We must not use this function out of bracket expressions. */ static int -internal_function peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) { unsigned char c; @@ -2063,14 +1983,12 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) c = re_string_peek_byte (input, 0); token->opr.c = c; -#ifdef RE_ENABLE_I18N - if (input->mb_cur_max > 1 && - !re_string_first_byte (input, re_string_cur_idx (input))) + if (input->mb_cur_max > 1 + && !re_string_first_byte (input, re_string_cur_idx (input))) { token->type = CHARACTER; return 1; } -#endif /* RE_ENABLE_I18N */ if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && re_string_cur_idx (input) + 1 < re_string_length (input)) @@ -2098,16 +2016,18 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) case '.': token->type = OP_OPEN_COLL_ELEM; break; + case '=': token->type = OP_OPEN_EQUIV_CLASS; break; + case ':': if (syntax & RE_CHAR_CLASSES) { token->type = OP_OPEN_CHAR_CLASS; break; } - /* else fall through. 
*/ + FALLTHROUGH; default: token->type = CHARACTER; token->opr.c = c; @@ -2118,15 +2038,25 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) } switch (c) { - case '-': - token->type = OP_CHARSET_RANGE; - break; case ']': token->type = OP_CLOSE_BRACKET; break; case '^': token->type = OP_NON_MATCH_LIST; break; + case '-': + /* In V7 Unix grep and Unix awk and mawk, [...---...] + (3 adjacent minus signs) stands for a single minus sign. + Support that without breaking anything else. */ + if (! (re_string_cur_idx (input) + 2 < re_string_length (input) + && re_string_peek_byte (input, 1) == '-' + && re_string_peek_byte (input, 2) == '-')) + { + token->type = OP_CHARSET_RANGE; + break; + } + re_string_skip_bytes (input, 2); + FALLTHROUGH; default: token->type = CHARACTER; } @@ -2157,14 +2087,14 @@ parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, dfa->syntax = syntax; fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE); tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err); - if (BE (*err != REG_NOERROR && tree == NULL, 0)) + if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) return NULL; eor = create_tree (dfa, NULL, NULL, END_OF_RE); if (tree != NULL) root = create_tree (dfa, tree, eor, CONCAT); else root = eor; - if (BE (eor == NULL || root == NULL, 0)) + if (__glibc_unlikely (eor == NULL || root == NULL)) { *err = REG_ESPACE; return NULL; @@ -2187,8 +2117,9 @@ parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, { re_dfa_t *dfa = preg->buffer; bin_tree_t *tree, *branch = NULL; + bitset_word_t initial_bkref_map = dfa->completed_bkref_map; tree = parse_branch (regexp, preg, token, syntax, nest, err); - if (BE (*err != REG_NOERROR && tree == NULL, 0)) + if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) return NULL; while (token->type == OP_ALT) @@ -2197,14 +2128,21 @@ parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, if (token->type != OP_ALT && 
token->type != END_OF_RE && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) { + bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map; + dfa->completed_bkref_map = initial_bkref_map; branch = parse_branch (regexp, preg, token, syntax, nest, err); - if (BE (*err != REG_NOERROR && branch == NULL, 0)) - return NULL; + if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL)) + { + if (tree != NULL) + postorder (tree, free_tree, NULL); + return NULL; + } + dfa->completed_bkref_map |= accumulated_bkref_map; } else branch = NULL; tree = create_tree (dfa, tree, branch, OP_ALT); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) { *err = REG_ESPACE; return NULL; @@ -2229,14 +2167,14 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, bin_tree_t *tree, *expr; re_dfa_t *dfa = preg->buffer; tree = parse_expression (regexp, preg, token, syntax, nest, err); - if (BE (*err != REG_NOERROR && tree == NULL, 0)) + if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) return NULL; while (token->type != OP_ALT && token->type != END_OF_RE && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) { expr = parse_expression (regexp, preg, token, syntax, nest, err); - if (BE (*err != REG_NOERROR && expr == NULL, 0)) + if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL)) { if (tree != NULL) postorder (tree, free_tree, NULL); @@ -2277,12 +2215,11 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, { case CHARACTER: tree = create_token_tree (dfa, NULL, NULL, token); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) { *err = REG_ESPACE; return NULL; } -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) { while (!re_string_eoi (regexp) @@ -2292,34 +2229,36 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, fetch_token (token, regexp, syntax); mbc_remain = create_token_tree (dfa, NULL, NULL, token); tree = create_tree (dfa, tree, mbc_remain, CONCAT); - if (BE (mbc_remain == NULL || 
tree == NULL, 0)) + if (__glibc_unlikely (mbc_remain == NULL || tree == NULL)) { *err = REG_ESPACE; return NULL; } } } -#endif break; + case OP_OPEN_SUBEXP: tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); - if (BE (*err != REG_NOERROR && tree == NULL, 0)) + if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) return NULL; break; + case OP_OPEN_BRACKET: tree = parse_bracket_exp (regexp, dfa, token, syntax, err); - if (BE (*err != REG_NOERROR && tree == NULL, 0)) + if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) return NULL; break; + case OP_BACK_REF: - if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) + if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx))) { *err = REG_ESUBREG; return NULL; } dfa->used_bkref_map |= 1 << token->opr.idx; tree = create_token_tree (dfa, NULL, NULL, token); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) { *err = REG_ESPACE; return NULL; @@ -2327,13 +2266,14 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, ++dfa->nbackref; dfa->has_mb_node = 1; break; + case OP_OPEN_DUP_NUM: if (syntax & RE_CONTEXT_INVALID_DUP) { *err = REG_BADRPT; return NULL; } - /* FALLTHROUGH */ + FALLTHROUGH; case OP_DUP_ASTERISK: case OP_DUP_PLUS: case OP_DUP_QUESTION: @@ -2347,15 +2287,15 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, fetch_token (token, regexp, syntax); return parse_expression (regexp, preg, token, syntax, nest, err); } - /* else fall through */ + FALLTHROUGH; case OP_CLOSE_SUBEXP: - if ((token->type == OP_CLOSE_SUBEXP) && - !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) + if ((token->type == OP_CLOSE_SUBEXP) + && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) { *err = REG_ERPAREN; return NULL; } - /* else fall through */ + FALLTHROUGH; case OP_CLOSE_DUP_NUM: /* We treat it as a normal character. 
*/ @@ -2364,12 +2304,13 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, /* mb_partial and word_char bits should be initialized already by peek_token. */ tree = create_token_tree (dfa, NULL, NULL, token); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) { *err = REG_ESPACE; return NULL; } break; + case ANCHOR: if ((token->opr.ctx_type & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) @@ -2393,7 +2334,8 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, } tree_last = create_token_tree (dfa, NULL, NULL, token); tree = create_tree (dfa, tree_first, tree_last, OP_ALT); - if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) + if (__glibc_unlikely (tree_first == NULL || tree_last == NULL + || tree == NULL)) { *err = REG_ESPACE; return NULL; @@ -2402,7 +2344,7 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, else { tree = create_token_tree (dfa, NULL, NULL, token); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) { *err = REG_ESPACE; return NULL; @@ -2414,9 +2356,10 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, it must not be "<ANCHOR(^)><REPEAT(*)>". 
*/ fetch_token (token, regexp, syntax); return tree; + case OP_PERIOD: tree = create_token_tree (dfa, NULL, NULL, token); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) { *err = REG_ESPACE; return NULL; @@ -2424,35 +2367,38 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, if (dfa->mb_cur_max > 1) dfa->has_mb_node = 1; break; + case OP_WORD: case OP_NOTWORD: tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", token->type == OP_NOTWORD, err); - if (BE (*err != REG_NOERROR && tree == NULL, 0)) + if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) return NULL; break; + case OP_SPACE: case OP_NOTSPACE: tree = build_charclass_op (dfa, regexp->trans, "space", "", token->type == OP_NOTSPACE, err); - if (BE (*err != REG_NOERROR && tree == NULL, 0)) + if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL)) return NULL; break; + case OP_ALT: case END_OF_RE: return NULL; + case BACK_SLASH: *err = REG_EESCAPE; return NULL; + default: /* Must not happen? */ -#ifdef DEBUG - assert (0); -#endif + DEBUG_ASSERT (false); return NULL; } fetch_token (token, regexp, syntax); @@ -2460,14 +2406,22 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) { - tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); - if (BE (*err != REG_NOERROR && tree == NULL, 0)) - return NULL; + bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, + syntax, err); + if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL)) + { + if (tree != NULL) + postorder (tree, free_tree, NULL); + return NULL; + } + tree = dup_tree; /* In BRE consecutive duplications are not allowed. 
*/ if ((syntax & RE_CONTEXT_INVALID_DUP) && (token->type == OP_DUP_ASTERISK || token->type == OP_OPEN_DUP_NUM)) { + if (tree != NULL) + postorder (tree, free_tree, NULL); *err = REG_BADRPT; return NULL; } @@ -2500,13 +2454,14 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, else { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); - if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) + if (__glibc_unlikely (*err == REG_NOERROR + && token->type != OP_CLOSE_SUBEXP)) { if (tree != NULL) postorder (tree, free_tree, NULL); *err = REG_EPAREN; } - if (BE (*err != REG_NOERROR, 0)) + if (__glibc_unlikely (*err != REG_NOERROR)) return NULL; } @@ -2514,7 +2469,7 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, dfa->completed_bkref_map |= 1 << cur_nsub; tree = create_tree (dfa, tree, NULL, SUBEXP); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) { *err = REG_ESPACE; return NULL; @@ -2537,7 +2492,7 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, { end = 0; start = fetch_number (regexp, token, syntax); - if (start == REG_MISSING) + if (start == -1) { if (token->type == CHARACTER && token->opr.c == ',') start = 0; /* We treat "{,m}" as "{0,m}". */ @@ -2547,17 +2502,17 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, return NULL; } } - if (BE (start != REG_ERROR, 1)) + if (__glibc_likely (start != -2)) { /* We treat "{n}" as "{n,n}". */ end = ((token->type == OP_CLOSE_DUP_NUM) ? start : ((token->type == CHARACTER && token->opr.c == ',') - ? fetch_number (regexp, token, syntax) : REG_ERROR)); + ? fetch_number (regexp, token, syntax) : -2)); } - if (BE (start == REG_ERROR || end == REG_ERROR, 0)) + if (__glibc_unlikely (start == -2 || end == -2)) { /* Invalid sequence. 
*/ - if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) + if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD))) { if (token->type == END_OF_RE) *err = REG_EBRACE; @@ -2576,15 +2531,15 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, return elem; } - if (BE ((end != REG_MISSING && start > end) - || token->type != OP_CLOSE_DUP_NUM, 0)) + if (__glibc_unlikely ((end != -1 && start > end) + || token->type != OP_CLOSE_DUP_NUM)) { /* First number greater than second. */ *err = REG_BADBR; return NULL; } - if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0)) + if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end))) { *err = REG_ESIZE; return NULL; @@ -2593,28 +2548,28 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, else { start = (token->type == OP_DUP_PLUS) ? 1 : 0; - end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; + end = (token->type == OP_DUP_QUESTION) ? 1 : -1; } fetch_token (token, regexp, syntax); - if (BE (elem == NULL, 0)) + if (__glibc_unlikely (elem == NULL)) return NULL; - if (BE (start == 0 && end == 0, 0)) + if (__glibc_unlikely (start == 0 && end == 0)) { postorder (elem, free_tree, NULL); return NULL; } /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ - if (BE (start > 0, 0)) + if (__glibc_unlikely (start > 0)) { tree = elem; for (i = 2; i <= start; ++i) { elem = duplicate_tree (elem, dfa); tree = create_tree (dfa, tree, elem, CONCAT); - if (BE (elem == NULL || tree == NULL, 0)) + if (__glibc_unlikely (elem == NULL || tree == NULL)) goto parse_dup_op_espace; } @@ -2623,6 +2578,8 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, /* Duplicate ELEM before it is marked optional. */ elem = duplicate_tree (elem, dfa); + if (__glibc_unlikely (elem == NULL)) + goto parse_dup_op_espace; old_tree = tree; } else @@ -2635,27 +2592,23 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, } tree = create_tree (dfa, elem, NULL, - (end == REG_MISSING ? 
OP_DUP_ASTERISK : OP_ALT)); - if (BE (tree == NULL, 0)) + (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); + if (__glibc_unlikely (tree == NULL)) goto parse_dup_op_espace; -/* From gnulib's "intprops.h": - True if the arithmetic type T is signed. */ -#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) - - /* This loop is actually executed only when end != REG_MISSING, + /* This loop is actually executed only when end != -1, to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have already created the start+1-th copy. */ - if (TYPE_SIGNED (Idx) || end != REG_MISSING) + if (TYPE_SIGNED (Idx) || end != -1) for (i = start + 2; i <= end; ++i) { elem = duplicate_tree (elem, dfa); tree = create_tree (dfa, tree, elem, CONCAT); - if (BE (elem == NULL || tree == NULL, 0)) + if (__glibc_unlikely (elem == NULL || tree == NULL)) goto parse_dup_op_espace; tree = create_tree (dfa, tree, NULL, OP_ALT); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) goto parse_dup_op_espace; } @@ -2674,146 +2627,131 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, #define BRACKET_NAME_BUF_SIZE 32 #ifndef _LIBC - /* Local function for parse_bracket_exp only used in case of NOT _LIBC. - Build the range expression which starts from START_ELEM, and ends - at END_ELEM. The result are written to MBCSET and SBCSET. - RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument since we may - update it. */ + +/* Convert the byte B to the corresponding wide character. In a + unibyte locale, treat B as itself. In a multibyte locale, return + WEOF if B is an encoding error. */ +static wint_t +parse_byte (unsigned char b, re_dfa_t const *dfa) +{ + return dfa->mb_cur_max > 1 ? __btowc (b) : b; +} + +/* Local function for parse_bracket_exp used in _LIBC environment. + Build the range expression which starts from START_ELEM, and ends + at END_ELEM. The result are written to MBCSET and SBCSET. 
+ RANGE_ALLOC is the allocated size of mbcset->range_starts, and + mbcset->range_ends, is a pointer argument since we may + update it. */ static reg_errcode_t -internal_function -# ifdef RE_ENABLE_I18N -build_range_exp (const reg_syntax_t syntax, - bitset_t sbcset, - re_charset_t *mbcset, - Idx *range_alloc, - const bracket_elem_t *start_elem, - const bracket_elem_t *end_elem) -# else /* not RE_ENABLE_I18N */ -build_range_exp (const reg_syntax_t syntax, - bitset_t sbcset, - const bracket_elem_t *start_elem, - const bracket_elem_t *end_elem) -# endif /* not RE_ENABLE_I18N */ +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, + bracket_elem_t *start_elem, bracket_elem_t *end_elem, + re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules, + const unsigned char *collseqmb, const char *collseqwc, + int_fast32_t table_size, const void *symb_table, + const unsigned char *extra) { - unsigned int start_ch, end_ch; /* Equivalence Classes and Character Classes can't be a range start/end. */ - if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS - || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, - 0)) + if (__glibc_unlikely (start_elem->type == EQUIV_CLASS + || start_elem->type == CHAR_CLASS + || end_elem->type == EQUIV_CLASS + || end_elem->type == CHAR_CLASS)) return REG_ERANGE; /* We can handle no multi character collating elements without libc support. */ - if (BE ((start_elem->type == COLL_SYM - && strlen ((char *) start_elem->opr.name) > 1) - || (end_elem->type == COLL_SYM - && strlen ((char *) end_elem->opr.name) > 1), 0)) + if (__glibc_unlikely ((start_elem->type == COLL_SYM + && strlen ((char *) start_elem->opr.name) > 1) + || (end_elem->type == COLL_SYM + && strlen ((char *) end_elem->opr.name) > 1))) return REG_ECOLLATE; -# ifdef RE_ENABLE_I18N - { - wchar_t wc; - wint_t start_wc; - wint_t end_wc; - + unsigned int start_ch = ((start_elem->type == SB_CHAR) ? 
start_elem->opr.ch : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] - : 0)); + : 0)), end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] : 0)); + wint_t start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) - ? __btowc (start_ch) : start_elem->opr.wch); + ? parse_byte (start_ch, dfa) : start_elem->opr.wch), end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) - ? __btowc (end_ch) : end_elem->opr.wch); - if (start_wc == WEOF || end_wc == WEOF) - return REG_ECOLLATE; - else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0)) - return REG_ERANGE; - - /* Got valid collation sequence values, add them as a new entry. - However, for !_LIBC we have no collation elements: if the - character set is single byte, the single byte character set - that we build below suffices. parse_bracket_exp passes - no MBCSET if dfa->mb_cur_max == 1. */ - if (mbcset) - { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) - { - /* There is not enough space, need realloc. */ - wchar_t *new_array_start, *new_array_end; - Idx new_nranges; - - /* +1 in case of mbcset->nranges is 0. */ - new_nranges = 2 * mbcset->nranges + 1; - /* Use realloc since mbcset->range_starts and mbcset->range_ends - are NULL if *range_alloc == 0. */ - new_array_start = re_realloc (mbcset->range_starts, wchar_t, - new_nranges); - new_array_end = re_realloc (mbcset->range_ends, wchar_t, - new_nranges); + ? 
parse_byte (end_ch, dfa) : end_elem->opr.wch); - if (BE (new_array_start == NULL || new_array_end == NULL, 0)) - return REG_ESPACE; + if (start_wc == WEOF || end_wc == WEOF) + return REG_ECOLLATE; + else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) + && start_wc > end_wc)) + return REG_ERANGE; - mbcset->range_starts = new_array_start; - mbcset->range_ends = new_array_end; - *range_alloc = new_nranges; - } + /* Got valid collation sequence values, add them as a new entry. + However, for !_LIBC we have no collation elements: if the + character set is single byte, the single byte character set + that we build below suffices. parse_bracket_exp passes + no MBCSET if dfa->mb_cur_max == 1. */ + if (dfa->mb_cur_max > 1) + { + /* Check the space of the arrays. */ + if (__glibc_unlikely (*range_alloc == mbcset->nranges)) + { + /* There is not enough space, need realloc. */ + wchar_t *new_array_start, *new_array_end; + Idx new_nranges; - mbcset->range_starts[mbcset->nranges] = start_wc; - mbcset->range_ends[mbcset->nranges++] = end_wc; - } + /* +1 in case of mbcset->nranges is 0. */ + new_nranges = 2 * mbcset->nranges + 1; + /* Use realloc since mbcset->range_starts and mbcset->range_ends + are NULL if *range_alloc == 0. */ + new_array_start = re_realloc (mbcset->range_starts, wchar_t, + new_nranges); + new_array_end = re_realloc (mbcset->range_ends, wchar_t, + new_nranges); + + if (__glibc_unlikely (new_array_start == NULL + || new_array_end == NULL)) + { + re_free (new_array_start); + re_free (new_array_end); + return REG_ESPACE; + } + + mbcset->range_starts = new_array_start; + mbcset->range_ends = new_array_end; + *range_alloc = new_nranges; + } + + mbcset->range_starts[mbcset->nranges] = start_wc; + mbcset->range_ends[mbcset->nranges++] = end_wc; + } + + /* Build the table for single byte characters. 
*/ + for (wchar_t wc = 0; wc < SBC_MAX; ++wc) + { + if (start_wc <= wc && wc <= end_wc) + bitset_set (sbcset, wc); + } - /* Build the table for single byte characters. */ - for (wc = 0; wc < SBC_MAX; ++wc) - { - if (start_wc <= wc && wc <= end_wc) - bitset_set (sbcset, wc); - } - } -# else /* not RE_ENABLE_I18N */ - { - unsigned int ch; - start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch - : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] - : 0)); - end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch - : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] - : 0)); - if (start_ch > end_ch) - return REG_ERANGE; - /* Build the table for single byte characters. */ - for (ch = 0; ch < SBC_MAX; ++ch) - if (start_ch <= ch && ch <= end_ch) - bitset_set (sbcset, ch); - } -# endif /* not RE_ENABLE_I18N */ return REG_NOERROR; } #endif /* not _LIBC */ #ifndef _LIBC -/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. +/* Helper function for parse_bracket_exp only used in case of NOT _LIBC. Build the collating element which is represented by NAME. The result are written to MBCSET and SBCSET. COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a pointer argument since we may update it. 
*/ static reg_errcode_t -internal_function -# ifdef RE_ENABLE_I18N build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, - Idx *coll_sym_alloc, const unsigned char *name) -# else /* not RE_ENABLE_I18N */ -build_collating_symbol (bitset_t sbcset, const unsigned char *name) -# endif /* not RE_ENABLE_I18N */ + Idx *coll_sym_alloc, const unsigned char *name, + uint_fast32_t nrules, int_fast32_t table_size, + const void *symb_table, const unsigned char *extra) { size_t name_len = strlen ((const char *) name); - if (BE (name_len != 1, 0)) + if (__glibc_unlikely (name_len != 1)) return REG_ECOLLATE; else { @@ -2823,267 +2761,280 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name) } #endif /* not _LIBC */ -/* This function parse bracket expression like "[abc]", "[a-c]", - "[[.a-a.]]" etc. */ - -static bin_tree_t * -parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, - reg_syntax_t syntax, reg_errcode_t *err) -{ #ifdef _LIBC - const unsigned char *collseqmb; - const char *collseqwc; - uint32_t nrules; - int32_t table_size; - const int32_t *symb_table; - const unsigned char *extra; - - /* Local function for parse_bracket_exp used in _LIBC environment. - Seek the collating symbol entry corresponding to NAME. - Return the index of the symbol in the SYMB_TABLE, - or -1 if not found. */ - - auto inline int32_t - __attribute__ ((always_inline)) - seek_collating_symbol_entry (const unsigned char *name, size_t name_len) - { - int32_t elem; - - for (elem = 0; elem < table_size; elem++) - if (symb_table[2 * elem] != 0) - { - int32_t idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - if (/* Compare the length of the name. */ - name_len == extra[idx] - /* Compare the name. */ - && memcmp (name, &extra[idx + 1], name_len) == 0) - /* Yep, this is the entry. */ - return elem; - } - return -1; - } +/* Local function for parse_bracket_exp used in _LIBC environment. 
+ Seek the collating symbol entry corresponding to NAME. + Return the index of the symbol in the SYMB_TABLE, + or -1 if not found. */ + +static __always_inline int32_t +seek_collating_symbol_entry (const unsigned char *name, size_t name_len, + const int32_t *symb_table, + int_fast32_t table_size, + const unsigned char *extra) +{ + int_fast32_t elem; - /* Local function for parse_bracket_exp used in _LIBC environment. - Look up the collation sequence value of BR_ELEM. - Return the value if succeeded, UINT_MAX otherwise. */ + for (elem = 0; elem < table_size; elem++) + if (symb_table[2 * elem] != 0) + { + int32_t idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + if (/* Compare the length of the name. */ + name_len == extra[idx] + /* Compare the name. */ + && memcmp (name, &extra[idx + 1], name_len) == 0) + /* Yep, this is the entry. */ + return elem; + } + return -1; +} - auto inline unsigned int - __attribute__ ((always_inline)) - lookup_collation_sequence_value (bracket_elem_t *br_elem) +/* Local function for parse_bracket_exp used in _LIBC environment. + Look up the collation sequence value of BR_ELEM. + Return the value if succeeded, UINT_MAX otherwise. 
*/ + +static __always_inline unsigned int +lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules, + const unsigned char *collseqmb, + const char *collseqwc, + int_fast32_t table_size, + const int32_t *symb_table, + const unsigned char *extra) +{ + if (br_elem->type == SB_CHAR) { - if (br_elem->type == SB_CHAR) - { - /* - if (MB_CUR_MAX == 1) - */ - if (nrules == 0) - return collseqmb[br_elem->opr.ch]; - else - { - wint_t wc = __btowc (br_elem->opr.ch); - return __collseq_table_lookup (collseqwc, wc); - } - } - else if (br_elem->type == MB_CHAR) + /* if (MB_CUR_MAX == 1) */ + if (nrules == 0) + return collseqmb[br_elem->opr.ch]; + else { - if (nrules != 0) - return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + wint_t wc = __btowc (br_elem->opr.ch); + return __collseq_table_lookup (collseqwc, wc); } - else if (br_elem->type == COLL_SYM) + } + else if (br_elem->type == MB_CHAR) + { + if (nrules != 0) + return __collseq_table_lookup (collseqwc, br_elem->opr.wch); + } + else if (br_elem->type == COLL_SYM) + { + size_t sym_name_len = strlen ((char *) br_elem->opr.name); + if (nrules != 0) { - size_t sym_name_len = strlen ((char *) br_elem->opr.name); - if (nrules != 0) + int32_t elem, idx; + elem = seek_collating_symbol_entry (br_elem->opr.name, + sym_name_len, + symb_table, table_size, + extra); + if (elem != -1) { - int32_t elem, idx; - elem = seek_collating_symbol_entry (br_elem->opr.name, - sym_name_len); - if (elem != -1) - { - /* We found the entry. */ - idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - /* Skip the byte sequence of the collating element. */ - idx += 1 + extra[idx]; - /* Adjust for the alignment. */ - idx = (idx + 3) & ~3; - /* Skip the multibyte collation sequence value. */ - idx += sizeof (unsigned int); - /* Skip the wide char sequence of the collating element. 
*/ - idx += sizeof (unsigned int) * - (1 + *(unsigned int *) (extra + idx)); - /* Return the collation sequence value. */ - return *(unsigned int *) (extra + idx); - } - else if (sym_name_len == 1) - { - /* No valid character. Match it as a single byte - character. */ - return collseqmb[br_elem->opr.name[0]]; - } + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + /* Skip the byte sequence of the collating element. */ + idx += 1 + extra[idx]; + /* Adjust for the alignment. */ + idx = (idx + 3) & ~3; + /* Skip the multibyte collation sequence value. */ + idx += sizeof (unsigned int); + /* Skip the wide char sequence of the collating element. */ + idx += sizeof (unsigned int) * + (1 + *(unsigned int *) (extra + idx)); + /* Return the collation sequence value. */ + return *(unsigned int *) (extra + idx); } else if (sym_name_len == 1) - return collseqmb[br_elem->opr.name[0]]; + { + /* No valid character. Match it as a single byte + character. */ + return collseqmb[br_elem->opr.name[0]]; + } } - return UINT_MAX; + else if (sym_name_len == 1) + return collseqmb[br_elem->opr.name[0]]; } + return UINT_MAX; +} - /* Local function for parse_bracket_exp used in _LIBC environment. - Build the range expression which starts from START_ELEM, and ends - at END_ELEM. The result are written to MBCSET and SBCSET. - RANGE_ALLOC is the allocated size of mbcset->range_starts, and - mbcset->range_ends, is a pointer argument since we may - update it. */ - - auto inline reg_errcode_t - __attribute__ ((always_inline)) - build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem) - { - unsigned int ch; - uint32_t start_collseq; - uint32_t end_collseq; - - /* Equivalence Classes and Character Classes can't be a range - start/end. 
*/ - if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS - || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, - 0)) - return REG_ERANGE; +/* Local function for parse_bracket_exp used in _LIBC environment. + Build the range expression which starts from START_ELEM, and ends + at END_ELEM. The result are written to MBCSET and SBCSET. + RANGE_ALLOC is the allocated size of mbcset->range_starts, and + mbcset->range_ends, is a pointer argument since we may + update it. */ + +static __always_inline reg_errcode_t +build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, + bracket_elem_t *start_elem, bracket_elem_t *end_elem, + re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules, + const unsigned char *collseqmb, const char *collseqwc, + int_fast32_t table_size, const int32_t *symb_table, + const unsigned char *extra) +{ + unsigned int ch; + uint32_t start_collseq; + uint32_t end_collseq; + + /* Equivalence Classes and Character Classes can't be a range + start/end. */ + if (__glibc_unlikely (start_elem->type == EQUIV_CLASS + || start_elem->type == CHAR_CLASS + || end_elem->type == EQUIV_CLASS + || end_elem->type == CHAR_CLASS)) + return REG_ERANGE; - /* FIXME: Implement rational ranges here, too. */ - start_collseq = lookup_collation_sequence_value (start_elem); - end_collseq = lookup_collation_sequence_value (end_elem); - /* Check start/end collation sequence values. */ - if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) - return REG_ECOLLATE; - if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) - return REG_ERANGE; + /* FIXME: Implement rational ranges here, too. */ + start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); + end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); + /* Check start/end collation sequence values. 
*/ + if (__glibc_unlikely (start_collseq == UINT_MAX + || end_collseq == UINT_MAX)) + return REG_ECOLLATE; + if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) + && start_collseq > end_collseq)) + return REG_ERANGE; - /* Got valid collation sequence values, add them as a new entry. - However, if we have no collation elements, and the character set - is single byte, the single byte character set that we - build below suffices. */ - if (nrules > 0 || dfa->mb_cur_max > 1) + /* Got valid collation sequence values, add them as a new entry. + However, if we have no collation elements, and the character set + is single byte, the single byte character set that we + build below suffices. */ + if (nrules > 0 || dfa->mb_cur_max > 1) + { + /* Check the space of the arrays. */ + if (__glibc_unlikely (*range_alloc == mbcset->nranges)) { - /* Check the space of the arrays. */ - if (BE (*range_alloc == mbcset->nranges, 0)) - { - /* There is not enough space, need realloc. */ - uint32_t *new_array_start; - uint32_t *new_array_end; - Idx new_nranges; - - /* +1 in case of mbcset->nranges is 0. */ - new_nranges = 2 * mbcset->nranges + 1; - new_array_start = re_realloc (mbcset->range_starts, uint32_t, - new_nranges); - new_array_end = re_realloc (mbcset->range_ends, uint32_t, - new_nranges); - - if (BE (new_array_start == NULL || new_array_end == NULL, 0)) - return REG_ESPACE; + /* There is not enough space, need realloc. */ + uint32_t *new_array_start; + uint32_t *new_array_end; + int new_nranges; - mbcset->range_starts = new_array_start; - mbcset->range_ends = new_array_end; - *range_alloc = new_nranges; - } + /* +1 in case of mbcset->nranges is 0. 
*/ + new_nranges = 2 * mbcset->nranges + 1; + new_array_start = re_realloc (mbcset->range_starts, uint32_t, + new_nranges); + new_array_end = re_realloc (mbcset->range_ends, uint32_t, + new_nranges); - mbcset->range_starts[mbcset->nranges] = start_collseq; - mbcset->range_ends[mbcset->nranges++] = end_collseq; - } + if (__glibc_unlikely (new_array_start == NULL + || new_array_end == NULL)) + return REG_ESPACE; - /* Build the table for single byte characters. */ - for (ch = 0; ch < SBC_MAX; ch++) - { - uint32_t ch_collseq; - /* - if (MB_CUR_MAX == 1) - */ - if (nrules == 0) - ch_collseq = collseqmb[ch]; - else - ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); - if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) - bitset_set (sbcset, ch); + mbcset->range_starts = new_array_start; + mbcset->range_ends = new_array_end; + *range_alloc = new_nranges; } - return REG_NOERROR; + + mbcset->range_starts[mbcset->nranges] = start_collseq; + mbcset->range_ends[mbcset->nranges++] = end_collseq; } - /* Local function for parse_bracket_exp used in _LIBC environment. - Build the collating element which is represented by NAME. - The result are written to MBCSET and SBCSET. - COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a - pointer argument since we may update it. */ + /* Build the table for single byte characters. */ + for (ch = 0; ch < SBC_MAX; ch++) + { + uint32_t ch_collseq; + /* if (MB_CUR_MAX == 1) */ + if (nrules == 0) + ch_collseq = collseqmb[ch]; + else + ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); + if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) + bitset_set (sbcset, ch); + } + return REG_NOERROR; +} - auto inline reg_errcode_t - __attribute__ ((always_inline)) - build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, - Idx *coll_sym_alloc, const unsigned char *name) +/* Local function for parse_bracket_exp used in _LIBC environment. + Build the collating element which is represented by NAME. 
+ The result are written to MBCSET and SBCSET. + COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a + pointer argument since we may update it. */ + +static __always_inline reg_errcode_t +build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, + Idx *coll_sym_alloc, const unsigned char *name, + uint_fast32_t nrules, int_fast32_t table_size, + const int32_t *symb_table, const unsigned char *extra) +{ + int32_t elem, idx; + size_t name_len = strlen ((const char *) name); + if (nrules != 0) { - int32_t elem, idx; - size_t name_len = strlen ((const char *) name); - if (nrules != 0) + elem = seek_collating_symbol_entry (name, name_len, symb_table, + table_size, extra); + if (elem != -1) { - elem = seek_collating_symbol_entry (name, name_len); - if (elem != -1) - { - /* We found the entry. */ - idx = symb_table[2 * elem + 1]; - /* Skip the name of collating element name. */ - idx += 1 + extra[idx]; - } - else if (name_len == 1) - { - /* No valid character, treat it as a normal - character. */ - bitset_set (sbcset, name[0]); - return REG_NOERROR; - } - else - return REG_ECOLLATE; - - /* Got valid collation sequence, add it as a new entry. */ - /* Check the space of the arrays. */ - if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) - { - /* Not enough, realloc it. */ - /* +1 in case of mbcset->ncoll_syms is 0. */ - Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; - /* Use realloc since mbcset->coll_syms is NULL - if *alloc == 0. */ - int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, - new_coll_sym_alloc); - if (BE (new_coll_syms == NULL, 0)) - return REG_ESPACE; - mbcset->coll_syms = new_coll_syms; - *coll_sym_alloc = new_coll_sym_alloc; - } - mbcset->coll_syms[mbcset->ncoll_syms++] = idx; + /* We found the entry. */ + idx = symb_table[2 * elem + 1]; + /* Skip the name of collating element name. */ + idx += 1 + extra[idx]; + } + else if (name_len == 1) + { + /* No valid character, treat it as a normal + character. 
*/ + bitset_set (sbcset, name[0]); return REG_NOERROR; } else + return REG_ECOLLATE; + + /* Got valid collation sequence, add it as a new entry. */ + /* Check the space of the arrays. */ + if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) { - if (BE (name_len != 1, 0)) - return REG_ECOLLATE; - else - { - bitset_set (sbcset, name[0]); - return REG_NOERROR; - } + /* Not enough, realloc it. */ + /* +1 in case of mbcset->ncoll_syms is 0. */ + int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; + /* Use realloc since mbcset->coll_syms is NULL + if *alloc == 0. */ + int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, + new_coll_sym_alloc); + if (__glibc_unlikely (new_coll_syms == NULL)) + return REG_ESPACE; + mbcset->coll_syms = new_coll_syms; + *coll_sym_alloc = new_coll_sym_alloc; } + mbcset->coll_syms[mbcset->ncoll_syms++] = idx; + return REG_NOERROR; } -#endif + else + { + if (__glibc_unlikely (name_len != 1)) + return REG_ECOLLATE; + else + { + bitset_set (sbcset, name[0]); + return REG_NOERROR; + } + } +} +#endif /* _LIBC */ + +/* This function parse bracket expression like "[abc]", "[a-c]", + "[[.a-a.]]" etc. 
*/ + +static bin_tree_t * +parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, + reg_syntax_t syntax, reg_errcode_t *err) +{ + const unsigned char *collseqmb = NULL; + const char *collseqwc = NULL; + uint_fast32_t nrules = 0; + int_fast32_t table_size = 0; + const void *symb_table = NULL; + const unsigned char *extra = NULL; re_token_t br_token; re_bitset_ptr_t sbcset; -#ifdef RE_ENABLE_I18N re_charset_t *mbcset; Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; Idx equiv_class_alloc = 0, char_class_alloc = 0; -#endif /* not RE_ENABLE_I18N */ bool non_match = false; bin_tree_t *work_tree; int token_len; @@ -3099,47 +3050,36 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, */ collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); - symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, - _NL_COLLATE_SYMB_TABLEMB); + symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB); extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); } #endif sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); -#ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); -#endif /* RE_ENABLE_I18N */ -#ifdef RE_ENABLE_I18N - if (BE (sbcset == NULL || mbcset == NULL, 0)) -#else - if (BE (sbcset == NULL, 0)) -#endif /* RE_ENABLE_I18N */ + if (__glibc_unlikely (sbcset == NULL || mbcset == NULL)) { re_free (sbcset); -#ifdef RE_ENABLE_I18N re_free (mbcset); -#endif *err = REG_ESPACE; return NULL; } token_len = peek_token_bracket (token, regexp, syntax); - if (BE (token->type == END_OF_RE, 0)) + if (__glibc_unlikely (token->type == END_OF_RE)) { *err = REG_BADPAT; goto parse_bracket_exp_free_return; } if (token->type == OP_NON_MATCH_LIST) { -#ifdef RE_ENABLE_I18N mbcset->non_match = 1; -#endif /* not RE_ENABLE_I18N */ non_match = true; if (syntax & RE_HAT_LISTS_NOT_NEWLINE) bitset_set (sbcset, 
'\n'); re_string_skip_bytes (regexp, token_len); /* Skip a token. */ token_len = peek_token_bracket (token, regexp, syntax); - if (BE (token->type == END_OF_RE, 0)) + if (__glibc_unlikely (token->type == END_OF_RE)) { *err = REG_BADPAT; goto parse_bracket_exp_free_return; @@ -3161,9 +3101,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, re_token_t token2; start_elem.opr.name = start_name_buf; + start_elem.type = COLL_SYM; ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, syntax, first_round); - if (BE (ret != REG_NOERROR, 0)) + if (__glibc_unlikely (ret != REG_NOERROR)) { *err = ret; goto parse_bracket_exp_free_return; @@ -3176,7 +3117,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, /* Do not check for ranges if we know they are not allowed. */ if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) { - if (BE (token->type == END_OF_RE, 0)) + if (__glibc_unlikely (token->type == END_OF_RE)) { *err = REG_EBRACK; goto parse_bracket_exp_free_return; @@ -3185,7 +3126,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, { re_string_skip_bytes (regexp, token_len); /* Skip '-'. 
*/ token_len2 = peek_token_bracket (&token2, regexp, syntax); - if (BE (token2.type == END_OF_RE, 0)) + if (__glibc_unlikely (token2.type == END_OF_RE)) { *err = REG_EBRACK; goto parse_bracket_exp_free_return; @@ -3204,9 +3145,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, if (is_range_exp == true) { end_elem.opr.name = end_name_buf; + end_elem.type = COLL_SYM; ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, dfa, syntax, true); - if (BE (ret != REG_NOERROR, 0)) + if (__glibc_unlikely (ret != REG_NOERROR)) { *err = ret; goto parse_bracket_exp_free_return; @@ -3214,19 +3156,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, token_len = peek_token_bracket (token, regexp, syntax); -#ifdef _LIBC *err = build_range_exp (sbcset, mbcset, &range_alloc, - &start_elem, &end_elem); -#else -# ifdef RE_ENABLE_I18N - *err = build_range_exp (syntax, sbcset, - dfa->mb_cur_max > 1 ? mbcset : NULL, - &range_alloc, &start_elem, &end_elem); -# else - *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); -# endif -#endif /* RE_ENABLE_I18N */ - if (BE (*err != REG_NOERROR, 0)) + &start_elem, &end_elem, + dfa, syntax, nrules, collseqmb, collseqwc, + table_size, symb_table, extra); + if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; } else @@ -3236,10 +3170,9 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, case SB_CHAR: bitset_set (sbcset, start_elem.opr.ch); break; -#ifdef RE_ENABLE_I18N case MB_CHAR: /* Check whether the array has enough space. */ - if (BE (mbchar_alloc == mbcset->nmbchars, 0)) + if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars)) { wchar_t *new_mbchars; /* Not enough, realloc it. */ @@ -3248,47 +3181,41 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, /* Use realloc since array is NULL if *alloc == 0. 
*/ new_mbchars = re_realloc (mbcset->mbchars, wchar_t, mbchar_alloc); - if (BE (new_mbchars == NULL, 0)) + if (__glibc_unlikely (new_mbchars == NULL)) goto parse_bracket_exp_espace; mbcset->mbchars = new_mbchars; } mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; break; -#endif /* RE_ENABLE_I18N */ case EQUIV_CLASS: *err = build_equiv_class (sbcset, -#ifdef RE_ENABLE_I18N mbcset, &equiv_class_alloc, -#endif /* RE_ENABLE_I18N */ start_elem.opr.name); - if (BE (*err != REG_NOERROR, 0)) + if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; break; case COLL_SYM: *err = build_collating_symbol (sbcset, -#ifdef RE_ENABLE_I18N mbcset, &coll_sym_alloc, -#endif /* RE_ENABLE_I18N */ - start_elem.opr.name); - if (BE (*err != REG_NOERROR, 0)) + start_elem.opr.name, + nrules, table_size, symb_table, extra); + if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; break; case CHAR_CLASS: *err = build_charclass (regexp->trans, sbcset, -#ifdef RE_ENABLE_I18N mbcset, &char_class_alloc, -#endif /* RE_ENABLE_I18N */ (const char *) start_elem.opr.name, syntax); - if (BE (*err != REG_NOERROR, 0)) + if (__glibc_unlikely (*err != REG_NOERROR)) goto parse_bracket_exp_free_return; break; default: - assert (0); + DEBUG_ASSERT (false); break; } } - if (BE (token->type == END_OF_RE, 0)) + if (__glibc_unlikely (token->type == END_OF_RE)) { *err = REG_EBRACK; goto parse_bracket_exp_free_return; @@ -3303,7 +3230,6 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, if (non_match) bitset_not (sbcset); -#ifdef RE_ENABLE_I18N /* Ensure only single byte characters are set. 
*/ if (dfa->mb_cur_max > 1) bitset_mask (sbcset, dfa->sb_char); @@ -3319,7 +3245,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, br_token.type = COMPLEX_BRACKET; br_token.opr.mbcset = mbcset; mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (mbc_tree == NULL, 0)) + if (__glibc_unlikely (mbc_tree == NULL)) goto parse_bracket_exp_espace; for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) if (sbcset[sbc_idx]) @@ -3332,12 +3258,12 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, br_token.type = SIMPLE_BRACKET; br_token.opr.sbcset = sbcset; work_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (work_tree == NULL, 0)) + if (__glibc_unlikely (work_tree == NULL)) goto parse_bracket_exp_espace; /* Then join them by ALT node. */ work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); - if (BE (work_tree == NULL, 0)) + if (__glibc_unlikely (work_tree == NULL)) goto parse_bracket_exp_espace; } else @@ -3347,16 +3273,13 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, } } else -#endif /* not RE_ENABLE_I18N */ { -#ifdef RE_ENABLE_I18N free_charset (mbcset); -#endif /* Build a tree for simple bracket. 
*/ br_token.type = SIMPLE_BRACKET; br_token.opr.sbcset = sbcset; work_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (work_tree == NULL, 0)) + if (__glibc_unlikely (work_tree == NULL)) goto parse_bracket_exp_espace; } return work_tree; @@ -3365,9 +3288,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, *err = REG_ESPACE; parse_bracket_exp_free_return: re_free (sbcset); -#ifdef RE_ENABLE_I18N free_charset (mbcset); -#endif /* RE_ENABLE_I18N */ return NULL; } @@ -3378,7 +3299,6 @@ parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, re_token_t *token, int token_len, re_dfa_t *dfa, reg_syntax_t syntax, bool accept_hyphen) { -#ifdef RE_ENABLE_I18N int cur_char_size; cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); if (cur_char_size > 1) @@ -3388,12 +3308,11 @@ parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, re_string_skip_bytes (regexp, cur_char_size); return REG_NOERROR; } -#endif /* RE_ENABLE_I18N */ re_string_skip_bytes (regexp, token_len); /* Skip a token. */ if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS || token->type == OP_OPEN_EQUIV_CLASS) return parse_bracket_symbol (elem, regexp, token); - if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) + if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen) { /* A '-' must only appear as anything but a range indicator before the closing bracket. Everything else is an error. */ @@ -3461,12 +3380,8 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, is a pointer argument since we may update it. 
*/ static reg_errcode_t -#ifdef RE_ENABLE_I18N build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, Idx *equiv_class_alloc, const unsigned char *name) -#else /* not RE_ENABLE_I18N */ -build_equiv_class (bitset_t sbcset, const unsigned char *name) -#endif /* not RE_ENABLE_I18N */ { #ifdef _LIBC uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); @@ -3478,8 +3393,6 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) int32_t idx1, idx2; unsigned int ch; size_t len; - /* This #include defines a local function! */ -# include /* Calculate the index for equivalence class. */ cp = name; table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); @@ -3489,8 +3402,8 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx1 = findidx (&cp, -1); - if (BE (idx1 == 0 || *cp != '\0', 0)) + idx1 = findidx (table, indirect, extra, &cp, -1); + if (__glibc_unlikely (idx1 == 0 || *cp != '\0')) /* This isn't a valid character. */ return REG_ECOLLATE; @@ -3500,7 +3413,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) { char_buf[0] = ch; cp = char_buf; - idx2 = findidx (&cp, 1); + idx2 = findidx (table, indirect, extra, &cp, 1); /* idx2 = table[ch]; */ @@ -3509,21 +3422,13 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) continue; /* Compare only if the length matches and the collation rule index is the same. 
*/ - if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) - { - int cnt = 0; - - while (cnt <= len && - weights[(idx1 & 0xffffff) + 1 + cnt] - == weights[(idx2 & 0xffffff) + 1 + cnt]) - ++cnt; - - if (cnt > len) - bitset_set (sbcset, ch); - } + if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24) + && memcmp (weights + (idx1 & 0xffffff) + 1, + weights + (idx2 & 0xffffff) + 1, len) == 0) + bitset_set (sbcset, ch); } /* Check whether the array has enough space. */ - if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) + if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes)) { /* Not enough, realloc it. */ /* +1 in case of mbcset->nequiv_classes is 0. */ @@ -3532,7 +3437,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, int32_t, new_equiv_class_alloc); - if (BE (new_equiv_classes == NULL, 0)) + if (__glibc_unlikely (new_equiv_classes == NULL)) return REG_ESPACE; mbcset->equiv_classes = new_equiv_classes; *equiv_class_alloc = new_equiv_class_alloc; @@ -3542,7 +3447,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) else #endif /* _LIBC */ { - if (BE (strlen ((const char *) name) != 1, 0)) + if (__glibc_unlikely (strlen ((const char *) name) != 1)) return REG_ECOLLATE; bitset_set (sbcset, *name); } @@ -3556,14 +3461,9 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) is a pointer argument since we may update it. 
*/ static reg_errcode_t -#ifdef RE_ENABLE_I18N build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, re_charset_t *mbcset, Idx *char_class_alloc, const char *class_name, reg_syntax_t syntax) -#else /* not RE_ENABLE_I18N */ -build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, - const char *class_name, reg_syntax_t syntax) -#endif /* not RE_ENABLE_I18N */ { int i; const char *name = class_name; @@ -3574,9 +3474,8 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) name = "alpha"; -#ifdef RE_ENABLE_I18N /* Check the space of the arrays. */ - if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) + if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes)) { /* Not enough, realloc it. */ /* +1 in case of mbcset->nchar_classes is 0. */ @@ -3584,17 +3483,16 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, /* Use realloc since array is NULL if *alloc == 0. */ wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, new_char_class_alloc); - if (BE (new_char_classes == NULL, 0)) + if (__glibc_unlikely (new_char_classes == NULL)) return REG_ESPACE; mbcset->char_classes = new_char_classes; *char_class_alloc = new_char_class_alloc; } mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); -#endif /* RE_ENABLE_I18N */ #define BUILD_CHARCLASS_LOOP(ctype_func) \ do { \ - if (BE (trans != NULL, 0)) \ + if (__glibc_unlikely (trans != NULL)) \ { \ for (i = 0; i < SBC_MAX; ++i) \ if (ctype_func (i)) \ @@ -3645,49 +3543,33 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, reg_errcode_t *err) { re_bitset_ptr_t sbcset; -#ifdef RE_ENABLE_I18N re_charset_t *mbcset; Idx alloc = 0; -#endif /* not RE_ENABLE_I18N */ reg_errcode_t ret; - re_token_t br_token; bin_tree_t *tree; sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); -#ifdef RE_ENABLE_I18N - mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); -#endif /* RE_ENABLE_I18N */ - 
-#ifdef RE_ENABLE_I18N - if (BE (sbcset == NULL || mbcset == NULL, 0)) -#else /* not RE_ENABLE_I18N */ - if (BE (sbcset == NULL, 0)) -#endif /* not RE_ENABLE_I18N */ + if (__glibc_unlikely (sbcset == NULL)) { *err = REG_ESPACE; return NULL; } - - if (non_match) + mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); + if (__glibc_unlikely (mbcset == NULL)) { -#ifdef RE_ENABLE_I18N - mbcset->non_match = 1; -#endif /* not RE_ENABLE_I18N */ + re_free (sbcset); + *err = REG_ESPACE; + return NULL; } + mbcset->non_match = non_match; /* We don't care the syntax in this case. */ - ret = build_charclass (trans, sbcset, -#ifdef RE_ENABLE_I18N - mbcset, &alloc, -#endif /* RE_ENABLE_I18N */ - class_name, 0); + ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0); - if (BE (ret != REG_NOERROR, 0)) + if (__glibc_unlikely (ret != REG_NOERROR)) { re_free (sbcset); -#ifdef RE_ENABLE_I18N free_charset (mbcset); -#endif /* RE_ENABLE_I18N */ *err = ret; return NULL; } @@ -3699,20 +3581,16 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, if (non_match) bitset_not (sbcset); -#ifdef RE_ENABLE_I18N /* Ensure only single byte characters are set. */ if (dfa->mb_cur_max > 1) bitset_mask (sbcset, dfa->sb_char); -#endif /* Build a tree for simple bracket. */ - br_token.type = SIMPLE_BRACKET; - br_token.opr.sbcset = sbcset; + re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset }; tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (tree == NULL, 0)) + if (__glibc_unlikely (tree == NULL)) goto build_word_op_espace; -#ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) { bin_tree_t *mbc_tree; @@ -3721,11 +3599,11 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, br_token.opr.mbcset = mbcset; dfa->has_mb_node = 1; mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); - if (BE (mbc_tree == NULL, 0)) + if (__glibc_unlikely (mbc_tree == NULL)) goto build_word_op_espace; /* Then join them by ALT node. 
*/ tree = create_tree (dfa, tree, mbc_tree, OP_ALT); - if (BE (mbc_tree != NULL, 1)) + if (__glibc_likely (mbc_tree != NULL)) return tree; } else @@ -3733,63 +3611,55 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, free_charset (mbcset); return tree; } -#else /* not RE_ENABLE_I18N */ - return tree; -#endif /* not RE_ENABLE_I18N */ build_word_op_espace: re_free (sbcset); -#ifdef RE_ENABLE_I18N free_charset (mbcset); -#endif /* RE_ENABLE_I18N */ *err = REG_ESPACE; return NULL; } /* This is intended for the expressions like "a{1,3}". Fetch a number from 'input', and return the number. - Return REG_MISSING if the number field is empty like "{,1}". + Return -1 if the number field is empty like "{,1}". Return RE_DUP_MAX + 1 if the number field is too large. - Return REG_ERROR if an error occurred. */ + Return -2 if an error occurred. */ static Idx fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) { - Idx num = REG_MISSING; + Idx num = -1; unsigned char c; while (1) { fetch_token (token, input, syntax); c = token->opr.c; - if (BE (token->type == END_OF_RE, 0)) - return REG_ERROR; + if (__glibc_unlikely (token->type == END_OF_RE)) + return -2; if (token->type == OP_CLOSE_DUP_NUM || c == ',') break; - num = ((token->type != CHARACTER || c < '0' || '9' < c - || num == REG_ERROR) - ? REG_ERROR - : num == REG_MISSING + num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) + ? -2 + : num == -1 ? c - '0' : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); } return num; } -#ifdef RE_ENABLE_I18N static void free_charset (re_charset_t *cset) { re_free (cset->mbchars); -# ifdef _LIBC +#ifdef _LIBC re_free (cset->coll_syms); re_free (cset->equiv_classes); +#endif re_free (cset->range_starts); re_free (cset->range_ends); -# endif re_free (cset->char_classes); re_free (cset); } -#endif /* RE_ENABLE_I18N */ /* Functions for binary tree operation. 
*/ @@ -3799,8 +3669,7 @@ static bin_tree_t * create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, re_token_type_t type) { - re_token_t t; - t.type = type; + re_token_t t = { .type = type }; return create_token_tree (dfa, left, right, &t); } @@ -3809,7 +3678,7 @@ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, const re_token_t *token) { bin_tree_t *tree; - if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) + if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE)) { bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); @@ -3829,7 +3698,7 @@ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, tree->token.opt_subexp = 0; tree->first = NULL; tree->next = NULL; - tree->node_idx = REG_MISSING; + tree->node_idx = -1; if (left != NULL) left->parent = tree; @@ -3856,13 +3725,10 @@ mark_opt_subexp (void *extra, bin_tree_t *node) static void free_token (re_token_t *node) { -#ifdef RE_ENABLE_I18N if (node->type == COMPLEX_BRACKET && node->duplicated == 0) free_charset (node->opr.mbcset); - else -#endif /* RE_ENABLE_I18N */ - if (node->type == SIMPLE_BRACKET && node->duplicated == 0) - re_free (node->opr.sbcset); + else if (node->type == SIMPLE_BRACKET && node->duplicated == 0) + re_free (node->opr.sbcset); } /* Worker function for tree walking. Free the allocated memory inside NODE -- cgit v1.2.3-74-g34f1