diff options
Diffstat (limited to 'gl/regcomp.c')
-rw-r--r-- | gl/regcomp.c | 831 |
1 files changed, 351 insertions, 480 deletions
diff --git a/gl/regcomp.c b/gl/regcomp.c index 887e5b5..122c3de 100644 --- a/gl/regcomp.c +++ b/gl/regcomp.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Extended regular expression matching and search library. | 1 | /* Extended regular expression matching and search library. |
2 | Copyright (C) 2002-2021 Free Software Foundation, Inc. | 2 | Copyright (C) 2002-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. | 3 | This file is part of the GNU C Library. |
4 | Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. | 4 | Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. |
5 | 5 | ||
@@ -27,14 +27,10 @@ static void re_compile_fastmap_iter (regex_t *bufp, | |||
27 | const re_dfastate_t *init_state, | 27 | const re_dfastate_t *init_state, |
28 | char *fastmap); | 28 | char *fastmap); |
29 | static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); | 29 | static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); |
30 | #ifdef RE_ENABLE_I18N | ||
31 | static void free_charset (re_charset_t *cset); | 30 | static void free_charset (re_charset_t *cset); |
32 | #endif /* RE_ENABLE_I18N */ | ||
33 | static void free_workarea_compile (regex_t *preg); | 31 | static void free_workarea_compile (regex_t *preg); |
34 | static reg_errcode_t create_initial_state (re_dfa_t *dfa); | 32 | static reg_errcode_t create_initial_state (re_dfa_t *dfa); |
35 | #ifdef RE_ENABLE_I18N | ||
36 | static void optimize_utf8 (re_dfa_t *dfa); | 33 | static void optimize_utf8 (re_dfa_t *dfa); |
37 | #endif | ||
38 | static reg_errcode_t analyze (regex_t *preg); | 34 | static reg_errcode_t analyze (regex_t *preg); |
39 | static reg_errcode_t preorder (bin_tree_t *root, | 35 | static reg_errcode_t preorder (bin_tree_t *root, |
40 | reg_errcode_t (fn (void *, bin_tree_t *)), | 36 | reg_errcode_t (fn (void *, bin_tree_t *)), |
@@ -89,7 +85,6 @@ static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, | |||
89 | static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, | 85 | static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, |
90 | re_string_t *regexp, | 86 | re_string_t *regexp, |
91 | re_token_t *token); | 87 | re_token_t *token); |
92 | #ifdef RE_ENABLE_I18N | ||
93 | static reg_errcode_t build_equiv_class (bitset_t sbcset, | 88 | static reg_errcode_t build_equiv_class (bitset_t sbcset, |
94 | re_charset_t *mbcset, | 89 | re_charset_t *mbcset, |
95 | Idx *equiv_class_alloc, | 90 | Idx *equiv_class_alloc, |
@@ -100,14 +95,6 @@ static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, | |||
100 | Idx *char_class_alloc, | 95 | Idx *char_class_alloc, |
101 | const char *class_name, | 96 | const char *class_name, |
102 | reg_syntax_t syntax); | 97 | reg_syntax_t syntax); |
103 | #else /* not RE_ENABLE_I18N */ | ||
104 | static reg_errcode_t build_equiv_class (bitset_t sbcset, | ||
105 | const unsigned char *name); | ||
106 | static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, | ||
107 | bitset_t sbcset, | ||
108 | const char *class_name, | ||
109 | reg_syntax_t syntax); | ||
110 | #endif /* not RE_ENABLE_I18N */ | ||
111 | static bin_tree_t *build_charclass_op (re_dfa_t *dfa, | 98 | static bin_tree_t *build_charclass_op (re_dfa_t *dfa, |
112 | RE_TRANSLATE_TYPE trans, | 99 | RE_TRANSLATE_TYPE trans, |
113 | const char *class_name, | 100 | const char *class_name, |
@@ -279,8 +266,7 @@ re_compile_fastmap (struct re_pattern_buffer *bufp) | |||
279 | } | 266 | } |
280 | weak_alias (__re_compile_fastmap, re_compile_fastmap) | 267 | weak_alias (__re_compile_fastmap, re_compile_fastmap) |
281 | 268 | ||
282 | static inline void | 269 | static __always_inline void |
283 | __attribute__ ((always_inline)) | ||
284 | re_set_fastmap (char *fastmap, bool icase, int ch) | 270 | re_set_fastmap (char *fastmap, bool icase, int ch) |
285 | { | 271 | { |
286 | fastmap[ch] = 1; | 272 | fastmap[ch] = 1; |
@@ -306,7 +292,6 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, | |||
306 | if (type == CHARACTER) | 292 | if (type == CHARACTER) |
307 | { | 293 | { |
308 | re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); | 294 | re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); |
309 | #ifdef RE_ENABLE_I18N | ||
310 | if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) | 295 | if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) |
311 | { | 296 | { |
312 | unsigned char buf[MB_LEN_MAX]; | 297 | unsigned char buf[MB_LEN_MAX]; |
@@ -327,7 +312,6 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, | |||
327 | != (size_t) -1)) | 312 | != (size_t) -1)) |
328 | re_set_fastmap (fastmap, false, buf[0]); | 313 | re_set_fastmap (fastmap, false, buf[0]); |
329 | } | 314 | } |
330 | #endif | ||
331 | } | 315 | } |
332 | else if (type == SIMPLE_BRACKET) | 316 | else if (type == SIMPLE_BRACKET) |
333 | { | 317 | { |
@@ -341,13 +325,12 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, | |||
341 | re_set_fastmap (fastmap, icase, ch); | 325 | re_set_fastmap (fastmap, icase, ch); |
342 | } | 326 | } |
343 | } | 327 | } |
344 | #ifdef RE_ENABLE_I18N | ||
345 | else if (type == COMPLEX_BRACKET) | 328 | else if (type == COMPLEX_BRACKET) |
346 | { | 329 | { |
347 | re_charset_t *cset = dfa->nodes[node].opr.mbcset; | 330 | re_charset_t *cset = dfa->nodes[node].opr.mbcset; |
348 | Idx i; | 331 | Idx i; |
349 | 332 | ||
350 | # ifdef _LIBC | 333 | #ifdef _LIBC |
351 | /* See if we have to try all bytes which start multiple collation | 334 | /* See if we have to try all bytes which start multiple collation |
352 | elements. | 335 | elements. |
353 | e.g. In da_DK, we want to catch 'a' since "aa" is a valid | 336 | e.g. In da_DK, we want to catch 'a' since "aa" is a valid |
@@ -363,7 +346,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, | |||
363 | if (table[i] < 0) | 346 | if (table[i] < 0) |
364 | re_set_fastmap (fastmap, icase, i); | 347 | re_set_fastmap (fastmap, icase, i); |
365 | } | 348 | } |
366 | # endif /* _LIBC */ | 349 | #endif /* _LIBC */ |
367 | 350 | ||
368 | /* See if we have to start the match at all multibyte characters, | 351 | /* See if we have to start the match at all multibyte characters, |
369 | i.e. where we would not find an invalid sequence. This only | 352 | i.e. where we would not find an invalid sequence. This only |
@@ -371,9 +354,9 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, | |||
371 | sets, the SIMPLE_BRACKET again suffices. */ | 354 | sets, the SIMPLE_BRACKET again suffices. */ |
372 | if (dfa->mb_cur_max > 1 | 355 | if (dfa->mb_cur_max > 1 |
373 | && (cset->nchar_classes || cset->non_match || cset->nranges | 356 | && (cset->nchar_classes || cset->non_match || cset->nranges |
374 | # ifdef _LIBC | 357 | #ifdef _LIBC |
375 | || cset->nequiv_classes | 358 | || cset->nequiv_classes |
376 | # endif /* _LIBC */ | 359 | #endif /* _LIBC */ |
377 | )) | 360 | )) |
378 | { | 361 | { |
379 | unsigned char c = 0; | 362 | unsigned char c = 0; |
@@ -406,12 +389,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, | |||
406 | } | 389 | } |
407 | } | 390 | } |
408 | } | 391 | } |
409 | #endif /* RE_ENABLE_I18N */ | 392 | else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE) |
410 | else if (type == OP_PERIOD | ||
411 | #ifdef RE_ENABLE_I18N | ||
412 | || type == OP_UTF8_PERIOD | ||
413 | #endif /* RE_ENABLE_I18N */ | ||
414 | || type == END_OF_RE) | ||
415 | { | 393 | { |
416 | memset (fastmap, '\1', sizeof (char) * SBC_MAX); | 394 | memset (fastmap, '\1', sizeof (char) * SBC_MAX); |
417 | if (type == END_OF_RE) | 395 | if (type == END_OF_RE) |
@@ -550,7 +528,6 @@ regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf, | |||
550 | weak_alias (__regerror, regerror) | 528 | weak_alias (__regerror, regerror) |
551 | 529 | ||
552 | 530 | ||
553 | #ifdef RE_ENABLE_I18N | ||
554 | /* This static array is used for the map to single-byte characters when | 531 | /* This static array is used for the map to single-byte characters when |
555 | UTF-8 is used. Otherwise we would allocate memory just to initialize | 532 | UTF-8 is used. Otherwise we would allocate memory just to initialize |
556 | it the same all the time. UTF-8 is the preferred encoding so this is | 533 | it the same all the time. UTF-8 is the preferred encoding so this is |
@@ -558,25 +535,24 @@ weak_alias (__regerror, regerror) | |||
558 | static const bitset_t utf8_sb_map = | 535 | static const bitset_t utf8_sb_map = |
559 | { | 536 | { |
560 | /* Set the first 128 bits. */ | 537 | /* Set the first 128 bits. */ |
561 | # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__ | 538 | #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__ |
562 | [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX | 539 | [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX |
563 | # else | 540 | #else |
564 | # if 4 * BITSET_WORD_BITS < ASCII_CHARS | 541 | # if 4 * BITSET_WORD_BITS < ASCII_CHARS |
565 | # error "bitset_word_t is narrower than 32 bits" | 542 | # error "bitset_word_t is narrower than 32 bits" |
566 | # elif 3 * BITSET_WORD_BITS < ASCII_CHARS | 543 | # elif 3 * BITSET_WORD_BITS < ASCII_CHARS |
567 | BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, | 544 | BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, |
568 | # elif 2 * BITSET_WORD_BITS < ASCII_CHARS | 545 | # elif 2 * BITSET_WORD_BITS < ASCII_CHARS |
569 | BITSET_WORD_MAX, BITSET_WORD_MAX, | 546 | BITSET_WORD_MAX, BITSET_WORD_MAX, |
570 | # elif 1 * BITSET_WORD_BITS < ASCII_CHARS | 547 | # elif 1 * BITSET_WORD_BITS < ASCII_CHARS |
571 | BITSET_WORD_MAX, | 548 | BITSET_WORD_MAX, |
572 | # endif | 549 | # endif |
573 | (BITSET_WORD_MAX | 550 | (BITSET_WORD_MAX |
574 | >> (SBC_MAX % BITSET_WORD_BITS == 0 | 551 | >> (SBC_MAX % BITSET_WORD_BITS == 0 |
575 | ? 0 | 552 | ? 0 |
576 | : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) | 553 | : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) |
577 | # endif | ||
578 | }; | ||
579 | #endif | 554 | #endif |
555 | }; | ||
580 | 556 | ||
581 | 557 | ||
582 | static void | 558 | static void |
@@ -614,10 +590,8 @@ free_dfa_content (re_dfa_t *dfa) | |||
614 | re_free (entry->array); | 590 | re_free (entry->array); |
615 | } | 591 | } |
616 | re_free (dfa->state_table); | 592 | re_free (dfa->state_table); |
617 | #ifdef RE_ENABLE_I18N | ||
618 | if (dfa->sb_char != utf8_sb_map) | 593 | if (dfa->sb_char != utf8_sb_map) |
619 | re_free (dfa->sb_char); | 594 | re_free (dfa->sb_char); |
620 | #endif | ||
621 | re_free (dfa->subexp_map); | 595 | re_free (dfa->subexp_map); |
622 | #ifdef DEBUG | 596 | #ifdef DEBUG |
623 | re_free (dfa->re_str); | 597 | re_free (dfa->re_str); |
@@ -796,11 +770,9 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, | |||
796 | if (__glibc_unlikely (err != REG_NOERROR)) | 770 | if (__glibc_unlikely (err != REG_NOERROR)) |
797 | goto re_compile_internal_free_return; | 771 | goto re_compile_internal_free_return; |
798 | 772 | ||
799 | #ifdef RE_ENABLE_I18N | ||
800 | /* If possible, do searching in single byte encoding to speed things up. */ | 773 | /* If possible, do searching in single byte encoding to speed things up. */ |
801 | if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) | 774 | if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) |
802 | optimize_utf8 (dfa); | 775 | optimize_utf8 (dfa); |
803 | #endif | ||
804 | 776 | ||
805 | /* Then create the initial state of the dfa. */ | 777 | /* Then create the initial state of the dfa. */ |
806 | err = create_initial_state (dfa); | 778 | err = create_initial_state (dfa); |
@@ -830,11 +802,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) | |||
830 | #ifndef _LIBC | 802 | #ifndef _LIBC |
831 | const char *codeset_name; | 803 | const char *codeset_name; |
832 | #endif | 804 | #endif |
833 | #ifdef RE_ENABLE_I18N | ||
834 | size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); | 805 | size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); |
835 | #else | ||
836 | size_t max_i18n_object_size = 0; | ||
837 | #endif | ||
838 | size_t max_object_size = | 806 | size_t max_object_size = |
839 | MAX (sizeof (struct re_state_table_entry), | 807 | MAX (sizeof (struct re_state_table_entry), |
840 | MAX (sizeof (re_token_t), | 808 | MAX (sizeof (re_token_t), |
@@ -886,7 +854,6 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) | |||
886 | dfa->map_notascii = 0; | 854 | dfa->map_notascii = 0; |
887 | #endif | 855 | #endif |
888 | 856 | ||
889 | #ifdef RE_ENABLE_I18N | ||
890 | if (dfa->mb_cur_max > 1) | 857 | if (dfa->mb_cur_max > 1) |
891 | { | 858 | { |
892 | if (dfa->is_utf8) | 859 | if (dfa->is_utf8) |
@@ -906,14 +873,13 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) | |||
906 | wint_t wch = __btowc (ch); | 873 | wint_t wch = __btowc (ch); |
907 | if (wch != WEOF) | 874 | if (wch != WEOF) |
908 | dfa->sb_char[i] |= (bitset_word_t) 1 << j; | 875 | dfa->sb_char[i] |= (bitset_word_t) 1 << j; |
909 | # ifndef _LIBC | 876 | #ifndef _LIBC |
910 | if (isascii (ch) && wch != ch) | 877 | if (isascii (ch) && wch != ch) |
911 | dfa->map_notascii = 1; | 878 | dfa->map_notascii = 1; |
912 | # endif | 879 | #endif |
913 | } | 880 | } |
914 | } | 881 | } |
915 | } | 882 | } |
916 | #endif | ||
917 | 883 | ||
918 | if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL)) | 884 | if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL)) |
919 | return REG_ESPACE; | 885 | return REG_ESPACE; |
@@ -933,8 +899,6 @@ init_word_char (re_dfa_t *dfa) | |||
933 | dfa->word_ops_used = 1; | 899 | dfa->word_ops_used = 1; |
934 | if (__glibc_likely (dfa->map_notascii == 0)) | 900 | if (__glibc_likely (dfa->map_notascii == 0)) |
935 | { | 901 | { |
936 | /* Avoid uint32_t and uint64_t as some non-GCC platforms lack | ||
937 | them, an issue when this code is used in Gnulib. */ | ||
938 | bitset_word_t bits0 = 0x00000000; | 902 | bitset_word_t bits0 = 0x00000000; |
939 | bitset_word_t bits1 = 0x03ff0000; | 903 | bitset_word_t bits1 = 0x03ff0000; |
940 | bitset_word_t bits2 = 0x87fffffe; | 904 | bitset_word_t bits2 = 0x87fffffe; |
@@ -1074,7 +1038,6 @@ create_initial_state (re_dfa_t *dfa) | |||
1074 | return REG_NOERROR; | 1038 | return REG_NOERROR; |
1075 | } | 1039 | } |
1076 | 1040 | ||
1077 | #ifdef RE_ENABLE_I18N | ||
1078 | /* If it is possible to do searching in single byte encoding instead of UTF-8 | 1041 | /* If it is possible to do searching in single byte encoding instead of UTF-8 |
1079 | to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change | 1042 | to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change |
1080 | DFA nodes where needed. */ | 1043 | DFA nodes where needed. */ |
@@ -1154,7 +1117,6 @@ optimize_utf8 (re_dfa_t *dfa) | |||
1154 | dfa->is_utf8 = 0; | 1117 | dfa->is_utf8 = 0; |
1155 | dfa->has_mb_node = dfa->nbackref > 0 || has_period; | 1118 | dfa->has_mb_node = dfa->nbackref > 0 || has_period; |
1156 | } | 1119 | } |
1157 | #endif | ||
1158 | 1120 | ||
1159 | /* Analyze the structure tree, and calculate "first", "next", "edest", | 1121 | /* Analyze the structure tree, and calculate "first", "next", "edest", |
1160 | "eclosure", and "inveclosure". */ | 1122 | "eclosure", and "inveclosure". */ |
@@ -1792,7 +1754,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) | |||
1792 | token->opr.c = c; | 1754 | token->opr.c = c; |
1793 | 1755 | ||
1794 | token->word_char = 0; | 1756 | token->word_char = 0; |
1795 | #ifdef RE_ENABLE_I18N | ||
1796 | token->mb_partial = 0; | 1757 | token->mb_partial = 0; |
1797 | if (input->mb_cur_max > 1 | 1758 | if (input->mb_cur_max > 1 |
1798 | && !re_string_first_byte (input, re_string_cur_idx (input))) | 1759 | && !re_string_first_byte (input, re_string_cur_idx (input))) |
@@ -1801,7 +1762,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) | |||
1801 | token->mb_partial = 1; | 1762 | token->mb_partial = 1; |
1802 | return 1; | 1763 | return 1; |
1803 | } | 1764 | } |
1804 | #endif | ||
1805 | if (c == '\\') | 1765 | if (c == '\\') |
1806 | { | 1766 | { |
1807 | unsigned char c2; | 1767 | unsigned char c2; |
@@ -1814,7 +1774,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) | |||
1814 | c2 = re_string_peek_byte_case (input, 1); | 1774 | c2 = re_string_peek_byte_case (input, 1); |
1815 | token->opr.c = c2; | 1775 | token->opr.c = c2; |
1816 | token->type = CHARACTER; | 1776 | token->type = CHARACTER; |
1817 | #ifdef RE_ENABLE_I18N | ||
1818 | if (input->mb_cur_max > 1) | 1777 | if (input->mb_cur_max > 1) |
1819 | { | 1778 | { |
1820 | wint_t wc = re_string_wchar_at (input, | 1779 | wint_t wc = re_string_wchar_at (input, |
@@ -1822,7 +1781,6 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) | |||
1822 | token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; | 1781 | token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; |
1823 | } | 1782 | } |
1824 | else | 1783 | else |
1825 | #endif | ||
1826 | token->word_char = IS_WORD_CHAR (c2) != 0; | 1784 | token->word_char = IS_WORD_CHAR (c2) != 0; |
1827 | 1785 | ||
1828 | switch (c2) | 1786 | switch (c2) |
@@ -1928,14 +1886,12 @@ peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) | |||
1928 | } | 1886 | } |
1929 | 1887 | ||
1930 | token->type = CHARACTER; | 1888 | token->type = CHARACTER; |
1931 | #ifdef RE_ENABLE_I18N | ||
1932 | if (input->mb_cur_max > 1) | 1889 | if (input->mb_cur_max > 1) |
1933 | { | 1890 | { |
1934 | wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); | 1891 | wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); |
1935 | token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; | 1892 | token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; |
1936 | } | 1893 | } |
1937 | else | 1894 | else |
1938 | #endif | ||
1939 | token->word_char = IS_WORD_CHAR (token->opr.c); | 1895 | token->word_char = IS_WORD_CHAR (token->opr.c); |
1940 | 1896 | ||
1941 | switch (c) | 1897 | switch (c) |
@@ -2027,14 +1983,12 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) | |||
2027 | c = re_string_peek_byte (input, 0); | 1983 | c = re_string_peek_byte (input, 0); |
2028 | token->opr.c = c; | 1984 | token->opr.c = c; |
2029 | 1985 | ||
2030 | #ifdef RE_ENABLE_I18N | ||
2031 | if (input->mb_cur_max > 1 | 1986 | if (input->mb_cur_max > 1 |
2032 | && !re_string_first_byte (input, re_string_cur_idx (input))) | 1987 | && !re_string_first_byte (input, re_string_cur_idx (input))) |
2033 | { | 1988 | { |
2034 | token->type = CHARACTER; | 1989 | token->type = CHARACTER; |
2035 | return 1; | 1990 | return 1; |
2036 | } | 1991 | } |
2037 | #endif /* RE_ENABLE_I18N */ | ||
2038 | 1992 | ||
2039 | if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) | 1993 | if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) |
2040 | && re_string_cur_idx (input) + 1 < re_string_length (input)) | 1994 | && re_string_cur_idx (input) + 1 < re_string_length (input)) |
@@ -2084,15 +2038,25 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) | |||
2084 | } | 2038 | } |
2085 | switch (c) | 2039 | switch (c) |
2086 | { | 2040 | { |
2087 | case '-': | ||
2088 | token->type = OP_CHARSET_RANGE; | ||
2089 | break; | ||
2090 | case ']': | 2041 | case ']': |
2091 | token->type = OP_CLOSE_BRACKET; | 2042 | token->type = OP_CLOSE_BRACKET; |
2092 | break; | 2043 | break; |
2093 | case '^': | 2044 | case '^': |
2094 | token->type = OP_NON_MATCH_LIST; | 2045 | token->type = OP_NON_MATCH_LIST; |
2095 | break; | 2046 | break; |
2047 | case '-': | ||
2048 | /* In V7 Unix grep and Unix awk and mawk, [...---...] | ||
2049 | (3 adjacent minus signs) stands for a single minus sign. | ||
2050 | Support that without breaking anything else. */ | ||
2051 | if (! (re_string_cur_idx (input) + 2 < re_string_length (input) | ||
2052 | && re_string_peek_byte (input, 1) == '-' | ||
2053 | && re_string_peek_byte (input, 2) == '-')) | ||
2054 | { | ||
2055 | token->type = OP_CHARSET_RANGE; | ||
2056 | break; | ||
2057 | } | ||
2058 | re_string_skip_bytes (input, 2); | ||
2059 | FALLTHROUGH; | ||
2096 | default: | 2060 | default: |
2097 | token->type = CHARACTER; | 2061 | token->type = CHARACTER; |
2098 | } | 2062 | } |
@@ -2256,7 +2220,6 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, | |||
2256 | *err = REG_ESPACE; | 2220 | *err = REG_ESPACE; |
2257 | return NULL; | 2221 | return NULL; |
2258 | } | 2222 | } |
2259 | #ifdef RE_ENABLE_I18N | ||
2260 | if (dfa->mb_cur_max > 1) | 2223 | if (dfa->mb_cur_max > 1) |
2261 | { | 2224 | { |
2262 | while (!re_string_eoi (regexp) | 2225 | while (!re_string_eoi (regexp) |
@@ -2273,7 +2236,6 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, | |||
2273 | } | 2236 | } |
2274 | } | 2237 | } |
2275 | } | 2238 | } |
2276 | #endif | ||
2277 | break; | 2239 | break; |
2278 | 2240 | ||
2279 | case OP_OPEN_SUBEXP: | 2241 | case OP_OPEN_SUBEXP: |
@@ -2666,40 +2628,30 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, | |||
2666 | 2628 | ||
2667 | #ifndef _LIBC | 2629 | #ifndef _LIBC |
2668 | 2630 | ||
2669 | # ifdef RE_ENABLE_I18N | ||
2670 | /* Convert the byte B to the corresponding wide character. In a | 2631 | /* Convert the byte B to the corresponding wide character. In a |
2671 | unibyte locale, treat B as itself. In a multibyte locale, return | 2632 | unibyte locale, treat B as itself. In a multibyte locale, return |
2672 | WEOF if B is an encoding error. */ | 2633 | WEOF if B is an encoding error. */ |
2673 | static wint_t | 2634 | static wint_t |
2674 | parse_byte (unsigned char b, re_charset_t *mbcset) | 2635 | parse_byte (unsigned char b, re_dfa_t const *dfa) |
2675 | { | 2636 | { |
2676 | return mbcset == NULL ? b : __btowc (b); | 2637 | return dfa->mb_cur_max > 1 ? __btowc (b) : b; |
2677 | } | 2638 | } |
2678 | # endif | ||
2679 | 2639 | ||
2680 | /* Local function for parse_bracket_exp only used in case of NOT _LIBC. | 2640 | /* Local function for parse_bracket_exp used in _LIBC environment. |
2681 | Build the range expression which starts from START_ELEM, and ends | 2641 | Build the range expression which starts from START_ELEM, and ends |
2682 | at END_ELEM. The result are written to MBCSET and SBCSET. | 2642 | at END_ELEM. The result are written to MBCSET and SBCSET. |
2683 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and | 2643 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and |
2684 | mbcset->range_ends, is a pointer argument since we may | 2644 | mbcset->range_ends, is a pointer argument since we may |
2685 | update it. */ | 2645 | update it. */ |
2686 | 2646 | ||
2687 | static reg_errcode_t | 2647 | static reg_errcode_t |
2688 | # ifdef RE_ENABLE_I18N | 2648 | build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, |
2689 | build_range_exp (const reg_syntax_t syntax, | 2649 | bracket_elem_t *start_elem, bracket_elem_t *end_elem, |
2690 | bitset_t sbcset, | 2650 | re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules, |
2691 | re_charset_t *mbcset, | 2651 | const unsigned char *collseqmb, const char *collseqwc, |
2692 | Idx *range_alloc, | 2652 | int_fast32_t table_size, const void *symb_table, |
2693 | const bracket_elem_t *start_elem, | 2653 | const unsigned char *extra) |
2694 | const bracket_elem_t *end_elem) | ||
2695 | # else /* not RE_ENABLE_I18N */ | ||
2696 | build_range_exp (const reg_syntax_t syntax, | ||
2697 | bitset_t sbcset, | ||
2698 | const bracket_elem_t *start_elem, | ||
2699 | const bracket_elem_t *end_elem) | ||
2700 | # endif /* not RE_ENABLE_I18N */ | ||
2701 | { | 2654 | { |
2702 | unsigned int start_ch, end_ch; | ||
2703 | /* Equivalence Classes and Character Classes can't be a range start/end. */ | 2655 | /* Equivalence Classes and Character Classes can't be a range start/end. */ |
2704 | if (__glibc_unlikely (start_elem->type == EQUIV_CLASS | 2656 | if (__glibc_unlikely (start_elem->type == EQUIV_CLASS |
2705 | || start_elem->type == CHAR_CLASS | 2657 | || start_elem->type == CHAR_CLASS |
@@ -2715,110 +2667,88 @@ build_range_exp (const reg_syntax_t syntax, | |||
2715 | && strlen ((char *) end_elem->opr.name) > 1))) | 2667 | && strlen ((char *) end_elem->opr.name) > 1))) |
2716 | return REG_ECOLLATE; | 2668 | return REG_ECOLLATE; |
2717 | 2669 | ||
2718 | # ifdef RE_ENABLE_I18N | 2670 | unsigned int |
2719 | { | ||
2720 | wchar_t wc; | ||
2721 | wint_t start_wc; | ||
2722 | wint_t end_wc; | ||
2723 | |||
2724 | start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch | 2671 | start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch |
2725 | : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] | 2672 | : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] |
2726 | : 0)); | 2673 | : 0)), |
2727 | end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch | 2674 | end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch |
2728 | : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] | 2675 | : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] |
2729 | : 0)); | 2676 | : 0)); |
2677 | wint_t | ||
2730 | start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) | 2678 | start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) |
2731 | ? parse_byte (start_ch, mbcset) : start_elem->opr.wch); | 2679 | ? parse_byte (start_ch, dfa) : start_elem->opr.wch), |
2732 | end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) | 2680 | end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) |
2733 | ? parse_byte (end_ch, mbcset) : end_elem->opr.wch); | 2681 | ? parse_byte (end_ch, dfa) : end_elem->opr.wch); |
2734 | if (start_wc == WEOF || end_wc == WEOF) | ||
2735 | return REG_ECOLLATE; | ||
2736 | else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) | ||
2737 | && start_wc > end_wc)) | ||
2738 | return REG_ERANGE; | ||
2739 | |||
2740 | /* Got valid collation sequence values, add them as a new entry. | ||
2741 | However, for !_LIBC we have no collation elements: if the | ||
2742 | character set is single byte, the single byte character set | ||
2743 | that we build below suffices. parse_bracket_exp passes | ||
2744 | no MBCSET if dfa->mb_cur_max == 1. */ | ||
2745 | if (mbcset) | ||
2746 | { | ||
2747 | /* Check the space of the arrays. */ | ||
2748 | if (__glibc_unlikely (*range_alloc == mbcset->nranges)) | ||
2749 | { | ||
2750 | /* There is not enough space, need realloc. */ | ||
2751 | wchar_t *new_array_start, *new_array_end; | ||
2752 | Idx new_nranges; | ||
2753 | |||
2754 | /* +1 in case of mbcset->nranges is 0. */ | ||
2755 | new_nranges = 2 * mbcset->nranges + 1; | ||
2756 | /* Use realloc since mbcset->range_starts and mbcset->range_ends | ||
2757 | are NULL if *range_alloc == 0. */ | ||
2758 | new_array_start = re_realloc (mbcset->range_starts, wchar_t, | ||
2759 | new_nranges); | ||
2760 | new_array_end = re_realloc (mbcset->range_ends, wchar_t, | ||
2761 | new_nranges); | ||
2762 | 2682 | ||
2763 | if (__glibc_unlikely (new_array_start == NULL | 2683 | if (start_wc == WEOF || end_wc == WEOF) |
2764 | || new_array_end == NULL)) | 2684 | return REG_ECOLLATE; |
2765 | { | 2685 | else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) |
2766 | re_free (new_array_start); | 2686 | && start_wc > end_wc)) |
2767 | re_free (new_array_end); | 2687 | return REG_ERANGE; |
2768 | return REG_ESPACE; | ||
2769 | } | ||
2770 | 2688 | ||
2771 | mbcset->range_starts = new_array_start; | 2689 | /* Got valid collation sequence values, add them as a new entry. |
2772 | mbcset->range_ends = new_array_end; | 2690 | However, for !_LIBC we have no collation elements: if the |
2773 | *range_alloc = new_nranges; | 2691 | character set is single byte, the single byte character set |
2774 | } | 2692 | that we build below suffices. parse_bracket_exp passes |
2693 | no MBCSET if dfa->mb_cur_max == 1. */ | ||
2694 | if (dfa->mb_cur_max > 1) | ||
2695 | { | ||
2696 | /* Check the space of the arrays. */ | ||
2697 | if (__glibc_unlikely (*range_alloc == mbcset->nranges)) | ||
2698 | { | ||
2699 | /* There is not enough space, need realloc. */ | ||
2700 | wchar_t *new_array_start, *new_array_end; | ||
2701 | Idx new_nranges; | ||
2775 | 2702 | ||
2776 | mbcset->range_starts[mbcset->nranges] = start_wc; | 2703 | /* +1 in case of mbcset->nranges is 0. */ |
2777 | mbcset->range_ends[mbcset->nranges++] = end_wc; | 2704 | new_nranges = 2 * mbcset->nranges + 1; |
2778 | } | 2705 | /* Use realloc since mbcset->range_starts and mbcset->range_ends |
2706 | are NULL if *range_alloc == 0. */ | ||
2707 | new_array_start = re_realloc (mbcset->range_starts, wchar_t, | ||
2708 | new_nranges); | ||
2709 | new_array_end = re_realloc (mbcset->range_ends, wchar_t, | ||
2710 | new_nranges); | ||
2711 | |||
2712 | if (__glibc_unlikely (new_array_start == NULL | ||
2713 | || new_array_end == NULL)) | ||
2714 | { | ||
2715 | re_free (new_array_start); | ||
2716 | re_free (new_array_end); | ||
2717 | return REG_ESPACE; | ||
2718 | } | ||
2719 | |||
2720 | mbcset->range_starts = new_array_start; | ||
2721 | mbcset->range_ends = new_array_end; | ||
2722 | *range_alloc = new_nranges; | ||
2723 | } | ||
2724 | |||
2725 | mbcset->range_starts[mbcset->nranges] = start_wc; | ||
2726 | mbcset->range_ends[mbcset->nranges++] = end_wc; | ||
2727 | } | ||
2728 | |||
2729 | /* Build the table for single byte characters. */ | ||
2730 | for (wchar_t wc = 0; wc < SBC_MAX; ++wc) | ||
2731 | { | ||
2732 | if (start_wc <= wc && wc <= end_wc) | ||
2733 | bitset_set (sbcset, wc); | ||
2734 | } | ||
2779 | 2735 | ||
2780 | /* Build the table for single byte characters. */ | ||
2781 | for (wc = 0; wc < SBC_MAX; ++wc) | ||
2782 | { | ||
2783 | if (start_wc <= wc && wc <= end_wc) | ||
2784 | bitset_set (sbcset, wc); | ||
2785 | } | ||
2786 | } | ||
2787 | # else /* not RE_ENABLE_I18N */ | ||
2788 | { | ||
2789 | unsigned int ch; | ||
2790 | start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch | ||
2791 | : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] | ||
2792 | : 0)); | ||
2793 | end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch | ||
2794 | : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] | ||
2795 | : 0)); | ||
2796 | if (start_ch > end_ch) | ||
2797 | return REG_ERANGE; | ||
2798 | /* Build the table for single byte characters. */ | ||
2799 | for (ch = 0; ch < SBC_MAX; ++ch) | ||
2800 | if (start_ch <= ch && ch <= end_ch) | ||
2801 | bitset_set (sbcset, ch); | ||
2802 | } | ||
2803 | # endif /* not RE_ENABLE_I18N */ | ||
2804 | return REG_NOERROR; | 2736 | return REG_NOERROR; |
2805 | } | 2737 | } |
2806 | #endif /* not _LIBC */ | 2738 | #endif /* not _LIBC */ |
2807 | 2739 | ||
2808 | #ifndef _LIBC | 2740 | #ifndef _LIBC |
2809 | /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. | 2741 | /* Helper function for parse_bracket_exp only used in case of NOT _LIBC. |
2810 | Build the collating element which is represented by NAME. | 2742 | Build the collating element which is represented by NAME. |
2811 | The result are written to MBCSET and SBCSET. | 2743 | The result are written to MBCSET and SBCSET. |
2812 | COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a | 2744 | COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a |
2813 | pointer argument since we may update it. */ | 2745 | pointer argument since we may update it. */ |
2814 | 2746 | ||
2815 | static reg_errcode_t | 2747 | static reg_errcode_t |
2816 | # ifdef RE_ENABLE_I18N | ||
2817 | build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, | 2748 | build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, |
2818 | Idx *coll_sym_alloc, const unsigned char *name) | 2749 | Idx *coll_sym_alloc, const unsigned char *name, |
2819 | # else /* not RE_ENABLE_I18N */ | 2750 | uint_fast32_t nrules, int_fast32_t table_size, |
2820 | build_collating_symbol (bitset_t sbcset, const unsigned char *name) | 2751 | const void *symb_table, const unsigned char *extra) |
2821 | # endif /* not RE_ENABLE_I18N */ | ||
2822 | { | 2752 | { |
2823 | size_t name_len = strlen ((const char *) name); | 2753 | size_t name_len = strlen ((const char *) name); |
2824 | if (__glibc_unlikely (name_len != 1)) | 2754 | if (__glibc_unlikely (name_len != 1)) |
@@ -2831,271 +2761,280 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name) | |||
2831 | } | 2761 | } |
2832 | #endif /* not _LIBC */ | 2762 | #endif /* not _LIBC */ |
2833 | 2763 | ||
2834 | /* This function parse bracket expression like "[abc]", "[a-c]", | ||
2835 | "[[.a-a.]]" etc. */ | ||
2836 | |||
2837 | static bin_tree_t * | ||
2838 | parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | ||
2839 | reg_syntax_t syntax, reg_errcode_t *err) | ||
2840 | { | ||
2841 | #ifdef _LIBC | 2764 | #ifdef _LIBC |
2842 | const unsigned char *collseqmb; | 2765 | /* Local function for parse_bracket_exp used in _LIBC environment. |
2843 | const char *collseqwc; | 2766 | Seek the collating symbol entry corresponding to NAME. |
2844 | uint32_t nrules; | 2767 | Return the index of the symbol in the SYMB_TABLE, |
2845 | int32_t table_size; | 2768 | or -1 if not found. */ |
2846 | const int32_t *symb_table; | 2769 | |
2847 | const unsigned char *extra; | 2770 | static __always_inline int32_t |
2848 | 2771 | seek_collating_symbol_entry (const unsigned char *name, size_t name_len, | |
2849 | /* Local function for parse_bracket_exp used in _LIBC environment. | 2772 | const int32_t *symb_table, |
2850 | Seek the collating symbol entry corresponding to NAME. | 2773 | int_fast32_t table_size, |
2851 | Return the index of the symbol in the SYMB_TABLE, | 2774 | const unsigned char *extra) |
2852 | or -1 if not found. */ | 2775 | { |
2853 | 2776 | int_fast32_t elem; | |
2854 | auto inline int32_t | ||
2855 | __attribute__ ((always_inline)) | ||
2856 | seek_collating_symbol_entry (const unsigned char *name, size_t name_len) | ||
2857 | { | ||
2858 | int32_t elem; | ||
2859 | |||
2860 | for (elem = 0; elem < table_size; elem++) | ||
2861 | if (symb_table[2 * elem] != 0) | ||
2862 | { | ||
2863 | int32_t idx = symb_table[2 * elem + 1]; | ||
2864 | /* Skip the name of collating element name. */ | ||
2865 | idx += 1 + extra[idx]; | ||
2866 | if (/* Compare the length of the name. */ | ||
2867 | name_len == extra[idx] | ||
2868 | /* Compare the name. */ | ||
2869 | && memcmp (name, &extra[idx + 1], name_len) == 0) | ||
2870 | /* Yep, this is the entry. */ | ||
2871 | return elem; | ||
2872 | } | ||
2873 | return -1; | ||
2874 | } | ||
2875 | 2777 | ||
2876 | /* Local function for parse_bracket_exp used in _LIBC environment. | 2778 | for (elem = 0; elem < table_size; elem++) |
2877 | Look up the collation sequence value of BR_ELEM. | 2779 | if (symb_table[2 * elem] != 0) |
2878 | Return the value if succeeded, UINT_MAX otherwise. */ | 2780 | { |
2781 | int32_t idx = symb_table[2 * elem + 1]; | ||
2782 | /* Skip the name of collating element name. */ | ||
2783 | idx += 1 + extra[idx]; | ||
2784 | if (/* Compare the length of the name. */ | ||
2785 | name_len == extra[idx] | ||
2786 | /* Compare the name. */ | ||
2787 | && memcmp (name, &extra[idx + 1], name_len) == 0) | ||
2788 | /* Yep, this is the entry. */ | ||
2789 | return elem; | ||
2790 | } | ||
2791 | return -1; | ||
2792 | } | ||
2879 | 2793 | ||
2880 | auto inline unsigned int | 2794 | /* Local function for parse_bracket_exp used in _LIBC environment. |
2881 | __attribute__ ((always_inline)) | 2795 | Look up the collation sequence value of BR_ELEM. |
2882 | lookup_collation_sequence_value (bracket_elem_t *br_elem) | 2796 | Return the value if succeeded, UINT_MAX otherwise. */ |
2797 | |||
2798 | static __always_inline unsigned int | ||
2799 | lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules, | ||
2800 | const unsigned char *collseqmb, | ||
2801 | const char *collseqwc, | ||
2802 | int_fast32_t table_size, | ||
2803 | const int32_t *symb_table, | ||
2804 | const unsigned char *extra) | ||
2805 | { | ||
2806 | if (br_elem->type == SB_CHAR) | ||
2883 | { | 2807 | { |
2884 | if (br_elem->type == SB_CHAR) | 2808 | /* if (MB_CUR_MAX == 1) */ |
2885 | { | 2809 | if (nrules == 0) |
2886 | /* | 2810 | return collseqmb[br_elem->opr.ch]; |
2887 | if (MB_CUR_MAX == 1) | 2811 | else |
2888 | */ | ||
2889 | if (nrules == 0) | ||
2890 | return collseqmb[br_elem->opr.ch]; | ||
2891 | else | ||
2892 | { | ||
2893 | wint_t wc = __btowc (br_elem->opr.ch); | ||
2894 | return __collseq_table_lookup (collseqwc, wc); | ||
2895 | } | ||
2896 | } | ||
2897 | else if (br_elem->type == MB_CHAR) | ||
2898 | { | 2812 | { |
2899 | if (nrules != 0) | 2813 | wint_t wc = __btowc (br_elem->opr.ch); |
2900 | return __collseq_table_lookup (collseqwc, br_elem->opr.wch); | 2814 | return __collseq_table_lookup (collseqwc, wc); |
2901 | } | 2815 | } |
2902 | else if (br_elem->type == COLL_SYM) | 2816 | } |
2817 | else if (br_elem->type == MB_CHAR) | ||
2818 | { | ||
2819 | if (nrules != 0) | ||
2820 | return __collseq_table_lookup (collseqwc, br_elem->opr.wch); | ||
2821 | } | ||
2822 | else if (br_elem->type == COLL_SYM) | ||
2823 | { | ||
2824 | size_t sym_name_len = strlen ((char *) br_elem->opr.name); | ||
2825 | if (nrules != 0) | ||
2903 | { | 2826 | { |
2904 | size_t sym_name_len = strlen ((char *) br_elem->opr.name); | 2827 | int32_t elem, idx; |
2905 | if (nrules != 0) | 2828 | elem = seek_collating_symbol_entry (br_elem->opr.name, |
2829 | sym_name_len, | ||
2830 | symb_table, table_size, | ||
2831 | extra); | ||
2832 | if (elem != -1) | ||
2906 | { | 2833 | { |
2907 | int32_t elem, idx; | 2834 | /* We found the entry. */ |
2908 | elem = seek_collating_symbol_entry (br_elem->opr.name, | 2835 | idx = symb_table[2 * elem + 1]; |
2909 | sym_name_len); | 2836 | /* Skip the name of collating element name. */ |
2910 | if (elem != -1) | 2837 | idx += 1 + extra[idx]; |
2911 | { | 2838 | /* Skip the byte sequence of the collating element. */ |
2912 | /* We found the entry. */ | 2839 | idx += 1 + extra[idx]; |
2913 | idx = symb_table[2 * elem + 1]; | 2840 | /* Adjust for the alignment. */ |
2914 | /* Skip the name of collating element name. */ | 2841 | idx = (idx + 3) & ~3; |
2915 | idx += 1 + extra[idx]; | 2842 | /* Skip the multibyte collation sequence value. */ |
2916 | /* Skip the byte sequence of the collating element. */ | 2843 | idx += sizeof (unsigned int); |
2917 | idx += 1 + extra[idx]; | 2844 | /* Skip the wide char sequence of the collating element. */ |
2918 | /* Adjust for the alignment. */ | 2845 | idx += sizeof (unsigned int) * |
2919 | idx = (idx + 3) & ~3; | 2846 | (1 + *(unsigned int *) (extra + idx)); |
2920 | /* Skip the multibyte collation sequence value. */ | 2847 | /* Return the collation sequence value. */ |
2921 | idx += sizeof (unsigned int); | 2848 | return *(unsigned int *) (extra + idx); |
2922 | /* Skip the wide char sequence of the collating element. */ | ||
2923 | idx += sizeof (unsigned int) * | ||
2924 | (1 + *(unsigned int *) (extra + idx)); | ||
2925 | /* Return the collation sequence value. */ | ||
2926 | return *(unsigned int *) (extra + idx); | ||
2927 | } | ||
2928 | else if (sym_name_len == 1) | ||
2929 | { | ||
2930 | /* No valid character. Match it as a single byte | ||
2931 | character. */ | ||
2932 | return collseqmb[br_elem->opr.name[0]]; | ||
2933 | } | ||
2934 | } | 2849 | } |
2935 | else if (sym_name_len == 1) | 2850 | else if (sym_name_len == 1) |
2936 | return collseqmb[br_elem->opr.name[0]]; | 2851 | { |
2852 | /* No valid character. Match it as a single byte | ||
2853 | character. */ | ||
2854 | return collseqmb[br_elem->opr.name[0]]; | ||
2855 | } | ||
2937 | } | 2856 | } |
2938 | return UINT_MAX; | 2857 | else if (sym_name_len == 1) |
2858 | return collseqmb[br_elem->opr.name[0]]; | ||
2939 | } | 2859 | } |
2860 | return UINT_MAX; | ||
2861 | } | ||
2940 | 2862 | ||
2941 | /* Local function for parse_bracket_exp used in _LIBC environment. | 2863 | /* Local function for parse_bracket_exp used in _LIBC environment. |
2942 | Build the range expression which starts from START_ELEM, and ends | 2864 | Build the range expression which starts from START_ELEM, and ends |
2943 | at END_ELEM. The result are written to MBCSET and SBCSET. | 2865 | at END_ELEM. The result are written to MBCSET and SBCSET. |
2944 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and | 2866 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and |
2945 | mbcset->range_ends, is a pointer argument since we may | 2867 | mbcset->range_ends, is a pointer argument since we may |
2946 | update it. */ | 2868 | update it. */ |
2869 | |||
2870 | static __always_inline reg_errcode_t | ||
2871 | build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, | ||
2872 | bracket_elem_t *start_elem, bracket_elem_t *end_elem, | ||
2873 | re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules, | ||
2874 | const unsigned char *collseqmb, const char *collseqwc, | ||
2875 | int_fast32_t table_size, const int32_t *symb_table, | ||
2876 | const unsigned char *extra) | ||
2877 | { | ||
2878 | unsigned int ch; | ||
2879 | uint32_t start_collseq; | ||
2880 | uint32_t end_collseq; | ||
2947 | 2881 | ||
2948 | auto inline reg_errcode_t | 2882 | /* Equivalence Classes and Character Classes can't be a range |
2949 | __attribute__ ((always_inline)) | 2883 | start/end. */ |
2950 | build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, | 2884 | if (__glibc_unlikely (start_elem->type == EQUIV_CLASS |
2951 | bracket_elem_t *start_elem, bracket_elem_t *end_elem) | 2885 | || start_elem->type == CHAR_CLASS |
2952 | { | 2886 | || end_elem->type == EQUIV_CLASS |
2953 | unsigned int ch; | 2887 | || end_elem->type == CHAR_CLASS)) |
2954 | uint32_t start_collseq; | 2888 | return REG_ERANGE; |
2955 | uint32_t end_collseq; | ||
2956 | |||
2957 | /* Equivalence Classes and Character Classes can't be a range | ||
2958 | start/end. */ | ||
2959 | if (__glibc_unlikely (start_elem->type == EQUIV_CLASS | ||
2960 | || start_elem->type == CHAR_CLASS | ||
2961 | || end_elem->type == EQUIV_CLASS | ||
2962 | || end_elem->type == CHAR_CLASS)) | ||
2963 | return REG_ERANGE; | ||
2964 | 2889 | ||
2965 | /* FIXME: Implement rational ranges here, too. */ | 2890 | /* FIXME: Implement rational ranges here, too. */ |
2966 | start_collseq = lookup_collation_sequence_value (start_elem); | 2891 | start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc, |
2967 | end_collseq = lookup_collation_sequence_value (end_elem); | 2892 | table_size, symb_table, extra); |
2968 | /* Check start/end collation sequence values. */ | 2893 | end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc, |
2969 | if (__glibc_unlikely (start_collseq == UINT_MAX | 2894 | table_size, symb_table, extra); |
2970 | || end_collseq == UINT_MAX)) | 2895 | /* Check start/end collation sequence values. */ |
2971 | return REG_ECOLLATE; | 2896 | if (__glibc_unlikely (start_collseq == UINT_MAX |
2972 | if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) | 2897 | || end_collseq == UINT_MAX)) |
2973 | && start_collseq > end_collseq)) | 2898 | return REG_ECOLLATE; |
2974 | return REG_ERANGE; | 2899 | if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES) |
2900 | && start_collseq > end_collseq)) | ||
2901 | return REG_ERANGE; | ||
2975 | 2902 | ||
2976 | /* Got valid collation sequence values, add them as a new entry. | 2903 | /* Got valid collation sequence values, add them as a new entry. |
2977 | However, if we have no collation elements, and the character set | 2904 | However, if we have no collation elements, and the character set |
2978 | is single byte, the single byte character set that we | 2905 | is single byte, the single byte character set that we |
2979 | build below suffices. */ | 2906 | build below suffices. */ |
2980 | if (nrules > 0 || dfa->mb_cur_max > 1) | 2907 | if (nrules > 0 || dfa->mb_cur_max > 1) |
2908 | { | ||
2909 | /* Check the space of the arrays. */ | ||
2910 | if (__glibc_unlikely (*range_alloc == mbcset->nranges)) | ||
2981 | { | 2911 | { |
2982 | /* Check the space of the arrays. */ | 2912 | /* There is not enough space, need realloc. */ |
2983 | if (__glibc_unlikely (*range_alloc == mbcset->nranges)) | 2913 | uint32_t *new_array_start; |
2984 | { | 2914 | uint32_t *new_array_end; |
2985 | /* There is not enough space, need realloc. */ | 2915 | int new_nranges; |
2986 | uint32_t *new_array_start; | ||
2987 | uint32_t *new_array_end; | ||
2988 | Idx new_nranges; | ||
2989 | |||
2990 | /* +1 in case of mbcset->nranges is 0. */ | ||
2991 | new_nranges = 2 * mbcset->nranges + 1; | ||
2992 | new_array_start = re_realloc (mbcset->range_starts, uint32_t, | ||
2993 | new_nranges); | ||
2994 | new_array_end = re_realloc (mbcset->range_ends, uint32_t, | ||
2995 | new_nranges); | ||
2996 | |||
2997 | if (__glibc_unlikely (new_array_start == NULL | ||
2998 | || new_array_end == NULL)) | ||
2999 | return REG_ESPACE; | ||
3000 | 2916 | ||
3001 | mbcset->range_starts = new_array_start; | 2917 | /* +1 in case of mbcset->nranges is 0. */ |
3002 | mbcset->range_ends = new_array_end; | 2918 | new_nranges = 2 * mbcset->nranges + 1; |
3003 | *range_alloc = new_nranges; | 2919 | new_array_start = re_realloc (mbcset->range_starts, uint32_t, |
3004 | } | 2920 | new_nranges); |
2921 | new_array_end = re_realloc (mbcset->range_ends, uint32_t, | ||
2922 | new_nranges); | ||
3005 | 2923 | ||
3006 | mbcset->range_starts[mbcset->nranges] = start_collseq; | 2924 | if (__glibc_unlikely (new_array_start == NULL |
3007 | mbcset->range_ends[mbcset->nranges++] = end_collseq; | 2925 | || new_array_end == NULL)) |
3008 | } | 2926 | return REG_ESPACE; |
3009 | 2927 | ||
3010 | /* Build the table for single byte characters. */ | 2928 | mbcset->range_starts = new_array_start; |
3011 | for (ch = 0; ch < SBC_MAX; ch++) | 2929 | mbcset->range_ends = new_array_end; |
3012 | { | 2930 | *range_alloc = new_nranges; |
3013 | uint32_t ch_collseq; | ||
3014 | /* | ||
3015 | if (MB_CUR_MAX == 1) | ||
3016 | */ | ||
3017 | if (nrules == 0) | ||
3018 | ch_collseq = collseqmb[ch]; | ||
3019 | else | ||
3020 | ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); | ||
3021 | if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) | ||
3022 | bitset_set (sbcset, ch); | ||
3023 | } | 2931 | } |
3024 | return REG_NOERROR; | 2932 | |
2933 | mbcset->range_starts[mbcset->nranges] = start_collseq; | ||
2934 | mbcset->range_ends[mbcset->nranges++] = end_collseq; | ||
3025 | } | 2935 | } |
3026 | 2936 | ||
3027 | /* Local function for parse_bracket_exp used in _LIBC environment. | 2937 | /* Build the table for single byte characters. */ |
3028 | Build the collating element which is represented by NAME. | 2938 | for (ch = 0; ch < SBC_MAX; ch++) |
3029 | The result are written to MBCSET and SBCSET. | 2939 | { |
3030 | COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a | 2940 | uint32_t ch_collseq; |
3031 | pointer argument since we may update it. */ | 2941 | /* if (MB_CUR_MAX == 1) */ |
2942 | if (nrules == 0) | ||
2943 | ch_collseq = collseqmb[ch]; | ||
2944 | else | ||
2945 | ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); | ||
2946 | if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) | ||
2947 | bitset_set (sbcset, ch); | ||
2948 | } | ||
2949 | return REG_NOERROR; | ||
2950 | } | ||
3032 | 2951 | ||
3033 | auto inline reg_errcode_t | 2952 | /* Local function for parse_bracket_exp used in _LIBC environment. |
3034 | __attribute__ ((always_inline)) | 2953 | Build the collating element which is represented by NAME. |
3035 | build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, | 2954 | The result are written to MBCSET and SBCSET. |
3036 | Idx *coll_sym_alloc, const unsigned char *name) | 2955 | COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a |
2956 | pointer argument since we may update it. */ | ||
2957 | |||
2958 | static __always_inline reg_errcode_t | ||
2959 | build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, | ||
2960 | Idx *coll_sym_alloc, const unsigned char *name, | ||
2961 | uint_fast32_t nrules, int_fast32_t table_size, | ||
2962 | const int32_t *symb_table, const unsigned char *extra) | ||
2963 | { | ||
2964 | int32_t elem, idx; | ||
2965 | size_t name_len = strlen ((const char *) name); | ||
2966 | if (nrules != 0) | ||
3037 | { | 2967 | { |
3038 | int32_t elem, idx; | 2968 | elem = seek_collating_symbol_entry (name, name_len, symb_table, |
3039 | size_t name_len = strlen ((const char *) name); | 2969 | table_size, extra); |
3040 | if (nrules != 0) | 2970 | if (elem != -1) |
3041 | { | 2971 | { |
3042 | elem = seek_collating_symbol_entry (name, name_len); | 2972 | /* We found the entry. */ |
3043 | if (elem != -1) | 2973 | idx = symb_table[2 * elem + 1]; |
3044 | { | 2974 | /* Skip the name of collating element name. */ |
3045 | /* We found the entry. */ | 2975 | idx += 1 + extra[idx]; |
3046 | idx = symb_table[2 * elem + 1]; | 2976 | } |
3047 | /* Skip the name of collating element name. */ | 2977 | else if (name_len == 1) |
3048 | idx += 1 + extra[idx]; | 2978 | { |
3049 | } | 2979 | /* No valid character, treat it as a normal |
3050 | else if (name_len == 1) | 2980 | character. */ |
3051 | { | 2981 | bitset_set (sbcset, name[0]); |
3052 | /* No valid character, treat it as a normal | ||
3053 | character. */ | ||
3054 | bitset_set (sbcset, name[0]); | ||
3055 | return REG_NOERROR; | ||
3056 | } | ||
3057 | else | ||
3058 | return REG_ECOLLATE; | ||
3059 | |||
3060 | /* Got valid collation sequence, add it as a new entry. */ | ||
3061 | /* Check the space of the arrays. */ | ||
3062 | if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) | ||
3063 | { | ||
3064 | /* Not enough, realloc it. */ | ||
3065 | /* +1 in case of mbcset->ncoll_syms is 0. */ | ||
3066 | Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; | ||
3067 | /* Use realloc since mbcset->coll_syms is NULL | ||
3068 | if *alloc == 0. */ | ||
3069 | int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, | ||
3070 | new_coll_sym_alloc); | ||
3071 | if (__glibc_unlikely (new_coll_syms == NULL)) | ||
3072 | return REG_ESPACE; | ||
3073 | mbcset->coll_syms = new_coll_syms; | ||
3074 | *coll_sym_alloc = new_coll_sym_alloc; | ||
3075 | } | ||
3076 | mbcset->coll_syms[mbcset->ncoll_syms++] = idx; | ||
3077 | return REG_NOERROR; | 2982 | return REG_NOERROR; |
3078 | } | 2983 | } |
3079 | else | 2984 | else |
2985 | return REG_ECOLLATE; | ||
2986 | |||
2987 | /* Got valid collation sequence, add it as a new entry. */ | ||
2988 | /* Check the space of the arrays. */ | ||
2989 | if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms)) | ||
3080 | { | 2990 | { |
3081 | if (__glibc_unlikely (name_len != 1)) | 2991 | /* Not enough, realloc it. */ |
3082 | return REG_ECOLLATE; | 2992 | /* +1 in case of mbcset->ncoll_syms is 0. */ |
3083 | else | 2993 | int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; |
3084 | { | 2994 | /* Use realloc since mbcset->coll_syms is NULL |
3085 | bitset_set (sbcset, name[0]); | 2995 | if *alloc == 0. */ |
3086 | return REG_NOERROR; | 2996 | int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, |
3087 | } | 2997 | new_coll_sym_alloc); |
2998 | if (__glibc_unlikely (new_coll_syms == NULL)) | ||
2999 | return REG_ESPACE; | ||
3000 | mbcset->coll_syms = new_coll_syms; | ||
3001 | *coll_sym_alloc = new_coll_sym_alloc; | ||
3088 | } | 3002 | } |
3003 | mbcset->coll_syms[mbcset->ncoll_syms++] = idx; | ||
3004 | return REG_NOERROR; | ||
3089 | } | 3005 | } |
3090 | #endif | 3006 | else |
3007 | { | ||
3008 | if (__glibc_unlikely (name_len != 1)) | ||
3009 | return REG_ECOLLATE; | ||
3010 | else | ||
3011 | { | ||
3012 | bitset_set (sbcset, name[0]); | ||
3013 | return REG_NOERROR; | ||
3014 | } | ||
3015 | } | ||
3016 | } | ||
3017 | #endif /* _LIBC */ | ||
3018 | |||
3019 | /* This function parse bracket expression like "[abc]", "[a-c]", | ||
3020 | "[[.a-a.]]" etc. */ | ||
3021 | |||
3022 | static bin_tree_t * | ||
3023 | parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | ||
3024 | reg_syntax_t syntax, reg_errcode_t *err) | ||
3025 | { | ||
3026 | const unsigned char *collseqmb = NULL; | ||
3027 | const char *collseqwc = NULL; | ||
3028 | uint_fast32_t nrules = 0; | ||
3029 | int_fast32_t table_size = 0; | ||
3030 | const void *symb_table = NULL; | ||
3031 | const unsigned char *extra = NULL; | ||
3091 | 3032 | ||
3092 | re_token_t br_token; | 3033 | re_token_t br_token; |
3093 | re_bitset_ptr_t sbcset; | 3034 | re_bitset_ptr_t sbcset; |
3094 | #ifdef RE_ENABLE_I18N | ||
3095 | re_charset_t *mbcset; | 3035 | re_charset_t *mbcset; |
3096 | Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; | 3036 | Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; |
3097 | Idx equiv_class_alloc = 0, char_class_alloc = 0; | 3037 | Idx equiv_class_alloc = 0, char_class_alloc = 0; |
3098 | #endif /* not RE_ENABLE_I18N */ | ||
3099 | bool non_match = false; | 3038 | bool non_match = false; |
3100 | bin_tree_t *work_tree; | 3039 | bin_tree_t *work_tree; |
3101 | int token_len; | 3040 | int token_len; |
@@ -3111,26 +3050,17 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3111 | */ | 3050 | */ |
3112 | collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); | 3051 | collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); |
3113 | table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); | 3052 | table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); |
3114 | symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, | 3053 | symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB); |
3115 | _NL_COLLATE_SYMB_TABLEMB); | ||
3116 | extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, | 3054 | extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, |
3117 | _NL_COLLATE_SYMB_EXTRAMB); | 3055 | _NL_COLLATE_SYMB_EXTRAMB); |
3118 | } | 3056 | } |
3119 | #endif | 3057 | #endif |
3120 | sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); | 3058 | sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); |
3121 | #ifdef RE_ENABLE_I18N | ||
3122 | mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); | 3059 | mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); |
3123 | #endif /* RE_ENABLE_I18N */ | ||
3124 | #ifdef RE_ENABLE_I18N | ||
3125 | if (__glibc_unlikely (sbcset == NULL || mbcset == NULL)) | 3060 | if (__glibc_unlikely (sbcset == NULL || mbcset == NULL)) |
3126 | #else | ||
3127 | if (__glibc_unlikely (sbcset == NULL)) | ||
3128 | #endif /* RE_ENABLE_I18N */ | ||
3129 | { | 3061 | { |
3130 | re_free (sbcset); | 3062 | re_free (sbcset); |
3131 | #ifdef RE_ENABLE_I18N | ||
3132 | re_free (mbcset); | 3063 | re_free (mbcset); |
3133 | #endif | ||
3134 | *err = REG_ESPACE; | 3064 | *err = REG_ESPACE; |
3135 | return NULL; | 3065 | return NULL; |
3136 | } | 3066 | } |
@@ -3143,9 +3073,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3143 | } | 3073 | } |
3144 | if (token->type == OP_NON_MATCH_LIST) | 3074 | if (token->type == OP_NON_MATCH_LIST) |
3145 | { | 3075 | { |
3146 | #ifdef RE_ENABLE_I18N | ||
3147 | mbcset->non_match = 1; | 3076 | mbcset->non_match = 1; |
3148 | #endif /* not RE_ENABLE_I18N */ | ||
3149 | non_match = true; | 3077 | non_match = true; |
3150 | if (syntax & RE_HAT_LISTS_NOT_NEWLINE) | 3078 | if (syntax & RE_HAT_LISTS_NOT_NEWLINE) |
3151 | bitset_set (sbcset, '\n'); | 3079 | bitset_set (sbcset, '\n'); |
@@ -3228,18 +3156,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3228 | 3156 | ||
3229 | token_len = peek_token_bracket (token, regexp, syntax); | 3157 | token_len = peek_token_bracket (token, regexp, syntax); |
3230 | 3158 | ||
3231 | #ifdef _LIBC | ||
3232 | *err = build_range_exp (sbcset, mbcset, &range_alloc, | 3159 | *err = build_range_exp (sbcset, mbcset, &range_alloc, |
3233 | &start_elem, &end_elem); | 3160 | &start_elem, &end_elem, |
3234 | #else | 3161 | dfa, syntax, nrules, collseqmb, collseqwc, |
3235 | # ifdef RE_ENABLE_I18N | 3162 | table_size, symb_table, extra); |
3236 | *err = build_range_exp (syntax, sbcset, | ||
3237 | dfa->mb_cur_max > 1 ? mbcset : NULL, | ||
3238 | &range_alloc, &start_elem, &end_elem); | ||
3239 | # else | ||
3240 | *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); | ||
3241 | # endif | ||
3242 | #endif /* RE_ENABLE_I18N */ | ||
3243 | if (__glibc_unlikely (*err != REG_NOERROR)) | 3163 | if (__glibc_unlikely (*err != REG_NOERROR)) |
3244 | goto parse_bracket_exp_free_return; | 3164 | goto parse_bracket_exp_free_return; |
3245 | } | 3165 | } |
@@ -3250,7 +3170,6 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3250 | case SB_CHAR: | 3170 | case SB_CHAR: |
3251 | bitset_set (sbcset, start_elem.opr.ch); | 3171 | bitset_set (sbcset, start_elem.opr.ch); |
3252 | break; | 3172 | break; |
3253 | #ifdef RE_ENABLE_I18N | ||
3254 | case MB_CHAR: | 3173 | case MB_CHAR: |
3255 | /* Check whether the array has enough space. */ | 3174 | /* Check whether the array has enough space. */ |
3256 | if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars)) | 3175 | if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars)) |
@@ -3268,30 +3187,24 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3268 | } | 3187 | } |
3269 | mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; | 3188 | mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; |
3270 | break; | 3189 | break; |
3271 | #endif /* RE_ENABLE_I18N */ | ||
3272 | case EQUIV_CLASS: | 3190 | case EQUIV_CLASS: |
3273 | *err = build_equiv_class (sbcset, | 3191 | *err = build_equiv_class (sbcset, |
3274 | #ifdef RE_ENABLE_I18N | ||
3275 | mbcset, &equiv_class_alloc, | 3192 | mbcset, &equiv_class_alloc, |
3276 | #endif /* RE_ENABLE_I18N */ | ||
3277 | start_elem.opr.name); | 3193 | start_elem.opr.name); |
3278 | if (__glibc_unlikely (*err != REG_NOERROR)) | 3194 | if (__glibc_unlikely (*err != REG_NOERROR)) |
3279 | goto parse_bracket_exp_free_return; | 3195 | goto parse_bracket_exp_free_return; |
3280 | break; | 3196 | break; |
3281 | case COLL_SYM: | 3197 | case COLL_SYM: |
3282 | *err = build_collating_symbol (sbcset, | 3198 | *err = build_collating_symbol (sbcset, |
3283 | #ifdef RE_ENABLE_I18N | ||
3284 | mbcset, &coll_sym_alloc, | 3199 | mbcset, &coll_sym_alloc, |
3285 | #endif /* RE_ENABLE_I18N */ | 3200 | start_elem.opr.name, |
3286 | start_elem.opr.name); | 3201 | nrules, table_size, symb_table, extra); |
3287 | if (__glibc_unlikely (*err != REG_NOERROR)) | 3202 | if (__glibc_unlikely (*err != REG_NOERROR)) |
3288 | goto parse_bracket_exp_free_return; | 3203 | goto parse_bracket_exp_free_return; |
3289 | break; | 3204 | break; |
3290 | case CHAR_CLASS: | 3205 | case CHAR_CLASS: |
3291 | *err = build_charclass (regexp->trans, sbcset, | 3206 | *err = build_charclass (regexp->trans, sbcset, |
3292 | #ifdef RE_ENABLE_I18N | ||
3293 | mbcset, &char_class_alloc, | 3207 | mbcset, &char_class_alloc, |
3294 | #endif /* RE_ENABLE_I18N */ | ||
3295 | (const char *) start_elem.opr.name, | 3208 | (const char *) start_elem.opr.name, |
3296 | syntax); | 3209 | syntax); |
3297 | if (__glibc_unlikely (*err != REG_NOERROR)) | 3210 | if (__glibc_unlikely (*err != REG_NOERROR)) |
@@ -3317,7 +3230,6 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3317 | if (non_match) | 3230 | if (non_match) |
3318 | bitset_not (sbcset); | 3231 | bitset_not (sbcset); |
3319 | 3232 | ||
3320 | #ifdef RE_ENABLE_I18N | ||
3321 | /* Ensure only single byte characters are set. */ | 3233 | /* Ensure only single byte characters are set. */ |
3322 | if (dfa->mb_cur_max > 1) | 3234 | if (dfa->mb_cur_max > 1) |
3323 | bitset_mask (sbcset, dfa->sb_char); | 3235 | bitset_mask (sbcset, dfa->sb_char); |
@@ -3361,11 +3273,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3361 | } | 3273 | } |
3362 | } | 3274 | } |
3363 | else | 3275 | else |
3364 | #endif /* not RE_ENABLE_I18N */ | ||
3365 | { | 3276 | { |
3366 | #ifdef RE_ENABLE_I18N | ||
3367 | free_charset (mbcset); | 3277 | free_charset (mbcset); |
3368 | #endif | ||
3369 | /* Build a tree for simple bracket. */ | 3278 | /* Build a tree for simple bracket. */ |
3370 | br_token.type = SIMPLE_BRACKET; | 3279 | br_token.type = SIMPLE_BRACKET; |
3371 | br_token.opr.sbcset = sbcset; | 3280 | br_token.opr.sbcset = sbcset; |
@@ -3379,9 +3288,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3379 | *err = REG_ESPACE; | 3288 | *err = REG_ESPACE; |
3380 | parse_bracket_exp_free_return: | 3289 | parse_bracket_exp_free_return: |
3381 | re_free (sbcset); | 3290 | re_free (sbcset); |
3382 | #ifdef RE_ENABLE_I18N | ||
3383 | free_charset (mbcset); | 3291 | free_charset (mbcset); |
3384 | #endif /* RE_ENABLE_I18N */ | ||
3385 | return NULL; | 3292 | return NULL; |
3386 | } | 3293 | } |
3387 | 3294 | ||
@@ -3392,7 +3299,6 @@ parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, | |||
3392 | re_token_t *token, int token_len, re_dfa_t *dfa, | 3299 | re_token_t *token, int token_len, re_dfa_t *dfa, |
3393 | reg_syntax_t syntax, bool accept_hyphen) | 3300 | reg_syntax_t syntax, bool accept_hyphen) |
3394 | { | 3301 | { |
3395 | #ifdef RE_ENABLE_I18N | ||
3396 | int cur_char_size; | 3302 | int cur_char_size; |
3397 | cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); | 3303 | cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); |
3398 | if (cur_char_size > 1) | 3304 | if (cur_char_size > 1) |
@@ -3402,7 +3308,6 @@ parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, | |||
3402 | re_string_skip_bytes (regexp, cur_char_size); | 3308 | re_string_skip_bytes (regexp, cur_char_size); |
3403 | return REG_NOERROR; | 3309 | return REG_NOERROR; |
3404 | } | 3310 | } |
3405 | #endif /* RE_ENABLE_I18N */ | ||
3406 | re_string_skip_bytes (regexp, token_len); /* Skip a token. */ | 3311 | re_string_skip_bytes (regexp, token_len); /* Skip a token. */ |
3407 | if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS | 3312 | if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS |
3408 | || token->type == OP_OPEN_EQUIV_CLASS) | 3313 | || token->type == OP_OPEN_EQUIV_CLASS) |
@@ -3475,12 +3380,8 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, | |||
3475 | is a pointer argument since we may update it. */ | 3380 | is a pointer argument since we may update it. */ |
3476 | 3381 | ||
3477 | static reg_errcode_t | 3382 | static reg_errcode_t |
3478 | #ifdef RE_ENABLE_I18N | ||
3479 | build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, | 3383 | build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, |
3480 | Idx *equiv_class_alloc, const unsigned char *name) | 3384 | Idx *equiv_class_alloc, const unsigned char *name) |
3481 | #else /* not RE_ENABLE_I18N */ | ||
3482 | build_equiv_class (bitset_t sbcset, const unsigned char *name) | ||
3483 | #endif /* not RE_ENABLE_I18N */ | ||
3484 | { | 3385 | { |
3485 | #ifdef _LIBC | 3386 | #ifdef _LIBC |
3486 | uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); | 3387 | uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); |
@@ -3560,14 +3461,9 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) | |||
3560 | is a pointer argument since we may update it. */ | 3461 | is a pointer argument since we may update it. */ |
3561 | 3462 | ||
3562 | static reg_errcode_t | 3463 | static reg_errcode_t |
3563 | #ifdef RE_ENABLE_I18N | ||
3564 | build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, | 3464 | build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, |
3565 | re_charset_t *mbcset, Idx *char_class_alloc, | 3465 | re_charset_t *mbcset, Idx *char_class_alloc, |
3566 | const char *class_name, reg_syntax_t syntax) | 3466 | const char *class_name, reg_syntax_t syntax) |
3567 | #else /* not RE_ENABLE_I18N */ | ||
3568 | build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, | ||
3569 | const char *class_name, reg_syntax_t syntax) | ||
3570 | #endif /* not RE_ENABLE_I18N */ | ||
3571 | { | 3467 | { |
3572 | int i; | 3468 | int i; |
3573 | const char *name = class_name; | 3469 | const char *name = class_name; |
@@ -3578,7 +3474,6 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, | |||
3578 | && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) | 3474 | && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) |
3579 | name = "alpha"; | 3475 | name = "alpha"; |
3580 | 3476 | ||
3581 | #ifdef RE_ENABLE_I18N | ||
3582 | /* Check the space of the arrays. */ | 3477 | /* Check the space of the arrays. */ |
3583 | if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes)) | 3478 | if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes)) |
3584 | { | 3479 | { |
@@ -3594,7 +3489,6 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, | |||
3594 | *char_class_alloc = new_char_class_alloc; | 3489 | *char_class_alloc = new_char_class_alloc; |
3595 | } | 3490 | } |
3596 | mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); | 3491 | mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); |
3597 | #endif /* RE_ENABLE_I18N */ | ||
3598 | 3492 | ||
3599 | #define BUILD_CHARCLASS_LOOP(ctype_func) \ | 3493 | #define BUILD_CHARCLASS_LOOP(ctype_func) \ |
3600 | do { \ | 3494 | do { \ |
@@ -3649,10 +3543,8 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, | |||
3649 | reg_errcode_t *err) | 3543 | reg_errcode_t *err) |
3650 | { | 3544 | { |
3651 | re_bitset_ptr_t sbcset; | 3545 | re_bitset_ptr_t sbcset; |
3652 | #ifdef RE_ENABLE_I18N | ||
3653 | re_charset_t *mbcset; | 3546 | re_charset_t *mbcset; |
3654 | Idx alloc = 0; | 3547 | Idx alloc = 0; |
3655 | #endif /* not RE_ENABLE_I18N */ | ||
3656 | reg_errcode_t ret; | 3548 | reg_errcode_t ret; |
3657 | bin_tree_t *tree; | 3549 | bin_tree_t *tree; |
3658 | 3550 | ||
@@ -3662,7 +3554,6 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, | |||
3662 | *err = REG_ESPACE; | 3554 | *err = REG_ESPACE; |
3663 | return NULL; | 3555 | return NULL; |
3664 | } | 3556 | } |
3665 | #ifdef RE_ENABLE_I18N | ||
3666 | mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); | 3557 | mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); |
3667 | if (__glibc_unlikely (mbcset == NULL)) | 3558 | if (__glibc_unlikely (mbcset == NULL)) |
3668 | { | 3559 | { |
@@ -3671,21 +3562,14 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, | |||
3671 | return NULL; | 3562 | return NULL; |
3672 | } | 3563 | } |
3673 | mbcset->non_match = non_match; | 3564 | mbcset->non_match = non_match; |
3674 | #endif /* RE_ENABLE_I18N */ | ||
3675 | 3565 | ||
3676 | /* We don't care the syntax in this case. */ | 3566 | /* We don't care the syntax in this case. */ |
3677 | ret = build_charclass (trans, sbcset, | 3567 | ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0); |
3678 | #ifdef RE_ENABLE_I18N | ||
3679 | mbcset, &alloc, | ||
3680 | #endif /* RE_ENABLE_I18N */ | ||
3681 | class_name, 0); | ||
3682 | 3568 | ||
3683 | if (__glibc_unlikely (ret != REG_NOERROR)) | 3569 | if (__glibc_unlikely (ret != REG_NOERROR)) |
3684 | { | 3570 | { |
3685 | re_free (sbcset); | 3571 | re_free (sbcset); |
3686 | #ifdef RE_ENABLE_I18N | ||
3687 | free_charset (mbcset); | 3572 | free_charset (mbcset); |
3688 | #endif /* RE_ENABLE_I18N */ | ||
3689 | *err = ret; | 3573 | *err = ret; |
3690 | return NULL; | 3574 | return NULL; |
3691 | } | 3575 | } |
@@ -3697,11 +3581,9 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, | |||
3697 | if (non_match) | 3581 | if (non_match) |
3698 | bitset_not (sbcset); | 3582 | bitset_not (sbcset); |
3699 | 3583 | ||
3700 | #ifdef RE_ENABLE_I18N | ||
3701 | /* Ensure only single byte characters are set. */ | 3584 | /* Ensure only single byte characters are set. */ |
3702 | if (dfa->mb_cur_max > 1) | 3585 | if (dfa->mb_cur_max > 1) |
3703 | bitset_mask (sbcset, dfa->sb_char); | 3586 | bitset_mask (sbcset, dfa->sb_char); |
3704 | #endif | ||
3705 | 3587 | ||
3706 | /* Build a tree for simple bracket. */ | 3588 | /* Build a tree for simple bracket. */ |
3707 | re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset }; | 3589 | re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset }; |
@@ -3709,7 +3591,6 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, | |||
3709 | if (__glibc_unlikely (tree == NULL)) | 3591 | if (__glibc_unlikely (tree == NULL)) |
3710 | goto build_word_op_espace; | 3592 | goto build_word_op_espace; |
3711 | 3593 | ||
3712 | #ifdef RE_ENABLE_I18N | ||
3713 | if (dfa->mb_cur_max > 1) | 3594 | if (dfa->mb_cur_max > 1) |
3714 | { | 3595 | { |
3715 | bin_tree_t *mbc_tree; | 3596 | bin_tree_t *mbc_tree; |
@@ -3730,15 +3611,10 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, | |||
3730 | free_charset (mbcset); | 3611 | free_charset (mbcset); |
3731 | return tree; | 3612 | return tree; |
3732 | } | 3613 | } |
3733 | #else /* not RE_ENABLE_I18N */ | ||
3734 | return tree; | ||
3735 | #endif /* not RE_ENABLE_I18N */ | ||
3736 | 3614 | ||
3737 | build_word_op_espace: | 3615 | build_word_op_espace: |
3738 | re_free (sbcset); | 3616 | re_free (sbcset); |
3739 | #ifdef RE_ENABLE_I18N | ||
3740 | free_charset (mbcset); | 3617 | free_charset (mbcset); |
3741 | #endif /* RE_ENABLE_I18N */ | ||
3742 | *err = REG_ESPACE; | 3618 | *err = REG_ESPACE; |
3743 | return NULL; | 3619 | return NULL; |
3744 | } | 3620 | } |
@@ -3771,21 +3647,19 @@ fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) | |||
3771 | return num; | 3647 | return num; |
3772 | } | 3648 | } |
3773 | 3649 | ||
3774 | #ifdef RE_ENABLE_I18N | ||
3775 | static void | 3650 | static void |
3776 | free_charset (re_charset_t *cset) | 3651 | free_charset (re_charset_t *cset) |
3777 | { | 3652 | { |
3778 | re_free (cset->mbchars); | 3653 | re_free (cset->mbchars); |
3779 | # ifdef _LIBC | 3654 | #ifdef _LIBC |
3780 | re_free (cset->coll_syms); | 3655 | re_free (cset->coll_syms); |
3781 | re_free (cset->equiv_classes); | 3656 | re_free (cset->equiv_classes); |
3782 | # endif | 3657 | #endif |
3783 | re_free (cset->range_starts); | 3658 | re_free (cset->range_starts); |
3784 | re_free (cset->range_ends); | 3659 | re_free (cset->range_ends); |
3785 | re_free (cset->char_classes); | 3660 | re_free (cset->char_classes); |
3786 | re_free (cset); | 3661 | re_free (cset); |
3787 | } | 3662 | } |
3788 | #endif /* RE_ENABLE_I18N */ | ||
3789 | 3663 | ||
3790 | /* Functions for binary tree operation. */ | 3664 | /* Functions for binary tree operation. */ |
3791 | 3665 | ||
@@ -3851,13 +3725,10 @@ mark_opt_subexp (void *extra, bin_tree_t *node) | |||
3851 | static void | 3725 | static void |
3852 | free_token (re_token_t *node) | 3726 | free_token (re_token_t *node) |
3853 | { | 3727 | { |
3854 | #ifdef RE_ENABLE_I18N | ||
3855 | if (node->type == COMPLEX_BRACKET && node->duplicated == 0) | 3728 | if (node->type == COMPLEX_BRACKET && node->duplicated == 0) |
3856 | free_charset (node->opr.mbcset); | 3729 | free_charset (node->opr.mbcset); |
3857 | else | 3730 | else if (node->type == SIMPLE_BRACKET && node->duplicated == 0) |
3858 | #endif /* RE_ENABLE_I18N */ | 3731 | re_free (node->opr.sbcset); |
3859 | if (node->type == SIMPLE_BRACKET && node->duplicated == 0) | ||
3860 | re_free (node->opr.sbcset); | ||
3861 | } | 3732 | } |
3862 | 3733 | ||
3863 | /* Worker function for tree walking. Free the allocated memory inside NODE | 3734 | /* Worker function for tree walking. Free the allocated memory inside NODE |