diff options
Diffstat (limited to 'gl/regcomp.c')
-rw-r--r-- | gl/regcomp.c | 359 |
1 files changed, 202 insertions, 157 deletions
diff --git a/gl/regcomp.c b/gl/regcomp.c index 86ca02b..f0b2e52 100644 --- a/gl/regcomp.c +++ b/gl/regcomp.c | |||
@@ -1,22 +1,21 @@ | |||
1 | /* Extended regular expression matching and search library. | 1 | /* Extended regular expression matching and search library. |
2 | Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free | 2 | Copyright (C) 2002-2013 Free Software Foundation, Inc. |
3 | Software Foundation, Inc. | ||
4 | This file is part of the GNU C Library. | 3 | This file is part of the GNU C Library. |
5 | Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. | 4 | Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. |
6 | 5 | ||
7 | This program is free software; you can redistribute it and/or modify | 6 | The GNU C Library is free software; you can redistribute it and/or |
8 | it under the terms of the GNU General Public License as published by | 7 | modify it under the terms of the GNU General Public |
9 | the Free Software Foundation; either version 3, or (at your option) | 8 | License as published by the Free Software Foundation; either |
10 | any later version. | 9 | version 3 of the License, or (at your option) any later version. |
11 | 10 | ||
12 | This program is distributed in the hope that it will be useful, | 11 | The GNU C Library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | GNU General Public License for more details. | 14 | General Public License for more details. |
16 | 15 | ||
17 | You should have received a copy of the GNU General Public License along | 16 | You should have received a copy of the GNU General Public |
18 | with this program; if not, write to the Free Software Foundation, | 17 | License along with the GNU C Library; if not, see |
19 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | 18 | <http://www.gnu.org/licenses/>. */ |
20 | 19 | ||
21 | static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, | 20 | static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, |
22 | size_t length, reg_syntax_t syntax); | 21 | size_t length, reg_syntax_t syntax); |
@@ -95,20 +94,20 @@ static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, | |||
95 | bitset_t sbcset, | 94 | bitset_t sbcset, |
96 | re_charset_t *mbcset, | 95 | re_charset_t *mbcset, |
97 | Idx *char_class_alloc, | 96 | Idx *char_class_alloc, |
98 | const unsigned char *class_name, | 97 | const char *class_name, |
99 | reg_syntax_t syntax); | 98 | reg_syntax_t syntax); |
100 | #else /* not RE_ENABLE_I18N */ | 99 | #else /* not RE_ENABLE_I18N */ |
101 | static reg_errcode_t build_equiv_class (bitset_t sbcset, | 100 | static reg_errcode_t build_equiv_class (bitset_t sbcset, |
102 | const unsigned char *name); | 101 | const unsigned char *name); |
103 | static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, | 102 | static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, |
104 | bitset_t sbcset, | 103 | bitset_t sbcset, |
105 | const unsigned char *class_name, | 104 | const char *class_name, |
106 | reg_syntax_t syntax); | 105 | reg_syntax_t syntax); |
107 | #endif /* not RE_ENABLE_I18N */ | 106 | #endif /* not RE_ENABLE_I18N */ |
108 | static bin_tree_t *build_charclass_op (re_dfa_t *dfa, | 107 | static bin_tree_t *build_charclass_op (re_dfa_t *dfa, |
109 | RE_TRANSLATE_TYPE trans, | 108 | RE_TRANSLATE_TYPE trans, |
110 | const unsigned char *class_name, | 109 | const char *class_name, |
111 | const unsigned char *extra, | 110 | const char *extra, |
112 | bool non_match, reg_errcode_t *err); | 111 | bool non_match, reg_errcode_t *err); |
113 | static bin_tree_t *create_tree (re_dfa_t *dfa, | 112 | static bin_tree_t *create_tree (re_dfa_t *dfa, |
114 | bin_tree_t *left, bin_tree_t *right, | 113 | bin_tree_t *left, bin_tree_t *right, |
@@ -207,7 +206,7 @@ static const size_t __re_error_msgid_idx[] = | |||
207 | compiles PATTERN (of length LENGTH) and puts the result in BUFP. | 206 | compiles PATTERN (of length LENGTH) and puts the result in BUFP. |
208 | Returns 0 if the pattern was valid, otherwise an error string. | 207 | Returns 0 if the pattern was valid, otherwise an error string. |
209 | 208 | ||
210 | Assumes the `allocated' (and perhaps `buffer') and `translate' fields | 209 | Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields |
211 | are set in BUFP on entry. */ | 210 | are set in BUFP on entry. */ |
212 | 211 | ||
213 | #ifdef _LIBC | 212 | #ifdef _LIBC |
@@ -242,7 +241,7 @@ re_compile_pattern (const char *pattern, size_t length, | |||
242 | weak_alias (__re_compile_pattern, re_compile_pattern) | 241 | weak_alias (__re_compile_pattern, re_compile_pattern) |
243 | #endif | 242 | #endif |
244 | 243 | ||
245 | /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can | 244 | /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can |
246 | also be assigned to arbitrarily: each pattern buffer stores its own | 245 | also be assigned to arbitrarily: each pattern buffer stores its own |
247 | syntax, so it can be changed between regex compilations. */ | 246 | syntax, so it can be changed between regex compilations. */ |
248 | /* This has no initializer because initialized variables in Emacs | 247 | /* This has no initializer because initialized variables in Emacs |
@@ -274,7 +273,7 @@ int | |||
274 | re_compile_fastmap (bufp) | 273 | re_compile_fastmap (bufp) |
275 | struct re_pattern_buffer *bufp; | 274 | struct re_pattern_buffer *bufp; |
276 | { | 275 | { |
277 | re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; | 276 | re_dfa_t *dfa = bufp->buffer; |
278 | char *fastmap = bufp->fastmap; | 277 | char *fastmap = bufp->fastmap; |
279 | 278 | ||
280 | memset (fastmap, '\0', sizeof (char) * SBC_MAX); | 279 | memset (fastmap, '\0', sizeof (char) * SBC_MAX); |
@@ -293,7 +292,7 @@ weak_alias (__re_compile_fastmap, re_compile_fastmap) | |||
293 | #endif | 292 | #endif |
294 | 293 | ||
295 | static inline void | 294 | static inline void |
296 | __attribute ((always_inline)) | 295 | __attribute__ ((always_inline)) |
297 | re_set_fastmap (char *fastmap, bool icase, int ch) | 296 | re_set_fastmap (char *fastmap, bool icase, int ch) |
298 | { | 297 | { |
299 | fastmap[ch] = 1; | 298 | fastmap[ch] = 1; |
@@ -308,7 +307,7 @@ static void | |||
308 | re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, | 307 | re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, |
309 | char *fastmap) | 308 | char *fastmap) |
310 | { | 309 | { |
311 | re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; | 310 | re_dfa_t *dfa = bufp->buffer; |
312 | Idx node_cnt; | 311 | Idx node_cnt; |
313 | bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); | 312 | bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); |
314 | for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) | 313 | for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) |
@@ -440,15 +439,15 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, | |||
440 | PREG is a regex_t *. We do not expect any fields to be initialized, | 439 | PREG is a regex_t *. We do not expect any fields to be initialized, |
441 | since POSIX says we shouldn't. Thus, we set | 440 | since POSIX says we shouldn't. Thus, we set |
442 | 441 | ||
443 | `buffer' to the compiled pattern; | 442 | 'buffer' to the compiled pattern; |
444 | `used' to the length of the compiled pattern; | 443 | 'used' to the length of the compiled pattern; |
445 | `syntax' to RE_SYNTAX_POSIX_EXTENDED if the | 444 | 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the |
446 | REG_EXTENDED bit in CFLAGS is set; otherwise, to | 445 | REG_EXTENDED bit in CFLAGS is set; otherwise, to |
447 | RE_SYNTAX_POSIX_BASIC; | 446 | RE_SYNTAX_POSIX_BASIC; |
448 | `newline_anchor' to REG_NEWLINE being set in CFLAGS; | 447 | 'newline_anchor' to REG_NEWLINE being set in CFLAGS; |
449 | `fastmap' to an allocated space for the fastmap; | 448 | 'fastmap' to an allocated space for the fastmap; |
450 | `fastmap_accurate' to zero; | 449 | 'fastmap_accurate' to zero; |
451 | `re_nsub' to the number of subexpressions in PATTERN. | 450 | 're_nsub' to the number of subexpressions in PATTERN. |
452 | 451 | ||
453 | PATTERN is the address of the pattern string. | 452 | PATTERN is the address of the pattern string. |
454 | 453 | ||
@@ -587,19 +586,23 @@ weak_alias (__regerror, regerror) | |||
587 | static const bitset_t utf8_sb_map = | 586 | static const bitset_t utf8_sb_map = |
588 | { | 587 | { |
589 | /* Set the first 128 bits. */ | 588 | /* Set the first 128 bits. */ |
590 | # if 4 * BITSET_WORD_BITS < ASCII_CHARS | 589 | # if defined __GNUC__ && !defined __STRICT_ANSI__ |
591 | # error "bitset_word_t is narrower than 32 bits" | 590 | [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX |
592 | # elif 3 * BITSET_WORD_BITS < ASCII_CHARS | 591 | # else |
592 | # if 4 * BITSET_WORD_BITS < ASCII_CHARS | ||
593 | # error "bitset_word_t is narrower than 32 bits" | ||
594 | # elif 3 * BITSET_WORD_BITS < ASCII_CHARS | ||
593 | BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, | 595 | BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, |
594 | # elif 2 * BITSET_WORD_BITS < ASCII_CHARS | 596 | # elif 2 * BITSET_WORD_BITS < ASCII_CHARS |
595 | BITSET_WORD_MAX, BITSET_WORD_MAX, | 597 | BITSET_WORD_MAX, BITSET_WORD_MAX, |
596 | # elif 1 * BITSET_WORD_BITS < ASCII_CHARS | 598 | # elif 1 * BITSET_WORD_BITS < ASCII_CHARS |
597 | BITSET_WORD_MAX, | 599 | BITSET_WORD_MAX, |
598 | # endif | 600 | # endif |
599 | (BITSET_WORD_MAX | 601 | (BITSET_WORD_MAX |
600 | >> (SBC_MAX % BITSET_WORD_BITS == 0 | 602 | >> (SBC_MAX % BITSET_WORD_BITS == 0 |
601 | ? 0 | 603 | ? 0 |
602 | : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) | 604 | : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) |
605 | # endif | ||
603 | }; | 606 | }; |
604 | #endif | 607 | #endif |
605 | 608 | ||
@@ -658,9 +661,12 @@ void | |||
658 | regfree (preg) | 661 | regfree (preg) |
659 | regex_t *preg; | 662 | regex_t *preg; |
660 | { | 663 | { |
661 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 664 | re_dfa_t *dfa = preg->buffer; |
662 | if (BE (dfa != NULL, 1)) | 665 | if (BE (dfa != NULL, 1)) |
663 | free_dfa_content (dfa); | 666 | { |
667 | lock_fini (dfa->lock); | ||
668 | free_dfa_content (dfa); | ||
669 | } | ||
664 | preg->buffer = NULL; | 670 | preg->buffer = NULL; |
665 | preg->allocated = 0; | 671 | preg->allocated = 0; |
666 | 672 | ||
@@ -719,7 +725,7 @@ re_comp (s) | |||
719 | + __re_error_msgid_idx[(int) REG_ESPACE]); | 725 | + __re_error_msgid_idx[(int) REG_ESPACE]); |
720 | } | 726 | } |
721 | 727 | ||
722 | /* Since `re_exec' always passes NULL for the `regs' argument, we | 728 | /* Since 're_exec' always passes NULL for the 'regs' argument, we |
723 | don't need to initialize the pattern buffer fields which affect it. */ | 729 | don't need to initialize the pattern buffer fields which affect it. */ |
724 | 730 | ||
725 | /* Match anchors at newlines. */ | 731 | /* Match anchors at newlines. */ |
@@ -730,7 +736,7 @@ re_comp (s) | |||
730 | if (!ret) | 736 | if (!ret) |
731 | return NULL; | 737 | return NULL; |
732 | 738 | ||
733 | /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ | 739 | /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ |
734 | return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); | 740 | return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); |
735 | } | 741 | } |
736 | 742 | ||
@@ -765,7 +771,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, | |||
765 | preg->regs_allocated = REGS_UNALLOCATED; | 771 | preg->regs_allocated = REGS_UNALLOCATED; |
766 | 772 | ||
767 | /* Initialize the dfa. */ | 773 | /* Initialize the dfa. */ |
768 | dfa = (re_dfa_t *) preg->buffer; | 774 | dfa = preg->buffer; |
769 | if (BE (preg->allocated < sizeof (re_dfa_t), 0)) | 775 | if (BE (preg->allocated < sizeof (re_dfa_t), 0)) |
770 | { | 776 | { |
771 | /* If zero allocated, but buffer is non-null, try to realloc | 777 | /* If zero allocated, but buffer is non-null, try to realloc |
@@ -776,11 +782,13 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, | |||
776 | if (dfa == NULL) | 782 | if (dfa == NULL) |
777 | return REG_ESPACE; | 783 | return REG_ESPACE; |
778 | preg->allocated = sizeof (re_dfa_t); | 784 | preg->allocated = sizeof (re_dfa_t); |
779 | preg->buffer = (unsigned char *) dfa; | 785 | preg->buffer = dfa; |
780 | } | 786 | } |
781 | preg->used = sizeof (re_dfa_t); | 787 | preg->used = sizeof (re_dfa_t); |
782 | 788 | ||
783 | err = init_dfa (dfa, length); | 789 | err = init_dfa (dfa, length); |
790 | if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0)) | ||
791 | err = REG_ESPACE; | ||
784 | if (BE (err != REG_NOERROR, 0)) | 792 | if (BE (err != REG_NOERROR, 0)) |
785 | { | 793 | { |
786 | free_dfa_content (dfa); | 794 | free_dfa_content (dfa); |
@@ -794,8 +802,6 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, | |||
794 | strncpy (dfa->re_str, pattern, length + 1); | 802 | strncpy (dfa->re_str, pattern, length + 1); |
795 | #endif | 803 | #endif |
796 | 804 | ||
797 | __libc_lock_init (dfa->lock); | ||
798 | |||
799 | err = re_string_construct (®exp, pattern, length, preg->translate, | 805 | err = re_string_construct (®exp, pattern, length, preg->translate, |
800 | (syntax & RE_ICASE) != 0, dfa); | 806 | (syntax & RE_ICASE) != 0, dfa); |
801 | if (BE (err != REG_NOERROR, 0)) | 807 | if (BE (err != REG_NOERROR, 0)) |
@@ -803,6 +809,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, | |||
803 | re_compile_internal_free_return: | 809 | re_compile_internal_free_return: |
804 | free_workarea_compile (preg); | 810 | free_workarea_compile (preg); |
805 | re_string_destruct (®exp); | 811 | re_string_destruct (®exp); |
812 | lock_fini (dfa->lock); | ||
806 | free_dfa_content (dfa); | 813 | free_dfa_content (dfa); |
807 | preg->buffer = NULL; | 814 | preg->buffer = NULL; |
808 | preg->allocated = 0; | 815 | preg->allocated = 0; |
@@ -835,6 +842,7 @@ re_compile_internal (regex_t *preg, const char * pattern, size_t length, | |||
835 | 842 | ||
836 | if (BE (err != REG_NOERROR, 0)) | 843 | if (BE (err != REG_NOERROR, 0)) |
837 | { | 844 | { |
845 | lock_fini (dfa->lock); | ||
838 | free_dfa_content (dfa); | 846 | free_dfa_content (dfa); |
839 | preg->buffer = NULL; | 847 | preg->buffer = NULL; |
840 | preg->allocated = 0; | 848 | preg->allocated = 0; |
@@ -851,7 +859,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) | |||
851 | { | 859 | { |
852 | __re_size_t table_size; | 860 | __re_size_t table_size; |
853 | #ifndef _LIBC | 861 | #ifndef _LIBC |
854 | char *codeset_name; | 862 | const char *codeset_name; |
855 | #endif | 863 | #endif |
856 | #ifdef RE_ENABLE_I18N | 864 | #ifdef RE_ENABLE_I18N |
857 | size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); | 865 | size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); |
@@ -874,7 +882,7 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) | |||
874 | calculation below, and for similar doubling calculations | 882 | calculation below, and for similar doubling calculations |
875 | elsewhere. And it's <= rather than <, because some of the | 883 | elsewhere. And it's <= rather than <, because some of the |
876 | doubling calculations add 1 afterwards. */ | 884 | doubling calculations add 1 afterwards. */ |
877 | if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) | 885 | if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) |
878 | return REG_ESPACE; | 886 | return REG_ESPACE; |
879 | 887 | ||
880 | dfa->nodes_alloc = pat_len + 1; | 888 | dfa->nodes_alloc = pat_len + 1; |
@@ -897,8 +905,10 @@ init_dfa (re_dfa_t *dfa, size_t pat_len) | |||
897 | != 0); | 905 | != 0); |
898 | #else | 906 | #else |
899 | codeset_name = nl_langinfo (CODESET); | 907 | codeset_name = nl_langinfo (CODESET); |
900 | if (strcasecmp (codeset_name, "UTF-8") == 0 | 908 | if ((codeset_name[0] == 'U' || codeset_name[0] == 'u') |
901 | || strcasecmp (codeset_name, "UTF8") == 0) | 909 | && (codeset_name[1] == 'T' || codeset_name[1] == 't') |
910 | && (codeset_name[2] == 'F' || codeset_name[2] == 'f') | ||
911 | && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0) | ||
902 | dfa->is_utf8 = 1; | 912 | dfa->is_utf8 = 1; |
903 | 913 | ||
904 | /* We check exhaustively in the loop below if this charset is a | 914 | /* We check exhaustively in the loop below if this charset is a |
@@ -948,9 +958,43 @@ static void | |||
948 | internal_function | 958 | internal_function |
949 | init_word_char (re_dfa_t *dfa) | 959 | init_word_char (re_dfa_t *dfa) |
950 | { | 960 | { |
951 | int i, j, ch; | 961 | int i = 0; |
962 | int j; | ||
963 | int ch = 0; | ||
952 | dfa->word_ops_used = 1; | 964 | dfa->word_ops_used = 1; |
953 | for (i = 0, ch = 0; i < BITSET_WORDS; ++i) | 965 | if (BE (dfa->map_notascii == 0, 1)) |
966 | { | ||
967 | bitset_word_t bits0 = 0x00000000; | ||
968 | bitset_word_t bits1 = 0x03ff0000; | ||
969 | bitset_word_t bits2 = 0x87fffffe; | ||
970 | bitset_word_t bits3 = 0x07fffffe; | ||
971 | if (BITSET_WORD_BITS == 64) | ||
972 | { | ||
973 | dfa->word_char[0] = bits1 << 31 << 1 | bits0; | ||
974 | dfa->word_char[1] = bits3 << 31 << 1 | bits2; | ||
975 | i = 2; | ||
976 | } | ||
977 | else if (BITSET_WORD_BITS == 32) | ||
978 | { | ||
979 | dfa->word_char[0] = bits0; | ||
980 | dfa->word_char[1] = bits1; | ||
981 | dfa->word_char[2] = bits2; | ||
982 | dfa->word_char[3] = bits3; | ||
983 | i = 4; | ||
984 | } | ||
985 | else | ||
986 | goto general_case; | ||
987 | ch = 128; | ||
988 | |||
989 | if (BE (dfa->is_utf8, 1)) | ||
990 | { | ||
991 | memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); | ||
992 | return; | ||
993 | } | ||
994 | } | ||
995 | |||
996 | general_case: | ||
997 | for (; i < BITSET_WORDS; ++i) | ||
954 | for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) | 998 | for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) |
955 | if (isalnum (ch) || ch == '_') | 999 | if (isalnum (ch) || ch == '_') |
956 | dfa->word_char[i] |= (bitset_word_t) 1 << j; | 1000 | dfa->word_char[i] |= (bitset_word_t) 1 << j; |
@@ -961,7 +1005,7 @@ init_word_char (re_dfa_t *dfa) | |||
961 | static void | 1005 | static void |
962 | free_workarea_compile (regex_t *preg) | 1006 | free_workarea_compile (regex_t *preg) |
963 | { | 1007 | { |
964 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 1008 | re_dfa_t *dfa = preg->buffer; |
965 | bin_tree_storage_t *storage, *next; | 1009 | bin_tree_storage_t *storage, *next; |
966 | for (storage = dfa->str_tree_storage; storage; storage = next) | 1010 | for (storage = dfa->str_tree_storage; storage; storage = next) |
967 | { | 1011 | { |
@@ -1145,7 +1189,7 @@ optimize_utf8 (re_dfa_t *dfa) | |||
1145 | static reg_errcode_t | 1189 | static reg_errcode_t |
1146 | analyze (regex_t *preg) | 1190 | analyze (regex_t *preg) |
1147 | { | 1191 | { |
1148 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 1192 | re_dfa_t *dfa = preg->buffer; |
1149 | reg_errcode_t ret; | 1193 | reg_errcode_t ret; |
1150 | 1194 | ||
1151 | /* Allocate arrays. */ | 1195 | /* Allocate arrays. */ |
@@ -1326,7 +1370,7 @@ lower_subexps (void *extra, bin_tree_t *node) | |||
1326 | static bin_tree_t * | 1370 | static bin_tree_t * |
1327 | lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) | 1371 | lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) |
1328 | { | 1372 | { |
1329 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 1373 | re_dfa_t *dfa = preg->buffer; |
1330 | bin_tree_t *body = node->left; | 1374 | bin_tree_t *body = node->left; |
1331 | bin_tree_t *op, *cls, *tree1, *tree; | 1375 | bin_tree_t *op, *cls, *tree1, *tree; |
1332 | 1376 | ||
@@ -1660,7 +1704,7 @@ calc_eclosure (re_dfa_t *dfa) | |||
1660 | /* If we have already calculated, skip it. */ | 1704 | /* If we have already calculated, skip it. */ |
1661 | if (dfa->eclosures[node_idx].nelem != 0) | 1705 | if (dfa->eclosures[node_idx].nelem != 0) |
1662 | continue; | 1706 | continue; |
1663 | /* Calculate epsilon closure of `node_idx'. */ | 1707 | /* Calculate epsilon closure of 'node_idx'. */ |
1664 | err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); | 1708 | err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); |
1665 | if (BE (err != REG_NOERROR, 0)) | 1709 | if (BE (err != REG_NOERROR, 0)) |
1666 | return err; | 1710 | return err; |
@@ -1710,14 +1754,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) | |||
1710 | { | 1754 | { |
1711 | re_node_set eclosure_elem; | 1755 | re_node_set eclosure_elem; |
1712 | Idx edest = dfa->edests[node].elems[i]; | 1756 | Idx edest = dfa->edests[node].elems[i]; |
1713 | /* If calculating the epsilon closure of `edest' is in progress, | 1757 | /* If calculating the epsilon closure of 'edest' is in progress, |
1714 | return intermediate result. */ | 1758 | return intermediate result. */ |
1715 | if (dfa->eclosures[edest].nelem == REG_MISSING) | 1759 | if (dfa->eclosures[edest].nelem == REG_MISSING) |
1716 | { | 1760 | { |
1717 | incomplete = true; | 1761 | incomplete = true; |
1718 | continue; | 1762 | continue; |
1719 | } | 1763 | } |
1720 | /* If we haven't calculated the epsilon closure of `edest' yet, | 1764 | /* If we haven't calculated the epsilon closure of 'edest' yet, |
1721 | calculate now. Otherwise use calculated epsilon closure. */ | 1765 | calculate now. Otherwise use calculated epsilon closure. */ |
1722 | if (dfa->eclosures[edest].nelem == 0) | 1766 | if (dfa->eclosures[edest].nelem == 0) |
1723 | { | 1767 | { |
@@ -1727,11 +1771,11 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) | |||
1727 | } | 1771 | } |
1728 | else | 1772 | else |
1729 | eclosure_elem = dfa->eclosures[edest]; | 1773 | eclosure_elem = dfa->eclosures[edest]; |
1730 | /* Merge the epsilon closure of `edest'. */ | 1774 | /* Merge the epsilon closure of 'edest'. */ |
1731 | err = re_node_set_merge (&eclosure, &eclosure_elem); | 1775 | err = re_node_set_merge (&eclosure, &eclosure_elem); |
1732 | if (BE (err != REG_NOERROR, 0)) | 1776 | if (BE (err != REG_NOERROR, 0)) |
1733 | return err; | 1777 | return err; |
1734 | /* If the epsilon closure of `edest' is incomplete, | 1778 | /* If the epsilon closure of 'edest' is incomplete, |
1735 | the epsilon closure of this node is also incomplete. */ | 1779 | the epsilon closure of this node is also incomplete. */ |
1736 | if (dfa->eclosures[edest].nelem == 0) | 1780 | if (dfa->eclosures[edest].nelem == 0) |
1737 | { | 1781 | { |
@@ -2093,7 +2137,7 @@ peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) | |||
2093 | 2137 | ||
2094 | /* Entry point of the parser. | 2138 | /* Entry point of the parser. |
2095 | Parse the regular expression REGEXP and return the structure tree. | 2139 | Parse the regular expression REGEXP and return the structure tree. |
2096 | If an error is occured, ERR is set by error code, and return NULL. | 2140 | If an error occurs, ERR is set by error code, and return NULL. |
2097 | This function build the following tree, from regular expression <reg_exp>: | 2141 | This function build the following tree, from regular expression <reg_exp>: |
2098 | CAT | 2142 | CAT |
2099 | / \ | 2143 | / \ |
@@ -2107,7 +2151,7 @@ static bin_tree_t * | |||
2107 | parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, | 2151 | parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, |
2108 | reg_errcode_t *err) | 2152 | reg_errcode_t *err) |
2109 | { | 2153 | { |
2110 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 2154 | re_dfa_t *dfa = preg->buffer; |
2111 | bin_tree_t *tree, *eor, *root; | 2155 | bin_tree_t *tree, *eor, *root; |
2112 | re_token_t current_token; | 2156 | re_token_t current_token; |
2113 | dfa->syntax = syntax; | 2157 | dfa->syntax = syntax; |
@@ -2135,13 +2179,13 @@ parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, | |||
2135 | / \ | 2179 | / \ |
2136 | <branch1> <branch2> | 2180 | <branch1> <branch2> |
2137 | 2181 | ||
2138 | ALT means alternative, which represents the operator `|'. */ | 2182 | ALT means alternative, which represents the operator '|'. */ |
2139 | 2183 | ||
2140 | static bin_tree_t * | 2184 | static bin_tree_t * |
2141 | parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, | 2185 | parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, |
2142 | reg_syntax_t syntax, Idx nest, reg_errcode_t *err) | 2186 | reg_syntax_t syntax, Idx nest, reg_errcode_t *err) |
2143 | { | 2187 | { |
2144 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 2188 | re_dfa_t *dfa = preg->buffer; |
2145 | bin_tree_t *tree, *branch = NULL; | 2189 | bin_tree_t *tree, *branch = NULL; |
2146 | tree = parse_branch (regexp, preg, token, syntax, nest, err); | 2190 | tree = parse_branch (regexp, preg, token, syntax, nest, err); |
2147 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) | 2191 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
@@ -2183,7 +2227,7 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, | |||
2183 | reg_syntax_t syntax, Idx nest, reg_errcode_t *err) | 2227 | reg_syntax_t syntax, Idx nest, reg_errcode_t *err) |
2184 | { | 2228 | { |
2185 | bin_tree_t *tree, *expr; | 2229 | bin_tree_t *tree, *expr; |
2186 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 2230 | re_dfa_t *dfa = preg->buffer; |
2187 | tree = parse_expression (regexp, preg, token, syntax, nest, err); | 2231 | tree = parse_expression (regexp, preg, token, syntax, nest, err); |
2188 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) | 2232 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
2189 | return NULL; | 2233 | return NULL; |
@@ -2194,16 +2238,21 @@ parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, | |||
2194 | expr = parse_expression (regexp, preg, token, syntax, nest, err); | 2238 | expr = parse_expression (regexp, preg, token, syntax, nest, err); |
2195 | if (BE (*err != REG_NOERROR && expr == NULL, 0)) | 2239 | if (BE (*err != REG_NOERROR && expr == NULL, 0)) |
2196 | { | 2240 | { |
2241 | if (tree != NULL) | ||
2242 | postorder (tree, free_tree, NULL); | ||
2197 | return NULL; | 2243 | return NULL; |
2198 | } | 2244 | } |
2199 | if (tree != NULL && expr != NULL) | 2245 | if (tree != NULL && expr != NULL) |
2200 | { | 2246 | { |
2201 | tree = create_tree (dfa, tree, expr, CONCAT); | 2247 | bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); |
2202 | if (tree == NULL) | 2248 | if (newtree == NULL) |
2203 | { | 2249 | { |
2250 | postorder (expr, free_tree, NULL); | ||
2251 | postorder (tree, free_tree, NULL); | ||
2204 | *err = REG_ESPACE; | 2252 | *err = REG_ESPACE; |
2205 | return NULL; | 2253 | return NULL; |
2206 | } | 2254 | } |
2255 | tree = newtree; | ||
2207 | } | 2256 | } |
2208 | else if (tree == NULL) | 2257 | else if (tree == NULL) |
2209 | tree = expr; | 2258 | tree = expr; |
@@ -2222,7 +2271,7 @@ static bin_tree_t * | |||
2222 | parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, | 2271 | parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, |
2223 | reg_syntax_t syntax, Idx nest, reg_errcode_t *err) | 2272 | reg_syntax_t syntax, Idx nest, reg_errcode_t *err) |
2224 | { | 2273 | { |
2225 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 2274 | re_dfa_t *dfa = preg->buffer; |
2226 | bin_tree_t *tree; | 2275 | bin_tree_t *tree; |
2227 | switch (token->type) | 2276 | switch (token->type) |
2228 | { | 2277 | { |
@@ -2378,8 +2427,8 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, | |||
2378 | case OP_WORD: | 2427 | case OP_WORD: |
2379 | case OP_NOTWORD: | 2428 | case OP_NOTWORD: |
2380 | tree = build_charclass_op (dfa, regexp->trans, | 2429 | tree = build_charclass_op (dfa, regexp->trans, |
2381 | (const unsigned char *) "alnum", | 2430 | "alnum", |
2382 | (const unsigned char *) "_", | 2431 | "_", |
2383 | token->type == OP_NOTWORD, err); | 2432 | token->type == OP_NOTWORD, err); |
2384 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) | 2433 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
2385 | return NULL; | 2434 | return NULL; |
@@ -2387,8 +2436,8 @@ parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, | |||
2387 | case OP_SPACE: | 2436 | case OP_SPACE: |
2388 | case OP_NOTSPACE: | 2437 | case OP_NOTSPACE: |
2389 | tree = build_charclass_op (dfa, regexp->trans, | 2438 | tree = build_charclass_op (dfa, regexp->trans, |
2390 | (const unsigned char *) "space", | 2439 | "space", |
2391 | (const unsigned char *) "", | 2440 | "", |
2392 | token->type == OP_NOTSPACE, err); | 2441 | token->type == OP_NOTSPACE, err); |
2393 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) | 2442 | if (BE (*err != REG_NOERROR && tree == NULL, 0)) |
2394 | return NULL; | 2443 | return NULL; |
@@ -2438,7 +2487,7 @@ static bin_tree_t * | |||
2438 | parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, | 2487 | parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, |
2439 | reg_syntax_t syntax, Idx nest, reg_errcode_t *err) | 2488 | reg_syntax_t syntax, Idx nest, reg_errcode_t *err) |
2440 | { | 2489 | { |
2441 | re_dfa_t *dfa = (re_dfa_t *) preg->buffer; | 2490 | re_dfa_t *dfa = preg->buffer; |
2442 | bin_tree_t *tree; | 2491 | bin_tree_t *tree; |
2443 | size_t cur_nsub; | 2492 | size_t cur_nsub; |
2444 | cur_nsub = preg->re_nsub++; | 2493 | cur_nsub = preg->re_nsub++; |
@@ -2452,7 +2501,11 @@ parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, | |||
2452 | { | 2501 | { |
2453 | tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); | 2502 | tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); |
2454 | if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) | 2503 | if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) |
2455 | *err = REG_EPAREN; | 2504 | { |
2505 | if (tree != NULL) | ||
2506 | postorder (tree, free_tree, NULL); | ||
2507 | *err = REG_EPAREN; | ||
2508 | } | ||
2456 | if (BE (*err != REG_NOERROR, 0)) | 2509 | if (BE (*err != REG_NOERROR, 0)) |
2457 | return NULL; | 2510 | return NULL; |
2458 | } | 2511 | } |
@@ -2530,6 +2583,12 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, | |||
2530 | *err = REG_BADBR; | 2583 | *err = REG_BADBR; |
2531 | return NULL; | 2584 | return NULL; |
2532 | } | 2585 | } |
2586 | |||
2587 | if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0)) | ||
2588 | { | ||
2589 | *err = REG_ESIZE; | ||
2590 | return NULL; | ||
2591 | } | ||
2533 | } | 2592 | } |
2534 | else | 2593 | else |
2535 | { | 2594 | { |
@@ -2570,7 +2629,10 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, | |||
2570 | old_tree = NULL; | 2629 | old_tree = NULL; |
2571 | 2630 | ||
2572 | if (elem->token.type == SUBEXP) | 2631 | if (elem->token.type == SUBEXP) |
2573 | postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); | 2632 | { |
2633 | uintptr_t subidx = elem->token.opr.idx; | ||
2634 | postorder (elem, mark_opt_subexp, (void *) subidx); | ||
2635 | } | ||
2574 | 2636 | ||
2575 | tree = create_tree (dfa, elem, NULL, | 2637 | tree = create_tree (dfa, elem, NULL, |
2576 | (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); | 2638 | (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); |
@@ -2616,7 +2678,7 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, | |||
2616 | Build the range expression which starts from START_ELEM, and ends | 2678 | Build the range expression which starts from START_ELEM, and ends |
2617 | at END_ELEM. The result are written to MBCSET and SBCSET. | 2679 | at END_ELEM. The result are written to MBCSET and SBCSET. |
2618 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and | 2680 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and |
2619 | mbcset->range_ends, is a pointer argument sinse we may | 2681 | mbcset->range_ends, is a pointer argument since we may |
2620 | update it. */ | 2682 | update it. */ |
2621 | 2683 | ||
2622 | static reg_errcode_t | 2684 | static reg_errcode_t |
@@ -2655,7 +2717,6 @@ build_range_exp (const reg_syntax_t syntax, | |||
2655 | wchar_t wc; | 2717 | wchar_t wc; |
2656 | wint_t start_wc; | 2718 | wint_t start_wc; |
2657 | wint_t end_wc; | 2719 | wint_t end_wc; |
2658 | wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; | ||
2659 | 2720 | ||
2660 | start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch | 2721 | start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch |
2661 | : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] | 2722 | : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] |
@@ -2669,11 +2730,7 @@ build_range_exp (const reg_syntax_t syntax, | |||
2669 | ? __btowc (end_ch) : end_elem->opr.wch); | 2730 | ? __btowc (end_ch) : end_elem->opr.wch); |
2670 | if (start_wc == WEOF || end_wc == WEOF) | 2731 | if (start_wc == WEOF || end_wc == WEOF) |
2671 | return REG_ECOLLATE; | 2732 | return REG_ECOLLATE; |
2672 | cmp_buf[0] = start_wc; | 2733 | else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0)) |
2673 | cmp_buf[4] = end_wc; | ||
2674 | |||
2675 | if (BE ((syntax & RE_NO_EMPTY_RANGES) | ||
2676 | && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) | ||
2677 | return REG_ERANGE; | 2734 | return REG_ERANGE; |
2678 | 2735 | ||
2679 | /* Got valid collation sequence values, add them as a new entry. | 2736 | /* Got valid collation sequence values, add them as a new entry. |
@@ -2714,9 +2771,7 @@ build_range_exp (const reg_syntax_t syntax, | |||
2714 | /* Build the table for single byte characters. */ | 2771 | /* Build the table for single byte characters. */ |
2715 | for (wc = 0; wc < SBC_MAX; ++wc) | 2772 | for (wc = 0; wc < SBC_MAX; ++wc) |
2716 | { | 2773 | { |
2717 | cmp_buf[2] = wc; | 2774 | if (start_wc <= wc && wc <= end_wc) |
2718 | if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 | ||
2719 | && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) | ||
2720 | bitset_set (sbcset, wc); | 2775 | bitset_set (sbcset, wc); |
2721 | } | 2776 | } |
2722 | } | 2777 | } |
@@ -2750,11 +2805,12 @@ build_range_exp (const reg_syntax_t syntax, | |||
2750 | 2805 | ||
2751 | static reg_errcode_t | 2806 | static reg_errcode_t |
2752 | internal_function | 2807 | internal_function |
2753 | build_collating_symbol (bitset_t sbcset, | ||
2754 | # ifdef RE_ENABLE_I18N | 2808 | # ifdef RE_ENABLE_I18N |
2755 | re_charset_t *mbcset, Idx *coll_sym_alloc, | 2809 | build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, |
2756 | # endif | 2810 | Idx *coll_sym_alloc, const unsigned char *name) |
2757 | const unsigned char *name) | 2811 | # else /* not RE_ENABLE_I18N */ |
2812 | build_collating_symbol (bitset_t sbcset, const unsigned char *name) | ||
2813 | # endif /* not RE_ENABLE_I18N */ | ||
2758 | { | 2814 | { |
2759 | size_t name_len = strlen ((const char *) name); | 2815 | size_t name_len = strlen ((const char *) name); |
2760 | if (BE (name_len != 1, 0)) | 2816 | if (BE (name_len != 1, 0)) |
@@ -2782,42 +2838,31 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
2782 | const int32_t *symb_table; | 2838 | const int32_t *symb_table; |
2783 | const unsigned char *extra; | 2839 | const unsigned char *extra; |
2784 | 2840 | ||
2785 | /* Local function for parse_bracket_exp used in _LIBC environement. | 2841 | /* Local function for parse_bracket_exp used in _LIBC environment. |
2786 | Seek the collating symbol entry correspondings to NAME. | 2842 | Seek the collating symbol entry corresponding to NAME. |
2787 | Return the index of the symbol in the SYMB_TABLE. */ | 2843 | Return the index of the symbol in the SYMB_TABLE, |
2844 | or -1 if not found. */ | ||
2788 | 2845 | ||
2789 | auto inline int32_t | 2846 | auto inline int32_t |
2790 | __attribute ((always_inline)) | 2847 | __attribute__ ((always_inline)) |
2791 | seek_collating_symbol_entry (name, name_len) | 2848 | seek_collating_symbol_entry (const unsigned char *name, size_t name_len) |
2792 | const unsigned char *name; | ||
2793 | size_t name_len; | ||
2794 | { | 2849 | { |
2795 | int32_t hash = elem_hash ((const char *) name, name_len); | 2850 | int32_t elem; |
2796 | int32_t elem = hash % table_size; | ||
2797 | if (symb_table[2 * elem] != 0) | ||
2798 | { | ||
2799 | int32_t second = hash % (table_size - 2) + 1; | ||
2800 | 2851 | ||
2801 | do | 2852 | for (elem = 0; elem < table_size; elem++) |
2802 | { | 2853 | if (symb_table[2 * elem] != 0) |
2803 | /* First compare the hashing value. */ | 2854 | { |
2804 | if (symb_table[2 * elem] == hash | 2855 | int32_t idx = symb_table[2 * elem + 1]; |
2805 | /* Compare the length of the name. */ | 2856 | /* Skip the name of collating element name. */ |
2806 | && name_len == extra[symb_table[2 * elem + 1]] | 2857 | idx += 1 + extra[idx]; |
2807 | /* Compare the name. */ | 2858 | if (/* Compare the length of the name. */ |
2808 | && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], | 2859 | name_len == extra[idx] |
2809 | name_len) == 0) | 2860 | /* Compare the name. */ |
2810 | { | 2861 | && memcmp (name, &extra[idx + 1], name_len) == 0) |
2811 | /* Yep, this is the entry. */ | 2862 | /* Yep, this is the entry. */ |
2812 | break; | 2863 | return elem; |
2813 | } | 2864 | } |
2814 | 2865 | return -1; | |
2815 | /* Next entry. */ | ||
2816 | elem += second; | ||
2817 | } | ||
2818 | while (symb_table[2 * elem] != 0); | ||
2819 | } | ||
2820 | return elem; | ||
2821 | } | 2866 | } |
2822 | 2867 | ||
2823 | /* Local function for parse_bracket_exp used in _LIBC environment. | 2868 | /* Local function for parse_bracket_exp used in _LIBC environment. |
@@ -2825,9 +2870,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
2825 | Return the value if succeeded, UINT_MAX otherwise. */ | 2870 | Return the value if succeeded, UINT_MAX otherwise. */ |
2826 | 2871 | ||
2827 | auto inline unsigned int | 2872 | auto inline unsigned int |
2828 | __attribute ((always_inline)) | 2873 | __attribute__ ((always_inline)) |
2829 | lookup_collation_sequence_value (br_elem) | 2874 | lookup_collation_sequence_value (bracket_elem_t *br_elem) |
2830 | bracket_elem_t *br_elem; | ||
2831 | { | 2875 | { |
2832 | if (br_elem->type == SB_CHAR) | 2876 | if (br_elem->type == SB_CHAR) |
2833 | { | 2877 | { |
@@ -2855,7 +2899,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
2855 | int32_t elem, idx; | 2899 | int32_t elem, idx; |
2856 | elem = seek_collating_symbol_entry (br_elem->opr.name, | 2900 | elem = seek_collating_symbol_entry (br_elem->opr.name, |
2857 | sym_name_len); | 2901 | sym_name_len); |
2858 | if (symb_table[2 * elem] != 0) | 2902 | if (elem != -1) |
2859 | { | 2903 | { |
2860 | /* We found the entry. */ | 2904 | /* We found the entry. */ |
2861 | idx = symb_table[2 * elem + 1]; | 2905 | idx = symb_table[2 * elem + 1]; |
@@ -2873,7 +2917,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
2873 | /* Return the collation sequence value. */ | 2917 | /* Return the collation sequence value. */ |
2874 | return *(unsigned int *) (extra + idx); | 2918 | return *(unsigned int *) (extra + idx); |
2875 | } | 2919 | } |
2876 | else if (symb_table[2 * elem] == 0 && sym_name_len == 1) | 2920 | else if (sym_name_len == 1) |
2877 | { | 2921 | { |
2878 | /* No valid character. Match it as a single byte | 2922 | /* No valid character. Match it as a single byte |
2879 | character. */ | 2923 | character. */ |
@@ -2886,20 +2930,17 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
2886 | return UINT_MAX; | 2930 | return UINT_MAX; |
2887 | } | 2931 | } |
2888 | 2932 | ||
2889 | /* Local function for parse_bracket_exp used in _LIBC environement. | 2933 | /* Local function for parse_bracket_exp used in _LIBC environment. |
2890 | Build the range expression which starts from START_ELEM, and ends | 2934 | Build the range expression which starts from START_ELEM, and ends |
2891 | at END_ELEM. The result are written to MBCSET and SBCSET. | 2935 | at END_ELEM. The result are written to MBCSET and SBCSET. |
2892 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and | 2936 | RANGE_ALLOC is the allocated size of mbcset->range_starts, and |
2893 | mbcset->range_ends, is a pointer argument sinse we may | 2937 | mbcset->range_ends, is a pointer argument since we may |
2894 | update it. */ | 2938 | update it. */ |
2895 | 2939 | ||
2896 | auto inline reg_errcode_t | 2940 | auto inline reg_errcode_t |
2897 | __attribute ((always_inline)) | 2941 | __attribute__ ((always_inline)) |
2898 | build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) | 2942 | build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, |
2899 | re_charset_t *mbcset; | 2943 | bracket_elem_t *start_elem, bracket_elem_t *end_elem) |
2900 | Idx *range_alloc; | ||
2901 | bitset_t sbcset; | ||
2902 | bracket_elem_t *start_elem, *end_elem; | ||
2903 | { | 2944 | { |
2904 | unsigned int ch; | 2945 | unsigned int ch; |
2905 | uint32_t start_collseq; | 2946 | uint32_t start_collseq; |
@@ -2912,6 +2953,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
2912 | 0)) | 2953 | 0)) |
2913 | return REG_ERANGE; | 2954 | return REG_ERANGE; |
2914 | 2955 | ||
2956 | /* FIXME: Implement rational ranges here, too. */ | ||
2915 | start_collseq = lookup_collation_sequence_value (start_elem); | 2957 | start_collseq = lookup_collation_sequence_value (start_elem); |
2916 | end_collseq = lookup_collation_sequence_value (end_elem); | 2958 | end_collseq = lookup_collation_sequence_value (end_elem); |
2917 | /* Check start/end collation sequence values. */ | 2959 | /* Check start/end collation sequence values. */ |
@@ -2970,33 +3012,30 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
2970 | return REG_NOERROR; | 3012 | return REG_NOERROR; |
2971 | } | 3013 | } |
2972 | 3014 | ||
2973 | /* Local function for parse_bracket_exp used in _LIBC environement. | 3015 | /* Local function for parse_bracket_exp used in _LIBC environment. |
2974 | Build the collating element which is represented by NAME. | 3016 | Build the collating element which is represented by NAME. |
2975 | The result are written to MBCSET and SBCSET. | 3017 | The result are written to MBCSET and SBCSET. |
2976 | COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a | 3018 | COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a |
2977 | pointer argument sinse we may update it. */ | 3019 | pointer argument since we may update it. */ |
2978 | 3020 | ||
2979 | auto inline reg_errcode_t | 3021 | auto inline reg_errcode_t |
2980 | __attribute ((always_inline)) | 3022 | __attribute__ ((always_inline)) |
2981 | build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) | 3023 | build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, |
2982 | re_charset_t *mbcset; | 3024 | Idx *coll_sym_alloc, const unsigned char *name) |
2983 | Idx *coll_sym_alloc; | ||
2984 | bitset_t sbcset; | ||
2985 | const unsigned char *name; | ||
2986 | { | 3025 | { |
2987 | int32_t elem, idx; | 3026 | int32_t elem, idx; |
2988 | size_t name_len = strlen ((const char *) name); | 3027 | size_t name_len = strlen ((const char *) name); |
2989 | if (nrules != 0) | 3028 | if (nrules != 0) |
2990 | { | 3029 | { |
2991 | elem = seek_collating_symbol_entry (name, name_len); | 3030 | elem = seek_collating_symbol_entry (name, name_len); |
2992 | if (symb_table[2 * elem] != 0) | 3031 | if (elem != -1) |
2993 | { | 3032 | { |
2994 | /* We found the entry. */ | 3033 | /* We found the entry. */ |
2995 | idx = symb_table[2 * elem + 1]; | 3034 | idx = symb_table[2 * elem + 1]; |
2996 | /* Skip the name of collating element name. */ | 3035 | /* Skip the name of collating element name. */ |
2997 | idx += 1 + extra[idx]; | 3036 | idx += 1 + extra[idx]; |
2998 | } | 3037 | } |
2999 | else if (symb_table[2 * elem] == 0 && name_len == 1) | 3038 | else if (name_len == 1) |
3000 | { | 3039 | { |
3001 | /* No valid character, treat it as a normal | 3040 | /* No valid character, treat it as a normal |
3002 | character. */ | 3041 | character. */ |
@@ -3076,6 +3115,10 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3076 | if (BE (sbcset == NULL, 0)) | 3115 | if (BE (sbcset == NULL, 0)) |
3077 | #endif /* RE_ENABLE_I18N */ | 3116 | #endif /* RE_ENABLE_I18N */ |
3078 | { | 3117 | { |
3118 | re_free (sbcset); | ||
3119 | #ifdef RE_ENABLE_I18N | ||
3120 | re_free (mbcset); | ||
3121 | #endif | ||
3079 | *err = REG_ESPACE; | 3122 | *err = REG_ESPACE; |
3080 | return NULL; | 3123 | return NULL; |
3081 | } | 3124 | } |
@@ -3235,7 +3278,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, | |||
3235 | #ifdef RE_ENABLE_I18N | 3278 | #ifdef RE_ENABLE_I18N |
3236 | mbcset, &char_class_alloc, | 3279 | mbcset, &char_class_alloc, |
3237 | #endif /* RE_ENABLE_I18N */ | 3280 | #endif /* RE_ENABLE_I18N */ |
3238 | start_elem.opr.name, syntax); | 3281 | (const char *) start_elem.opr.name, |
3282 | syntax); | ||
3239 | if (BE (*err != REG_NOERROR, 0)) | 3283 | if (BE (*err != REG_NOERROR, 0)) |
3240 | goto parse_bracket_exp_free_return; | 3284 | goto parse_bracket_exp_free_return; |
3241 | break; | 3285 | break; |
@@ -3414,7 +3458,7 @@ parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, | |||
3414 | Build the equivalence class which is represented by NAME. | 3458 | Build the equivalence class which is represented by NAME. |
3415 | The result are written to MBCSET and SBCSET. | 3459 | The result are written to MBCSET and SBCSET. |
3416 | EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, | 3460 | EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, |
3417 | is a pointer argument sinse we may update it. */ | 3461 | is a pointer argument since we may update it. */ |
3418 | 3462 | ||
3419 | static reg_errcode_t | 3463 | static reg_errcode_t |
3420 | #ifdef RE_ENABLE_I18N | 3464 | #ifdef RE_ENABLE_I18N |
@@ -3445,19 +3489,18 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) | |||
3445 | _NL_COLLATE_EXTRAMB); | 3489 | _NL_COLLATE_EXTRAMB); |
3446 | indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, | 3490 | indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, |
3447 | _NL_COLLATE_INDIRECTMB); | 3491 | _NL_COLLATE_INDIRECTMB); |
3448 | idx1 = findidx (&cp); | 3492 | idx1 = findidx (&cp, -1); |
3449 | if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) | 3493 | if (BE (idx1 == 0 || *cp != '\0', 0)) |
3450 | /* This isn't a valid character. */ | 3494 | /* This isn't a valid character. */ |
3451 | return REG_ECOLLATE; | 3495 | return REG_ECOLLATE; |
3452 | 3496 | ||
3453 | /* Build single byte matcing table for this equivalence class. */ | 3497 | /* Build single byte matching table for this equivalence class. */ |
3454 | char_buf[1] = (unsigned char) '\0'; | ||
3455 | len = weights[idx1 & 0xffffff]; | 3498 | len = weights[idx1 & 0xffffff]; |
3456 | for (ch = 0; ch < SBC_MAX; ++ch) | 3499 | for (ch = 0; ch < SBC_MAX; ++ch) |
3457 | { | 3500 | { |
3458 | char_buf[0] = ch; | 3501 | char_buf[0] = ch; |
3459 | cp = char_buf; | 3502 | cp = char_buf; |
3460 | idx2 = findidx (&cp); | 3503 | idx2 = findidx (&cp, 1); |
3461 | /* | 3504 | /* |
3462 | idx2 = table[ch]; | 3505 | idx2 = table[ch]; |
3463 | */ | 3506 | */ |
@@ -3510,20 +3553,20 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) | |||
3510 | Build the character class which is represented by NAME. | 3553 | Build the character class which is represented by NAME. |
3511 | The result are written to MBCSET and SBCSET. | 3554 | The result are written to MBCSET and SBCSET. |
3512 | CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, | 3555 | CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, |
3513 | is a pointer argument sinse we may update it. */ | 3556 | is a pointer argument since we may update it. */ |
3514 | 3557 | ||
3515 | static reg_errcode_t | 3558 | static reg_errcode_t |
3516 | #ifdef RE_ENABLE_I18N | 3559 | #ifdef RE_ENABLE_I18N |
3517 | build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, | 3560 | build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, |
3518 | re_charset_t *mbcset, Idx *char_class_alloc, | 3561 | re_charset_t *mbcset, Idx *char_class_alloc, |
3519 | const unsigned char *class_name, reg_syntax_t syntax) | 3562 | const char *class_name, reg_syntax_t syntax) |
3520 | #else /* not RE_ENABLE_I18N */ | 3563 | #else /* not RE_ENABLE_I18N */ |
3521 | build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, | 3564 | build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, |
3522 | const unsigned char *class_name, reg_syntax_t syntax) | 3565 | const char *class_name, reg_syntax_t syntax) |
3523 | #endif /* not RE_ENABLE_I18N */ | 3566 | #endif /* not RE_ENABLE_I18N */ |
3524 | { | 3567 | { |
3525 | int i; | 3568 | int i; |
3526 | const char *name = (const char *) class_name; | 3569 | const char *name = class_name; |
3527 | 3570 | ||
3528 | /* In case of REG_ICASE "upper" and "lower" match the both of | 3571 | /* In case of REG_ICASE "upper" and "lower" match the both of |
3529 | upper and lower cases. */ | 3572 | upper and lower cases. */ |
@@ -3597,8 +3640,8 @@ build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, | |||
3597 | 3640 | ||
3598 | static bin_tree_t * | 3641 | static bin_tree_t * |
3599 | build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, | 3642 | build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, |
3600 | const unsigned char *class_name, | 3643 | const char *class_name, |
3601 | const unsigned char *extra, bool non_match, | 3644 | const char *extra, bool non_match, |
3602 | reg_errcode_t *err) | 3645 | reg_errcode_t *err) |
3603 | { | 3646 | { |
3604 | re_bitset_ptr_t sbcset; | 3647 | re_bitset_ptr_t sbcset; |
@@ -3704,8 +3747,9 @@ build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, | |||
3704 | } | 3747 | } |
3705 | 3748 | ||
3706 | /* This is intended for the expressions like "a{1,3}". | 3749 | /* This is intended for the expressions like "a{1,3}". |
3707 | Fetch a number from `input', and return the number. | 3750 | Fetch a number from 'input', and return the number. |
3708 | Return REG_MISSING if the number field is empty like "{,1}". | 3751 | Return REG_MISSING if the number field is empty like "{,1}". |
3752 | Return RE_DUP_MAX + 1 if the number field is too large. | ||
3709 | Return REG_ERROR if an error occurred. */ | 3753 | Return REG_ERROR if an error occurred. */ |
3710 | 3754 | ||
3711 | static Idx | 3755 | static Idx |
@@ -3724,8 +3768,9 @@ fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) | |||
3724 | num = ((token->type != CHARACTER || c < '0' || '9' < c | 3768 | num = ((token->type != CHARACTER || c < '0' || '9' < c |
3725 | || num == REG_ERROR) | 3769 | || num == REG_ERROR) |
3726 | ? REG_ERROR | 3770 | ? REG_ERROR |
3727 | : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); | 3771 | : num == REG_MISSING |
3728 | num = (num > RE_DUP_MAX) ? REG_ERROR : num; | 3772 | ? c - '0' |
3773 | : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); | ||
3729 | } | 3774 | } |
3730 | return num; | 3775 | return num; |
3731 | } | 3776 | } |
@@ -3799,7 +3844,7 @@ create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, | |||
3799 | static reg_errcode_t | 3844 | static reg_errcode_t |
3800 | mark_opt_subexp (void *extra, bin_tree_t *node) | 3845 | mark_opt_subexp (void *extra, bin_tree_t *node) |
3801 | { | 3846 | { |
3802 | Idx idx = (Idx) (long) extra; | 3847 | Idx idx = (uintptr_t) extra; |
3803 | if (node->token.type == SUBEXP && node->token.opr.idx == idx) | 3848 | if (node->token.type == SUBEXP && node->token.opr.idx == idx) |
3804 | node->token.opt_subexp = 1; | 3849 | node->token.opt_subexp = 1; |
3805 | 3850 | ||