diff options
Diffstat (limited to 'gl/mbuiter.h')
-rw-r--r-- | gl/mbuiter.h | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/gl/mbuiter.h b/gl/mbuiter.h new file mode 100644 index 0000000..9da3a6c --- /dev/null +++ b/gl/mbuiter.h | |||
@@ -0,0 +1,203 @@ | |||
1 | /* Iterating through multibyte strings: macros for multi-byte encodings. | ||
2 | Copyright (C) 2001, 2005 Free Software Foundation, Inc. | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2, or (at your option) | ||
7 | any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software Foundation, | ||
16 | Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ | ||
17 | |||
18 | /* Written by Bruno Haible <bruno@clisp.org>. */ | ||
19 | |||
20 | /* The macros in this file implement forward iteration through a | ||
21 | multi-byte string, without knowing its length a-priori. | ||
22 | |||
23 | With these macros, an iteration loop that looks like | ||
24 | |||
25 | char *iter; | ||
26 | for (iter = buf; *iter != '\0'; iter++) | ||
27 | { | ||
28 | do_something (*iter); | ||
29 | } | ||
30 | |||
31 | becomes | ||
32 | |||
33 | mbui_iterator_t iter; | ||
34 | for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter)) | ||
35 | { | ||
36 | do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter))); | ||
37 | } | ||
38 | |||
39 | The benefit of these macros over plain use of mbrtowc is: | ||
40 | - Handling of invalid multibyte sequences is possible without | ||
41 | making the code more complicated, while still preserving the | ||
42 | invalid multibyte sequences. | ||
43 | |||
44 | Compared to mbiter.h, the macros here don't need to know the string's | ||
45 | length a-priori. The downside is that at each step, the look-ahead | ||
46 | that guards against overrunning the terminating '\0' is more expensive. | ||
47 | The mbui_* macros are therefore suitable when there is a high probability | ||
48 | that only the first few multibyte characters need to be inspected. | ||
49 | Whereas the mbi_* macros are better if usually the iteration runs | ||
50 | through the entire string. | ||
51 | |||
52 | mbui_iterator_t | ||
53 | is a type usable for variable declarations. | ||
54 | |||
55 | mbui_init (iter, startptr) | ||
56 | initializes the iterator, starting at startptr. | ||
57 | |||
58 | mbui_avail (iter) | ||
59 | returns true if there are more multibyte chracters available before | ||
60 | the end of string is reached. In this case, mbui_cur (iter) is | ||
61 | initialized to the next multibyte chracter. | ||
62 | |||
63 | mbui_advance (iter) | ||
64 | advances the iterator by one multibyte character. | ||
65 | |||
66 | mbui_cur (iter) | ||
67 | returns the current multibyte character, of type mbchar_t. All the | ||
68 | macros defined in mbchar.h can be used on it. | ||
69 | |||
70 | mbui_cur_ptr (iter) | ||
71 | return a pointer to the beginning of the current multibyte character. | ||
72 | |||
73 | mbui_reloc (iter, ptrdiff) | ||
74 | relocates iterator when the string is moved by ptrdiff bytes. | ||
75 | |||
76 | Here are the function prototypes of the macros. | ||
77 | |||
78 | extern void mbui_init (mbui_iterator_t iter, const char *startptr); | ||
79 | extern bool mbui_avail (mbui_iterator_t iter); | ||
80 | extern void mbui_advance (mbui_iterator_t iter); | ||
81 | extern mbchar_t mbui_cur (mbui_iterator_t iter); | ||
82 | extern const char * mbui_cur_ptr (mbui_iterator_t iter); | ||
83 | extern void mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff); | ||
84 | */ | ||
85 | |||
86 | #ifndef _MBUITER_H | ||
87 | #define _MBUITER_H 1 | ||
88 | |||
89 | #include <assert.h> | ||
90 | #include <stdbool.h> | ||
91 | #include <stdlib.h> | ||
92 | |||
93 | /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before | ||
94 | <wchar.h>. | ||
95 | BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before | ||
96 | <wchar.h>. */ | ||
97 | #include <stdio.h> | ||
98 | #include <time.h> | ||
99 | #include <wchar.h> | ||
100 | |||
101 | #include "mbchar.h" | ||
102 | #include "strnlen1.h" | ||
103 | |||
104 | struct mbuiter_multi | ||
105 | { | ||
106 | bool in_shift; /* true if next byte may not be interpreted as ASCII */ | ||
107 | mbstate_t state; /* if in_shift: current shift state */ | ||
108 | bool next_done; /* true if mbui_avail has already filled the following */ | ||
109 | struct mbchar cur; /* the current character: | ||
110 | const char *cur.ptr pointer to current character | ||
111 | The following are only valid after mbui_avail. | ||
112 | size_t cur.bytes number of bytes of current character | ||
113 | bool cur.wc_valid true if wc is a valid wide character | ||
114 | wchar_t cur.wc if wc_valid: the current character | ||
115 | */ | ||
116 | }; | ||
117 | |||
118 | static inline void | ||
119 | mbuiter_multi_next (struct mbuiter_multi *iter) | ||
120 | { | ||
121 | if (iter->next_done) | ||
122 | return; | ||
123 | if (iter->in_shift) | ||
124 | goto with_shift; | ||
125 | /* Handle most ASCII characters quickly, without calling mbrtowc(). */ | ||
126 | if (is_basic (*iter->cur.ptr)) | ||
127 | { | ||
128 | /* These characters are part of the basic character set. ISO C 99 | ||
129 | guarantees that their wide character code is identical to their | ||
130 | char code. */ | ||
131 | iter->cur.bytes = 1; | ||
132 | iter->cur.wc = *iter->cur.ptr; | ||
133 | iter->cur.wc_valid = true; | ||
134 | } | ||
135 | else | ||
136 | { | ||
137 | assert (mbsinit (&iter->state)); | ||
138 | iter->in_shift = true; | ||
139 | with_shift: | ||
140 | iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr, | ||
141 | strnlen1 (iter->cur.ptr, MB_CUR_MAX), | ||
142 | &iter->state); | ||
143 | if (iter->cur.bytes == (size_t) -1) | ||
144 | { | ||
145 | /* An invalid multibyte sequence was encountered. */ | ||
146 | iter->cur.bytes = 1; | ||
147 | iter->cur.wc_valid = false; | ||
148 | /* Whether to set iter->in_shift = false and reset iter->state | ||
149 | or not is not very important; the string is bogus anyway. */ | ||
150 | } | ||
151 | else if (iter->cur.bytes == (size_t) -2) | ||
152 | { | ||
153 | /* An incomplete multibyte character at the end. */ | ||
154 | iter->cur.bytes = strlen (iter->cur.ptr); | ||
155 | iter->cur.wc_valid = false; | ||
156 | /* Whether to set iter->in_shift = false and reset iter->state | ||
157 | or not is not important; the string end is reached anyway. */ | ||
158 | } | ||
159 | else | ||
160 | { | ||
161 | if (iter->cur.bytes == 0) | ||
162 | { | ||
163 | /* A null wide character was encountered. */ | ||
164 | iter->cur.bytes = 1; | ||
165 | assert (*iter->cur.ptr == '\0'); | ||
166 | assert (iter->cur.wc == 0); | ||
167 | } | ||
168 | iter->cur.wc_valid = true; | ||
169 | |||
170 | /* When in the initial state, we can go back treating ASCII | ||
171 | characters more quickly. */ | ||
172 | if (mbsinit (&iter->state)) | ||
173 | iter->in_shift = false; | ||
174 | } | ||
175 | } | ||
176 | iter->next_done = true; | ||
177 | } | ||
178 | |||
179 | static inline void | ||
180 | mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff) | ||
181 | { | ||
182 | iter->cur.ptr += ptrdiff; | ||
183 | } | ||
184 | |||
185 | /* Iteration macros. */ | ||
186 | typedef struct mbuiter_multi mbui_iterator_t; | ||
187 | #define mbui_init(iter, startptr) \ | ||
188 | ((iter).cur.ptr = (startptr), \ | ||
189 | (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \ | ||
190 | (iter).next_done = false) | ||
191 | #define mbui_avail(iter) \ | ||
192 | (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur)) | ||
193 | #define mbui_advance(iter) \ | ||
194 | ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false) | ||
195 | |||
196 | /* Access to the current character. */ | ||
197 | #define mbui_cur(iter) (iter).cur | ||
198 | #define mbui_cur_ptr(iter) (iter).cur.ptr | ||
199 | |||
200 | /* Relocation. */ | ||
201 | #define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&iter, ptrdiff) | ||
202 | |||
203 | #endif /* _MBUITER_H */ | ||