summaryrefslogtreecommitdiffstats
path: root/gl/mbrtowc.c
diff options
context:
space:
mode:
Diffstat (limited to 'gl/mbrtowc.c')
-rw-r--r--gl/mbrtowc.c349
1 files changed, 349 insertions, 0 deletions
diff --git a/gl/mbrtowc.c b/gl/mbrtowc.c
new file mode 100644
index 00000000..603f0060
--- /dev/null
+++ b/gl/mbrtowc.c
@@ -0,0 +1,349 @@
1/* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2008 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17
18#include <config.h>
19
20/* Specification. */
21#include <wchar.h>
22
23#if GNULIB_defined_mbstate_t
24/* Implement mbrtowc() on top of mbtowc(). */
25
26# include <errno.h>
27# include <stdlib.h>
28
29# include "localcharset.h"
30# include "streq.h"
31# include "verify.h"
32
33
34verify (sizeof (mbstate_t) >= 4);
35
36static char internal_state[4];
37
38size_t
39mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40{
41 char *pstate = (char *)ps;
42
43 if (pstate == NULL)
44 pstate = internal_state;
45
46 if (s == NULL)
47 {
48 pwc = NULL;
49 s = "";
50 n = 1;
51 }
52
53 if (n == 0)
54 return (size_t)(-2);
55
56 /* Here n > 0. */
57 {
58 size_t nstate = pstate[0];
59 char buf[4];
60 const char *p;
61 size_t m;
62
63 switch (nstate)
64 {
65 case 0:
66 p = s;
67 m = n;
68 break;
69 case 3:
70 buf[2] = pstate[3];
71 /*FALLTHROUGH*/
72 case 2:
73 buf[1] = pstate[2];
74 /*FALLTHROUGH*/
75 case 1:
76 buf[0] = pstate[1];
77 p = buf;
78 m = nstate;
79 buf[m++] = s[0];
80 if (n >= 2 && m < 4)
81 {
82 buf[m++] = s[1];
83 if (n >= 3 && m < 4)
84 buf[m++] = s[2];
85 }
86 break;
87 default:
88 errno = EINVAL;
89 return (size_t)(-1);
90 }
91
92 /* Here 0 < m ≤ 4. */
93
94# if __GLIBC__
95 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
96 mbtowc (NULL, NULL, 0);
97# endif
98 {
99 int res = mbtowc (pwc, p, m);
100
101 if (res >= 0)
102 {
103 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
104 abort ();
105 if (nstate >= (res > 0 ? res : 1))
106 abort ();
107 res -= nstate;
108 pstate[0] = 0;
109 return res;
110 }
111
112 /* mbtowc does not distinguish between invalid and incomplete multibyte
113 sequences. But mbrtowc needs to make this distinction.
114 There are two possible approaches:
115 - Use iconv() and its return value.
116 - Use built-in knowledge about the possible encodings.
117 Given the low quality of implementation of iconv() on the systems that
118 lack mbrtowc(), we use the second approach.
119 The possible encodings are:
120 - 8-bit encodings,
121 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, SJIS,
122 - UTF-8.
123 Use specialized code for each. */
124 if (m >= 4 || m >= MB_CUR_MAX)
125 goto invalid;
126 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
127 {
128 const char *encoding = locale_charset ();
129
130 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
131 {
132 /* Cf. unistr/u8-mblen.c. */
133 unsigned char c = (unsigned char) p[0];
134
135 if (c >= 0xc2)
136 {
137 if (c < 0xe0)
138 {
139 if (m == 1)
140 goto incomplete;
141 }
142 else if (c < 0xf0)
143 {
144 if (m == 1)
145 goto incomplete;
146 if (m == 2)
147 {
148 unsigned char c2 = (unsigned char) p[1];
149
150 if ((c2 ^ 0x80) < 0x40
151 && (c >= 0xe1 || c2 >= 0xa0)
152 && (c != 0xed || c2 < 0xa0))
153 goto incomplete;
154 }
155 }
156 else if (c <= 0xf4)
157 {
158 if (m == 1)
159 goto incomplete;
160 else /* m == 2 || m == 3 */
161 {
162 unsigned char c2 = (unsigned char) p[1];
163
164 if ((c2 ^ 0x80) < 0x40
165 && (c >= 0xf1 || c2 >= 0x90)
166 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
167 {
168 if (m == 2)
169 goto incomplete;
170 else /* m == 3 */
171 {
172 unsigned char c3 = (unsigned char) p[2];
173
174 if ((c3 ^ 0x80) < 0x40)
175 goto incomplete;
176 }
177 }
178 }
179 }
180 }
181 goto invalid;
182 }
183
184 /* As a reference for this code, you can use the GNU libiconv
185 implementation. Look for uses of the RET_TOOFEW macro. */
186
187 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
188 {
189 if (m == 1)
190 {
191 unsigned char c = (unsigned char) p[0];
192
193 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
194 goto incomplete;
195 }
196 if (m == 2)
197 {
198 unsigned char c = (unsigned char) p[0];
199
200 if (c == 0x8f)
201 {
202 unsigned char c2 = (unsigned char) p[1];
203
204 if (c2 >= 0xa1 && c2 < 0xff)
205 goto incomplete;
206 }
207 }
208 goto invalid;
209 }
210 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
213 {
214 if (m == 1)
215 {
216 unsigned char c = (unsigned char) p[0];
217
218 if (c >= 0xa1 && c < 0xff)
219 goto incomplete;
220 }
221 goto invalid;
222 }
223 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
224 {
225 if (m == 1)
226 {
227 unsigned char c = (unsigned char) p[0];
228
229 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
230 goto incomplete;
231 }
232 else /* m == 2 || m == 3 */
233 {
234 unsigned char c = (unsigned char) p[0];
235
236 if (c == 0x8e)
237 goto incomplete;
238 }
239 goto invalid;
240 }
241 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
242 {
243 if (m == 1)
244 {
245 unsigned char c = (unsigned char) p[0];
246
247 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
248 || (c >= 0xf0 && c <= 0xf9))
249 goto incomplete;
250 }
251 goto invalid;
252 }
253
254 /* An unknown multibyte encoding. */
255 goto incomplete;
256 }
257
258 incomplete:
259 {
260 size_t k = nstate;
261 /* Here 0 < k < m < 4. */
262 pstate[++k] = s[0];
263 if (k < m)
264 pstate[++k] = s[1];
265 if (k != m)
266 abort ();
267 }
268 pstate[0] = m;
269 return (size_t)(-2);
270
271 invalid:
272 errno = EILSEQ;
273 /* The conversion state is undefined, says POSIX. */
274 return (size_t)(-1);
275 }
276 }
277}
278
279#else
280/* Override the system's mbrtowc() function. */
281
282# undef mbrtowc
283
/* Replacement for the system's mbrtowc() that wraps it and compensates
   for the defects selected by the MBRTOWC_*_BUG configuration macros.

   PWC  where to store the wide character; may be NULL.
   S    the input bytes; NULL means the same as
        rpl_mbrtowc (NULL, "", 1, PS).
   N    the number of bytes available at S.
   PS   the conversion state; may be NULL.

   Returns what the system's mbrtowc() returns, except for the
   corrections applied below.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  /* Normalize the S == NULL case ourselves.  This is also needed for the
     MBRTOWC_NUL_RETVAL_BUG code below: it passes its own local variable
     to the system function, which stores nothing in it when S is NULL,
     so the local would otherwise be read uninitialized and copied into
     *PWC.  */
# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_NUL_RETVAL_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We can not call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte,
           counting the consumed bytes ourselves instead of relying on
           the system function's return value.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            wchar_t wc;
            size_t ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All N bytes were consumed without completing a character.  */
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    /* Convert into a local variable so that the result can be inspected
       even when PWC is NULL, and return 0 for the null wide character.  */
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
        if (pwc != NULL)
          *pwc = wc;
        if (wc == 0)
          ret = 0;
      }
    return ret;
  }
# else
  return mbrtowc (pwc, s, n, ps);
# endif
}
348
349#endif