summaryrefslogtreecommitdiffstats
path: root/gl/mbrtowc.c
diff options
context:
space:
mode:
Diffstat (limited to 'gl/mbrtowc.c')
-rw-r--r--gl/mbrtowc.c394
1 files changed, 75 insertions, 319 deletions
diff --git a/gl/mbrtowc.c b/gl/mbrtowc.c
index 5ee44aea..4b164edd 100644
--- a/gl/mbrtowc.c
+++ b/gl/mbrtowc.c
@@ -1,19 +1,19 @@
1/* Convert multibyte character to wide character. 1/* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2013 Free Software Foundation, Inc. 2 Copyright (C) 1999-2002, 2005-2022 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008. 3 Written by Bruno Haible <bruno@clisp.org>, 2008.
4 4
5 This program is free software: you can redistribute it and/or modify 5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU Lesser General Public License as
7 the Free Software Foundation; either version 3 of the License, or 7 published by the Free Software Foundation; either version 2.1 of the
8 (at your option) any later version. 8 License, or (at your option) any later version.
9 9
10 This program is distributed in the hope that it will be useful, 10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details. 13 GNU Lesser General Public License for more details.
14 14
15 You should have received a copy of the GNU General Public License 15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 17
18#include <config.h> 18#include <config.h>
19 19
@@ -21,314 +21,67 @@
21#include <wchar.h> 21#include <wchar.h>
22 22
23#if GNULIB_defined_mbstate_t 23#if GNULIB_defined_mbstate_t
24/* Implement mbrtowc() on top of mbtowc(). */ 24/* Implement mbrtowc() on top of mbtowc() for the non-UTF-8 locales
25 and directly for the UTF-8 locales. */
25 26
26# include <errno.h> 27# include <errno.h>
28# include <stdint.h>
27# include <stdlib.h> 29# include <stdlib.h>
28 30
29# include "localcharset.h" 31# if defined _WIN32 && !defined __CYGWIN__
30# include "streq.h"
31# include "verify.h"
32
33
34verify (sizeof (mbstate_t) >= 4);
35
36static char internal_state[4];
37
38size_t
39mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40{
41 char *pstate = (char *)ps;
42
43 if (s == NULL)
44 {
45 pwc = NULL;
46 s = "";
47 n = 1;
48 }
49 32
50 if (n == 0) 33# define WIN32_LEAN_AND_MEAN /* avoid including junk */
51 return (size_t)(-2); 34# include <windows.h>
52 35
53 /* Here n > 0. */ 36# elif HAVE_PTHREAD_API
54 37
55 if (pstate == NULL) 38# include <pthread.h>
56 pstate = internal_state; 39# if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS
40# include <threads.h>
41# pragma weak thrd_exit
42# define c11_threads_in_use() (thrd_exit != NULL)
43# else
44# define c11_threads_in_use() 0
45# endif
57 46
58 { 47# elif HAVE_THREADS_H
59 size_t nstate = pstate[0];
60 char buf[4];
61 const char *p;
62 size_t m;
63 48
64 switch (nstate) 49# include <threads.h>
65 {
66 case 0:
67 p = s;
68 m = n;
69 break;
70 case 3:
71 buf[2] = pstate[3];
72 /*FALLTHROUGH*/
73 case 2:
74 buf[1] = pstate[2];
75 /*FALLTHROUGH*/
76 case 1:
77 buf[0] = pstate[1];
78 p = buf;
79 m = nstate;
80 buf[m++] = s[0];
81 if (n >= 2 && m < 4)
82 {
83 buf[m++] = s[1];
84 if (n >= 3 && m < 4)
85 buf[m++] = s[2];
86 }
87 break;
88 default:
89 errno = EINVAL;
90 return (size_t)(-1);
91 }
92 50
93 /* Here m > 0. */
94
95# if __GLIBC__ || defined __UCLIBC__
96 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
97 mbtowc (NULL, NULL, 0);
98# endif 51# endif
99 {
100 int res = mbtowc (pwc, p, m);
101
102 if (res >= 0)
103 {
104 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
105 abort ();
106 if (nstate >= (res > 0 ? res : 1))
107 abort ();
108 res -= nstate;
109 pstate[0] = 0;
110 return res;
111 }
112
113 /* mbtowc does not distinguish between invalid and incomplete multibyte
114 sequences. But mbrtowc needs to make this distinction.
115 There are two possible approaches:
116 - Use iconv() and its return value.
117 - Use built-in knowledge about the possible encodings.
118 Given the low quality of implementation of iconv() on the systems that
119 lack mbrtowc(), we use the second approach.
120 The possible encodings are:
121 - 8-bit encodings,
122 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
123 - UTF-8.
124 Use specialized code for each. */
125 if (m >= 4 || m >= MB_CUR_MAX)
126 goto invalid;
127 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
128 {
129 const char *encoding = locale_charset ();
130
131 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
132 {
133 /* Cf. unistr/u8-mblen.c. */
134 unsigned char c = (unsigned char) p[0];
135
136 if (c >= 0xc2)
137 {
138 if (c < 0xe0)
139 {
140 if (m == 1)
141 goto incomplete;
142 }
143 else if (c < 0xf0)
144 {
145 if (m == 1)
146 goto incomplete;
147 if (m == 2)
148 {
149 unsigned char c2 = (unsigned char) p[1];
150
151 if ((c2 ^ 0x80) < 0x40
152 && (c >= 0xe1 || c2 >= 0xa0)
153 && (c != 0xed || c2 < 0xa0))
154 goto incomplete;
155 }
156 }
157 else if (c <= 0xf4)
158 {
159 if (m == 1)
160 goto incomplete;
161 else /* m == 2 || m == 3 */
162 {
163 unsigned char c2 = (unsigned char) p[1];
164
165 if ((c2 ^ 0x80) < 0x40
166 && (c >= 0xf1 || c2 >= 0x90)
167 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
168 {
169 if (m == 2)
170 goto incomplete;
171 else /* m == 3 */
172 {
173 unsigned char c3 = (unsigned char) p[2];
174
175 if ((c3 ^ 0x80) < 0x40)
176 goto incomplete;
177 }
178 }
179 }
180 }
181 }
182 goto invalid;
183 }
184
185 /* As a reference for this code, you can use the GNU libiconv
186 implementation. Look for uses of the RET_TOOFEW macro. */
187
188 if (STREQ_OPT (encoding,
189 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
190 {
191 if (m == 1)
192 {
193 unsigned char c = (unsigned char) p[0];
194
195 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
196 goto incomplete;
197 }
198 if (m == 2)
199 {
200 unsigned char c = (unsigned char) p[0];
201
202 if (c == 0x8f)
203 {
204 unsigned char c2 = (unsigned char) p[1];
205
206 if (c2 >= 0xa1 && c2 < 0xff)
207 goto incomplete;
208 }
209 }
210 goto invalid;
211 }
212 if (STREQ_OPT (encoding,
213 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
214 || STREQ_OPT (encoding,
215 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
216 || STREQ_OPT (encoding,
217 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
218 {
219 if (m == 1)
220 {
221 unsigned char c = (unsigned char) p[0];
222
223 if (c >= 0xa1 && c < 0xff)
224 goto incomplete;
225 }
226 goto invalid;
227 }
228 if (STREQ_OPT (encoding,
229 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
230 {
231 if (m == 1)
232 {
233 unsigned char c = (unsigned char) p[0];
234
235 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
236 goto incomplete;
237 }
238 else /* m == 2 || m == 3 */
239 {
240 unsigned char c = (unsigned char) p[0];
241
242 if (c == 0x8e)
243 goto incomplete;
244 }
245 goto invalid;
246 }
247 if (STREQ_OPT (encoding,
248 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
249 {
250 if (m == 1)
251 {
252 unsigned char c = (unsigned char) p[0];
253
254 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
255 goto incomplete;
256 }
257 else /* m == 2 || m == 3 */
258 {
259 unsigned char c = (unsigned char) p[0];
260
261 if (c >= 0x90 && c <= 0xe3)
262 {
263 unsigned char c2 = (unsigned char) p[1];
264
265 if (c2 >= 0x30 && c2 <= 0x39)
266 {
267 if (m == 2)
268 goto incomplete;
269 else /* m == 3 */
270 {
271 unsigned char c3 = (unsigned char) p[2];
272
273 if (c3 >= 0x81 && c3 <= 0xfe)
274 goto incomplete;
275 }
276 }
277 }
278 }
279 goto invalid;
280 }
281 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
282 {
283 if (m == 1)
284 {
285 unsigned char c = (unsigned char) p[0];
286
287 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
288 || (c >= 0xf0 && c <= 0xf9))
289 goto incomplete;
290 }
291 goto invalid;
292 }
293 52
294 /* An unknown multibyte encoding. */ 53# include "attribute.h"
295 goto incomplete; 54# include "verify.h"
296 } 55# include "lc-charset-dispatch.h"
56# include "mbtowc-lock.h"
297 57
298 incomplete: 58verify (sizeof (mbstate_t) >= 4);
299 { 59static char internal_state[4];
300 size_t k = nstate;
301 /* Here 0 <= k < m < 4. */
302 pstate[++k] = s[0];
303 if (k < m)
304 {
305 pstate[++k] = s[1];
306 if (k < m)
307 pstate[++k] = s[2];
308 }
309 if (k != m)
310 abort ();
311 }
312 pstate[0] = m;
313 return (size_t)(-2);
314 60
315 invalid: 61size_t
316 errno = EILSEQ; 62mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
317 /* The conversion state is undefined, says POSIX. */ 63{
318 return (size_t)(-1); 64# define FITS_IN_CHAR_TYPE(wc) ((wc) <= WCHAR_MAX)
319 } 65# include "mbrtowc-impl.h"
320 }
321} 66}
322 67
323#else 68#else
324/* Override the system's mbrtowc() function. */ 69/* Override the system's mbrtowc() function. */
325 70
71# if MBRTOWC_IN_C_LOCALE_MAYBE_EILSEQ
72# include "hard-locale.h"
73# include <locale.h>
74# endif
75
326# undef mbrtowc 76# undef mbrtowc
327 77
328size_t 78size_t
329rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) 79rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
330{ 80{
331# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG 81 size_t ret;
82 wchar_t wc;
83
84# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
332 if (s == NULL) 85 if (s == NULL)
333 { 86 {
334 pwc = NULL; 87 pwc = NULL;
@@ -337,6 +90,14 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
337 } 90 }
338# endif 91# endif
339 92
93# if MBRTOWC_EMPTY_INPUT_BUG
94 if (n == 0)
95 return (size_t) -2;
96# endif
97
98 if (! pwc)
99 pwc = &wc;
100
340# if MBRTOWC_RETVAL_BUG 101# if MBRTOWC_RETVAL_BUG
341 { 102 {
342 static mbstate_t internal_state; 103 static mbstate_t internal_state;
@@ -352,8 +113,7 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
352 size_t count = 0; 113 size_t count = 0;
353 for (; n > 0; s++, n--) 114 for (; n > 0; s++, n--)
354 { 115 {
355 wchar_t wc; 116 ret = mbrtowc (&wc, s, 1, ps);
356 size_t ret = mbrtowc (&wc, s, 1, ps);
357 117
358 if (ret == (size_t)(-1)) 118 if (ret == (size_t)(-1))
359 return (size_t)(-1); 119 return (size_t)(-1);
@@ -361,8 +121,7 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
361 if (ret != (size_t)(-2)) 121 if (ret != (size_t)(-2))
362 { 122 {
363 /* The multibyte character has been completed. */ 123 /* The multibyte character has been completed. */
364 if (pwc != NULL) 124 *pwc = wc;
365 *pwc = wc;
366 return (wc == 0 ? 0 : count); 125 return (wc == 0 ? 0 : count);
367 } 126 }
368 } 127 }
@@ -371,32 +130,29 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
371 } 130 }
372# endif 131# endif
373 132
374# if MBRTOWC_NUL_RETVAL_BUG 133# if MBRTOWC_STORES_INCOMPLETE_BUG
375 { 134 ret = mbrtowc (&wc, s, n, ps);
376 wchar_t wc; 135 if (ret < (size_t) -2 && pwc != NULL)
377 size_t ret = mbrtowc (&wc, s, n, ps); 136 *pwc = wc;
378
379 if (ret != (size_t)(-1) && ret != (size_t)(-2))
380 {
381 if (pwc != NULL)
382 *pwc = wc;
383 if (wc == 0)
384 ret = 0;
385 }
386 return ret;
387 }
388# else 137# else
389 { 138 ret = mbrtowc (pwc, s, n, ps);
390# if MBRTOWC_NULL_ARG1_BUG 139# endif
391 wchar_t dummy;
392 140
393 if (pwc == NULL) 141# if MBRTOWC_NUL_RETVAL_BUG
394 pwc = &dummy; 142 if (ret < (size_t) -2 && !*pwc)
395# endif 143 return 0;
144# endif
396 145
397 return mbrtowc (pwc, s, n, ps); 146# if MBRTOWC_IN_C_LOCALE_MAYBE_EILSEQ
398 } 147 if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
148 {
149 unsigned char uc = *s;
150 *pwc = uc;
151 return 1;
152 }
399# endif 153# endif
154
155 return ret;
400} 156}
401 157
402#endif 158#endif