diff options
Diffstat (limited to 'gl/localcharset.c')
-rw-r--r-- | gl/localcharset.c | 460 |
1 files changed, 460 insertions, 0 deletions
diff --git a/gl/localcharset.c b/gl/localcharset.c new file mode 100644 index 00000000..4f319487 --- /dev/null +++ b/gl/localcharset.c | |||
@@ -0,0 +1,460 @@ | |||
1 | /* Determine a canonical name for the current locale's character encoding. | ||
2 | |||
3 | Copyright (C) 2000-2006 Free Software Foundation, Inc. | ||
4 | |||
5 | This program is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation; either version 3, or (at your option) | ||
8 | any later version. | ||
9 | |||
10 | This program is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | GNU General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License along | ||
16 | with this program; if not, write to the Free Software Foundation, | ||
17 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | ||
18 | |||
19 | /* Written by Bruno Haible <bruno@clisp.org>. */ | ||
20 | |||
21 | #include <config.h> | ||
22 | |||
23 | /* Specification. */ | ||
24 | #include "localcharset.h" | ||
25 | |||
26 | #include <stddef.h> | ||
27 | #include <stdio.h> | ||
28 | #include <string.h> | ||
29 | #include <stdlib.h> | ||
30 | |||
/* Native Windows builds are recognized by either of the two customary
   predefined macros.  */
#if defined (_WIN32) || defined (__WIN32__)
# define WIN32_NATIVE
#endif

/* A program compiled with EMX is assumed to run on OS/2, even if it was
   compiled under DOS.  */
#ifdef __EMX__
# define OS2
#endif
39 | |||
40 | #if !defined WIN32_NATIVE | ||
41 | # if HAVE_LANGINFO_CODESET | ||
42 | # include <langinfo.h> | ||
43 | # else | ||
44 | # if 0 /* see comment below */ | ||
45 | # include <locale.h> | ||
46 | # endif | ||
47 | # endif | ||
48 | # ifdef __CYGWIN__ | ||
49 | # define WIN32_LEAN_AND_MEAN | ||
50 | # include <windows.h> | ||
51 | # endif | ||
52 | #elif defined WIN32_NATIVE | ||
53 | # define WIN32_LEAN_AND_MEAN | ||
54 | # include <windows.h> | ||
55 | #endif | ||
56 | #if defined OS2 | ||
57 | # define INCL_DOS | ||
58 | # include <os2.h> | ||
59 | #endif | ||
60 | |||
61 | #if ENABLE_RELOCATABLE | ||
62 | # include "relocatable.h" | ||
63 | #else | ||
64 | # define relocate(pathname) (pathname) | ||
65 | #endif | ||
66 | |||
67 | /* Get LIBDIR. */ | ||
68 | #ifndef LIBDIR | ||
69 | # include "configmake.h" | ||
70 | #endif | ||
71 | |||
/* Win32, Cygwin, OS/2 and DOS: both '/' and '\\' separate directories.  */
#if (defined (_WIN32) || defined (__WIN32__) || defined (__CYGWIN__) \
     || defined (__EMX__) || defined (__DJGPP__))
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
#endif

/* Character written between a directory name and a file name, unless the
   build has already chosen one.  */
#if !defined (DIRECTORY_SEPARATOR)
# define DIRECTORY_SEPARATOR '/'
#endif

/* On all remaining platforms only DIRECTORY_SEPARATOR counts as a slash.  */
#if !defined (ISSLASH)
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
#endif

/* Use getc_unlocked in place of getc when it is declared.  */
#if HAVE_DECL_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif
89 | |||
/* The variable 'charset_aliases' below is qualified 'volatile' because of
   a possible multithread problem in get_charset_aliases: if two threads
   initialize it simultaneously, both produce an equivalent table, and
   nothing goes wrong as long as each of the two assignments is atomic.
   What happens if the two assignments mix is not known.
   Compilers that do not report __STDC__ == 1 get the qualifier defined
   away.  */
#if __STDC__ != 1
# define volatile /* empty */
#endif

/* Contents of the charset.alias file once it has been read; NULL before
   that.  The layout is
     ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
static const char *volatile charset_aliases = NULL;
103 | |||
104 | /* Return a pointer to the contents of the charset.alias file. */ | ||
105 | static const char * | ||
106 | get_charset_aliases (void) | ||
107 | { | ||
108 | const char *cp; | ||
109 | |||
110 | cp = charset_aliases; | ||
111 | if (cp == NULL) | ||
112 | { | ||
113 | #if !(defined VMS || defined WIN32_NATIVE || defined __CYGWIN__) | ||
114 | FILE *fp; | ||
115 | const char *dir; | ||
116 | const char *base = "charset.alias"; | ||
117 | char *file_name; | ||
118 | |||
119 | /* Make it possible to override the charset.alias location. This is | ||
120 | necessary for running the testsuite before "make install". */ | ||
121 | dir = getenv ("CHARSETALIASDIR"); | ||
122 | if (dir == NULL || dir[0] == '\0') | ||
123 | dir = relocate (LIBDIR); | ||
124 | |||
125 | /* Concatenate dir and base into freshly allocated file_name. */ | ||
126 | { | ||
127 | size_t dir_len = strlen (dir); | ||
128 | size_t base_len = strlen (base); | ||
129 | int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); | ||
130 | file_name = (char *) malloc (dir_len + add_slash + base_len + 1); | ||
131 | if (file_name != NULL) | ||
132 | { | ||
133 | memcpy (file_name, dir, dir_len); | ||
134 | if (add_slash) | ||
135 | file_name[dir_len] = DIRECTORY_SEPARATOR; | ||
136 | memcpy (file_name + dir_len + add_slash, base, base_len + 1); | ||
137 | } | ||
138 | } | ||
139 | |||
140 | if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) | ||
141 | /* Out of memory or file not found, treat it as empty. */ | ||
142 | cp = ""; | ||
143 | else | ||
144 | { | ||
145 | /* Parse the file's contents. */ | ||
146 | char *res_ptr = NULL; | ||
147 | size_t res_size = 0; | ||
148 | |||
149 | for (;;) | ||
150 | { | ||
151 | int c; | ||
152 | char buf1[50+1]; | ||
153 | char buf2[50+1]; | ||
154 | size_t l1, l2; | ||
155 | char *old_res_ptr; | ||
156 | |||
157 | c = getc (fp); | ||
158 | if (c == EOF) | ||
159 | break; | ||
160 | if (c == '\n' || c == ' ' || c == '\t') | ||
161 | continue; | ||
162 | if (c == '#') | ||
163 | { | ||
164 | /* Skip comment, to end of line. */ | ||
165 | do | ||
166 | c = getc (fp); | ||
167 | while (!(c == EOF || c == '\n')); | ||
168 | if (c == EOF) | ||
169 | break; | ||
170 | continue; | ||
171 | } | ||
172 | ungetc (c, fp); | ||
173 | if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) | ||
174 | break; | ||
175 | l1 = strlen (buf1); | ||
176 | l2 = strlen (buf2); | ||
177 | old_res_ptr = res_ptr; | ||
178 | if (res_size == 0) | ||
179 | { | ||
180 | res_size = l1 + 1 + l2 + 1; | ||
181 | res_ptr = (char *) malloc (res_size + 1); | ||
182 | } | ||
183 | else | ||
184 | { | ||
185 | res_size += l1 + 1 + l2 + 1; | ||
186 | res_ptr = (char *) realloc (res_ptr, res_size + 1); | ||
187 | } | ||
188 | if (res_ptr == NULL) | ||
189 | { | ||
190 | /* Out of memory. */ | ||
191 | res_size = 0; | ||
192 | if (old_res_ptr != NULL) | ||
193 | free (old_res_ptr); | ||
194 | break; | ||
195 | } | ||
196 | strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); | ||
197 | strcpy (res_ptr + res_size - (l2 + 1), buf2); | ||
198 | } | ||
199 | fclose (fp); | ||
200 | if (res_size == 0) | ||
201 | cp = ""; | ||
202 | else | ||
203 | { | ||
204 | *(res_ptr + res_size) = '\0'; | ||
205 | cp = res_ptr; | ||
206 | } | ||
207 | } | ||
208 | |||
209 | if (file_name != NULL) | ||
210 | free (file_name); | ||
211 | |||
212 | #else | ||
213 | |||
214 | # if defined VMS | ||
215 | /* To avoid the troubles of an extra file charset.alias_vms in the | ||
216 | sources of many GNU packages, simply inline the aliases here. */ | ||
217 | /* The list of encodings is taken from the OpenVMS 7.3-1 documentation | ||
218 | "Compaq C Run-Time Library Reference Manual for OpenVMS systems" | ||
219 | section 10.7 "Handling Different Character Sets". */ | ||
220 | cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" | ||
221 | "ISO8859-2" "\0" "ISO-8859-2" "\0" | ||
222 | "ISO8859-5" "\0" "ISO-8859-5" "\0" | ||
223 | "ISO8859-7" "\0" "ISO-8859-7" "\0" | ||
224 | "ISO8859-8" "\0" "ISO-8859-8" "\0" | ||
225 | "ISO8859-9" "\0" "ISO-8859-9" "\0" | ||
226 | /* Japanese */ | ||
227 | "eucJP" "\0" "EUC-JP" "\0" | ||
228 | "SJIS" "\0" "SHIFT_JIS" "\0" | ||
229 | "DECKANJI" "\0" "DEC-KANJI" "\0" | ||
230 | "SDECKANJI" "\0" "EUC-JP" "\0" | ||
231 | /* Chinese */ | ||
232 | "eucTW" "\0" "EUC-TW" "\0" | ||
233 | "DECHANYU" "\0" "DEC-HANYU" "\0" | ||
234 | "DECHANZI" "\0" "GB2312" "\0" | ||
235 | /* Korean */ | ||
236 | "DECKOREAN" "\0" "EUC-KR" "\0"; | ||
237 | # endif | ||
238 | |||
239 | # if defined WIN32_NATIVE || defined __CYGWIN__ | ||
240 | /* To avoid the troubles of installing a separate file in the same | ||
241 | directory as the DLL and of retrieving the DLL's directory at | ||
242 | runtime, simply inline the aliases here. */ | ||
243 | |||
244 | cp = "CP936" "\0" "GBK" "\0" | ||
245 | "CP1361" "\0" "JOHAB" "\0" | ||
246 | "CP20127" "\0" "ASCII" "\0" | ||
247 | "CP20866" "\0" "KOI8-R" "\0" | ||
248 | "CP20936" "\0" "GB2312" "\0" | ||
249 | "CP21866" "\0" "KOI8-RU" "\0" | ||
250 | "CP28591" "\0" "ISO-8859-1" "\0" | ||
251 | "CP28592" "\0" "ISO-8859-2" "\0" | ||
252 | "CP28593" "\0" "ISO-8859-3" "\0" | ||
253 | "CP28594" "\0" "ISO-8859-4" "\0" | ||
254 | "CP28595" "\0" "ISO-8859-5" "\0" | ||
255 | "CP28596" "\0" "ISO-8859-6" "\0" | ||
256 | "CP28597" "\0" "ISO-8859-7" "\0" | ||
257 | "CP28598" "\0" "ISO-8859-8" "\0" | ||
258 | "CP28599" "\0" "ISO-8859-9" "\0" | ||
259 | "CP28605" "\0" "ISO-8859-15" "\0" | ||
260 | "CP38598" "\0" "ISO-8859-8" "\0" | ||
261 | "CP51932" "\0" "EUC-JP" "\0" | ||
262 | "CP51936" "\0" "GB2312" "\0" | ||
263 | "CP51949" "\0" "EUC-KR" "\0" | ||
264 | "CP51950" "\0" "EUC-TW" "\0" | ||
265 | "CP54936" "\0" "GB18030" "\0" | ||
266 | "CP65001" "\0" "UTF-8" "\0"; | ||
267 | # endif | ||
268 | #endif | ||
269 | |||
270 | charset_aliases = cp; | ||
271 | } | ||
272 | |||
273 | return cp; | ||
274 | } | ||
275 | |||
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The raw codeset name comes from, depending on the platform:
     - nl_langinfo (CODESET), where available;
     - the environment variables LC_ALL, LC_CTYPE, LANG (first one that is
       set and non-empty);
     - GetACP () on native Windows (and as a fallback on Cygwin);
     - DosQueryCp () on OS/2.
   It is then mapped through the table returned by get_charset_aliases ().
   On Cygwin and OS/2, an encoding spelled out in the locale name
   ("language_COUNTRY.ENCODING[@modifier]") is returned directly, without
   alias lookup.

   The result must not be freed by the caller.  It is never NULL and never
   the empty string: GNU libc and GNU libiconv interpret the empty string
   as "the locale's character encoding", so returning it could make GNU
   libiconv call this function again.  If the canonical name cannot be
   determined, the result is a non-canonical name, or "ASCII" when nothing
   at all is known.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      /* Room for "CP", up to 10 decimal digits, and the NUL.  */
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  /* Never hand out an empty encoding ("xx_YY.").  */
                  if (dot[0] != '\0')
                    return dot;
                }
              else if (modifier > dot
                       && (size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              /* Never hand out an empty encoding ("xx_YY.").  */
              if (dot[0] != '\0')
                return dot;
            }
          else if (modifier > dot
                   && (size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.
         A nonzero return value is treated as failure.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* The cast makes the argument match %u and keeps the output
             within the 10 digits BUF is sized for.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a list of ALIAS/CANONICAL string pairs;
     each step of the loop skips one pair.  An alias of "*" matches any
     codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}