Line data Source code
1 : /* Determine a canonical name for the current locale's character
2 : encoding.
3 :
4 : Copyright 2000-2020 Free Software Foundation, Inc.
5 :
6 : This program is free software; you can redistribute it and/or modify
7 : it under the terms of the GNU General Public License as published by
8 : the Free Software Foundation; either version 2, or (at your option)
9 : any later version.
10 :
11 : This program is distributed in the hope that it will be useful,
12 : but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : GNU General Public License for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with this program; if not, see <http://www.gnu.org/licenses/>.
18 : */
19 :
20 : /* Written by Bruno Haible <bruno@clisp.org>. */
21 :
22 : #ifdef HAVE_CONFIG_H
23 : #include <config.h>
24 : #endif
25 :
26 : /* Specification. */
27 : #include "localcharset.h"
28 :
29 : #include <fcntl.h>
30 : #include <stddef.h>
31 : #include <stdio.h>
32 : #include <string.h>
33 : #include <stdlib.h>
34 :
35 : #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
36 : # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
37 : #endif
38 :
39 : #if defined _WIN32 || defined __WIN32__
40 : # define WINDOWS_NATIVE
41 : # include <locale.h>
42 : #endif
43 :
44 : #if defined __EMX__
45 : /* Assume EMX program runs on OS/2, even if compiled under DOS. */
46 : # ifndef OS2
47 : # define OS2
48 : # endif
49 : #endif
50 :
51 : #if !defined WINDOWS_NATIVE
52 : # include <unistd.h>
53 : # if HAVE_LANGINFO_CODESET
54 : # include <langinfo.h>
55 : # else
56 : # if 0 /* see comment below */
57 : # include <locale.h>
58 : # endif
59 : # endif
60 : # ifdef __CYGWIN__
61 : # define WIN32_LEAN_AND_MEAN
62 : # include <windows.h>
63 : # endif
64 : #elif defined WINDOWS_NATIVE
65 : # define WIN32_LEAN_AND_MEAN
66 : # include <windows.h>
67 : #endif
68 : #if defined OS2
69 : # define INCL_DOS
70 : # include <os2.h>
71 : #endif
72 :
73 : /* For MB_CUR_MAX_L */
74 : #if defined DARWIN7
75 : # include <xlocale.h>
76 : #endif
77 :
78 : #if ENABLE_RELOCATABLE
79 : # include "relocatable.h"
80 : #else
81 : # define relocate(pathname) (pathname)
82 : #endif
83 :
84 : /* Get LIBDIR. */
85 : #ifndef LIBDIR
86 : # include "configmake.h"
87 : #endif
88 :
89 : /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
90 : #ifndef O_NOFOLLOW
91 : # define O_NOFOLLOW 0
92 : #endif
93 :
94 : #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
95 : /* Native Windows, Cygwin, OS/2, DOS */
96 : # define ISSLASH(C) ((C) == '/' || (C) == '\\')
97 : #endif
98 :
99 : #ifndef DIRECTORY_SEPARATOR
100 : # define DIRECTORY_SEPARATOR '/'
101 : #endif
102 :
103 : #ifndef ISSLASH
104 : # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
105 : #endif
106 :
107 : #if HAVE_DECL_GETC_UNLOCKED
108 : # undef getc
109 : # define getc getc_unlocked
110 : #endif
111 :
112 : /* The following static variable is declared 'volatile' to avoid a
113 : possible multithread problem in the function get_charset_aliases. If we
114 : are running in a threaded environment, and if two threads initialize
115 : 'charset_aliases' simultaneously, both will produce the same value,
116 : and everything will be ok if the two assignments to 'charset_aliases'
117 : are atomic. But I don't know what will happen if the two assignments mix. */
118 : #if __STDC__ != 1
119 : # define volatile /* empty */
120 : #endif
121 : /* Pointer to the contents of the charset.alias file, if it has already been
122 : read, else NULL. Its format is:
123 : ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
124 : static const char * volatile charset_aliases;
125 :
126 : /* Return a pointer to the contents of the charset.alias file. */
127 : static const char *
128 17 : get_charset_aliases (void)
129 : {
130 : const char *cp;
131 :
132 17 : cp = charset_aliases;
133 17 : if (cp == NULL)
134 : {
135 : #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
136 : const char *dir;
137 17 : const char *base = "charset.alias";
138 : char *file_name;
139 :
140 : /* Make it possible to override the charset.alias location. This is
141 : necessary for running the testsuite before "make install". */
142 17 : dir = getenv ("CHARSETALIASDIR");
143 17 : if (dir == NULL || dir[0] == '\0')
144 17 : dir = relocate (LIBDIR);
145 :
146 : /* Concatenate dir and base into freshly allocated file_name. */
147 : {
148 17 : size_t dir_len = strlen (dir);
149 17 : size_t base_len = strlen (base);
150 17 : int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
151 17 : file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
152 17 : if (file_name != NULL)
153 : {
154 17 : memcpy (file_name, dir, dir_len);
155 17 : if (add_slash)
156 17 : file_name[dir_len] = DIRECTORY_SEPARATOR;
157 17 : memcpy (file_name + dir_len + add_slash, base, base_len + 1);
158 : }
159 : }
160 :
161 17 : if (file_name == NULL)
162 : /* Out of memory. Treat the file as empty. */
163 0 : cp = "";
164 : else
165 : {
166 : int fd;
167 :
168 : /* Open the file. Reject symbolic links on platforms that support
169 : O_NOFOLLOW. This is a security feature. Without it, an attacker
170 : could retrieve parts of the contents (namely, the tail of the
171 : first line that starts with "* ") of an arbitrary file by placing
172 : a symbolic link to that file under the name "charset.alias" in
173 : some writable directory and defining the environment variable
174 : CHARSETALIASDIR to point to that directory. */
175 17 : fd = open (file_name,
176 : O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
177 17 : if (fd < 0)
178 : /* File not found. Treat it as empty. */
179 17 : cp = "";
180 : else
181 : {
182 : FILE *fp;
183 :
184 0 : fp = fdopen (fd, "r");
185 0 : if (fp == NULL)
186 : {
187 : /* Out of memory. Treat the file as empty. */
188 0 : close (fd);
189 0 : cp = "";
190 : }
191 : else
192 : {
193 : /* Parse the file's contents. */
194 0 : char *res_ptr = NULL;
195 0 : size_t res_size = 0;
196 :
197 : for (;;)
198 0 : {
199 : int c;
200 : char buf1[50+1];
201 : char buf2[50+1];
202 : size_t l1, l2;
203 : char *old_res_ptr;
204 :
205 0 : c = getc (fp);
206 0 : if (c == EOF)
207 0 : break;
208 0 : if (c == '\n' || c == ' ' || c == '\t')
209 0 : continue;
210 0 : if (c == '#')
211 : {
212 : /* Skip comment, to end of line. */
213 : do
214 0 : c = getc (fp);
215 0 : while (!(c == EOF || c == '\n'));
216 0 : if (c == EOF)
217 0 : break;
218 0 : continue;
219 : }
220 0 : ungetc (c, fp);
221 0 : if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
222 0 : break;
223 0 : l1 = strlen (buf1);
224 0 : l2 = strlen (buf2);
225 0 : old_res_ptr = res_ptr;
226 0 : if (res_size == 0)
227 : {
228 0 : res_size = l1 + 1 + l2 + 1;
229 0 : res_ptr = (char *) malloc (res_size + 1);
230 : }
231 : else
232 : {
233 0 : res_size += l1 + 1 + l2 + 1;
234 0 : res_ptr = (char *) realloc (res_ptr, res_size + 1);
235 : }
236 0 : if (res_ptr == NULL)
237 : {
238 : /* Out of memory. */
239 0 : res_size = 0;
240 0 : free (old_res_ptr);
241 0 : break;
242 : }
243 0 : strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
244 0 : strcpy (res_ptr + res_size - (l2 + 1), buf2);
245 : }
246 0 : fclose (fp);
247 0 : if (res_size == 0)
248 0 : cp = "";
249 : else
250 : {
251 0 : *(res_ptr + res_size) = '\0';
252 0 : cp = res_ptr;
253 : }
254 : }
255 : }
256 :
257 17 : free (file_name);
258 : }
259 :
260 : #else
261 :
262 : # if defined DARWIN7
263 : /* To avoid the trouble of installing a file that is shared by many
264 : GNU packages -- many packaging systems have problems with this --,
265 : simply inline the aliases here. */
266 : cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
267 : "ISO8859-2" "\0" "ISO-8859-2" "\0"
268 : "ISO8859-4" "\0" "ISO-8859-4" "\0"
269 : "ISO8859-5" "\0" "ISO-8859-5" "\0"
270 : "ISO8859-7" "\0" "ISO-8859-7" "\0"
271 : "ISO8859-9" "\0" "ISO-8859-9" "\0"
272 : "ISO8859-13" "\0" "ISO-8859-13" "\0"
273 : "ISO8859-15" "\0" "ISO-8859-15" "\0"
274 : "KOI8-R" "\0" "KOI8-R" "\0"
275 : "KOI8-U" "\0" "KOI8-U" "\0"
276 : "CP866" "\0" "CP866" "\0"
277 : "CP949" "\0" "CP949" "\0"
278 : "CP1131" "\0" "CP1131" "\0"
279 : "CP1251" "\0" "CP1251" "\0"
280 : "eucCN" "\0" "GB2312" "\0"
281 : "GB2312" "\0" "GB2312" "\0"
282 : "eucJP" "\0" "EUC-JP" "\0"
283 : "eucKR" "\0" "EUC-KR" "\0"
284 : "Big5" "\0" "BIG5" "\0"
285 : "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
286 : "GBK" "\0" "GBK" "\0"
287 : "GB18030" "\0" "GB18030" "\0"
288 : "SJIS" "\0" "SHIFT_JIS" "\0"
289 : "ARMSCII-8" "\0" "ARMSCII-8" "\0"
290 : "PT154" "\0" "PT154" "\0"
291 : /*"ISCII-DEV" "\0" "?" "\0"*/
292 : "*" "\0" "UTF-8" "\0";
293 : # endif
294 :
295 : # if defined VMS
296 : /* To avoid the troubles of an extra file charset.alias_vms in the
297 : sources of many GNU packages, simply inline the aliases here. */
298 : /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
299 : "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
300 : section 10.7 "Handling Different Character Sets". */
301 : cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
302 : "ISO8859-2" "\0" "ISO-8859-2" "\0"
303 : "ISO8859-5" "\0" "ISO-8859-5" "\0"
304 : "ISO8859-7" "\0" "ISO-8859-7" "\0"
305 : "ISO8859-8" "\0" "ISO-8859-8" "\0"
306 : "ISO8859-9" "\0" "ISO-8859-9" "\0"
307 : /* Japanese */
308 : "eucJP" "\0" "EUC-JP" "\0"
309 : "SJIS" "\0" "SHIFT_JIS" "\0"
310 : "DECKANJI" "\0" "DEC-KANJI" "\0"
311 : "SDECKANJI" "\0" "EUC-JP" "\0"
312 : /* Chinese */
313 : "eucTW" "\0" "EUC-TW" "\0"
314 : "DECHANYU" "\0" "DEC-HANYU" "\0"
315 : "DECHANZI" "\0" "GB2312" "\0"
316 : /* Korean */
317 : "DECKOREAN" "\0" "EUC-KR" "\0";
318 : # endif
319 :
320 : # if defined WINDOWS_NATIVE || defined __CYGWIN__
321 : /* To avoid the troubles of installing a separate file in the same
322 : directory as the DLL and of retrieving the DLL's directory at
323 : runtime, simply inline the aliases here. */
324 :
325 : cp = "CP936" "\0" "GBK" "\0"
326 : "CP1361" "\0" "JOHAB" "\0"
327 : "CP20127" "\0" "ASCII" "\0"
328 : "CP20866" "\0" "KOI8-R" "\0"
329 : "CP20936" "\0" "GB2312" "\0"
330 : "CP21866" "\0" "KOI8-RU" "\0"
331 : "CP28591" "\0" "ISO-8859-1" "\0"
332 : "CP28592" "\0" "ISO-8859-2" "\0"
333 : "CP28593" "\0" "ISO-8859-3" "\0"
334 : "CP28594" "\0" "ISO-8859-4" "\0"
335 : "CP28595" "\0" "ISO-8859-5" "\0"
336 : "CP28596" "\0" "ISO-8859-6" "\0"
337 : "CP28597" "\0" "ISO-8859-7" "\0"
338 : "CP28598" "\0" "ISO-8859-8" "\0"
339 : "CP28599" "\0" "ISO-8859-9" "\0"
340 : "CP28605" "\0" "ISO-8859-15" "\0"
341 : "CP38598" "\0" "ISO-8859-8" "\0"
342 : "CP51932" "\0" "EUC-JP" "\0"
343 : "CP51936" "\0" "GB2312" "\0"
344 : "CP51949" "\0" "EUC-KR" "\0"
345 : "CP51950" "\0" "EUC-TW" "\0"
346 : "CP54936" "\0" "GB18030" "\0"
347 : "CP65001" "\0" "UTF-8" "\0";
348 : # endif
349 : #endif
350 :
351 17 : charset_aliases = cp;
352 : }
353 :
354 17 : return cp;
355 : }
356 :
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The encoding name is obtained in a platform-specific way:
     - nl_langinfo (CODESET) where available (with a Cygwin fallback to
       the locale environment variables or GetACP ()),
     - the LC_ALL / LC_CTYPE / LANG environment variables on old systems,
     - setlocale () and GetACP () on native Windows,
     - the environment variables or DosQueryCp () on OS/2.
   It is then mapped through the table from get_charset_aliases ().

   The result must not be freed; it is statically allocated or points
   into the environment.  It is never NULL and never the empty string.
   If the canonical name cannot be determined, the result is a
   non-canonical name.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WINDOWS_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as
     a number, but the value doesn't change according to what the
     'setlocale' call specified.  So we use it as a last resort, in
     case the string returned by 'setlocale' doesn't specify the
     codepage.  */
  char *current_locale = setlocale (LC_ALL, NULL);
  char *pdot;

  /* If they set different locales for different categories,
     'setlocale' will return a semi-colon separated list of locale
     values.  To make sure we use the correct one, we choose LC_CTYPE.  */
  if (current_locale != NULL && strchr (current_locale, ';'))
    current_locale = setlocale (LC_CTYPE, NULL);

  /* Be defensive: a failed query yields NULL, which must not be passed
     to strrchr.  */
  pdot = (current_locale != NULL ? strrchr (current_locale, '.') : NULL);

  /* Use the suffix after the last dot only if "CP" + suffix + NUL fits
     into buf; an overlong suffix would otherwise overflow the buffer.  */
  if (pdot != NULL && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
    sprintf (buf, "CP%s", pdot + 1);
  else
    {
      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().
         When the output goes to a console window, it needs to be provided in
         GetOEMCP() encoding if the console is using a raster font, or in
         GetConsoleOutputCP() encoding if it is using a TrueType font.
         But in GUI programs and for output sent to files and pipes, GetACP()
         encoding is the best bet.  */
      sprintf (buf, "CP%u", GetACP ());
    }
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* Convert explicitly so the argument type matches the %u
             directive whatever the width of ULONG is.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of (alias, canonical) string
     pairs; each step skips both strings of a pair.  An alias of "*"
     matches any codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}
584 :
585 : // Local Variables:
586 : // fill-column: 72
587 : // mode: C++
588 : // End:
589 : // vim: set cindent noexpandtab shiftwidth=2 textwidth=72:
|