Line data Source code
1 : /* Copyright (C) 2005-2025 Free Software Foundation, Inc.
2 : Written by Werner Lemberg (wl@gnu.org)
3 :
4 : This file is part of groff, the GNU roff typesetting system.
5 :
6 : groff is free software; you can redistribute it and/or modify it under
7 : the terms of the GNU General Public License as published by the Free
8 : Software Foundation, either version 3 of the License, or
9 : (at your option) any later version.
10 :
11 : groff is distributed in the hope that it will be useful, but WITHOUT ANY
12 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 :
19 : #ifdef HAVE_CONFIG_H
20 : #include <config.h>
21 : #endif
22 :
23 : #include <assert.h>
24 : #include <errno.h>
25 : #if HAVE_ICONV
26 : # include <iconv.h> // iconv(), iconv_close(), iconv_open()
27 : # ifdef WORDS_BIGENDIAN
28 : # define UNICODE "UTF-32BE"
29 : # else
30 : # define UNICODE "UTF-32LE"
31 : # endif
32 : #endif
33 : #include <locale.h> // setlocale()
34 : #include <stdcountof.h>
35 : #include <stdio.h> // EOF, FILE, fclose(), ferror(), fflush(), fileno(),
36 : // fopen(), fprintf(), fread(), fseek(), ftell(),
37 : // getc(), printf(), putchar(), rewind(), SEEK_SET,
38 : // stderr, stdin, stdout, ungetc()
39 : #include <stdlib.h> // calloc(), exit(), EXIT_SUCCESS, free(), malloc()
40 : #include <string.h> // strerror()
41 : #include <sys/stat.h> // fstat(), stat
42 : #ifdef HAVE_UCHARDET
43 : #include <uchardet/uchardet.h>
44 : #endif
45 :
46 : #include <getopt.h> // getopt_long()
47 :
48 : #include "lib.h"
49 :
50 : #include "errarg.h"
51 : #include "error.h"
52 : #include "localcharset.h"
53 : #include "nonposix.h"
54 : #include "stringclass.h" // must precede lf.h
55 : #include "lf.h"
56 :
// Capacity (including the terminating null byte) of the fixed-size
// name buffers declared below.
57 : #define MAX_VAR_LEN 100
58 :
59 : extern "C" const char *Version_string;
60 :
// NOTE(review): the three buffers below are not written or read in this
// part of the file; their roles are presumably what the names say
// (fallback, user-requested, and finally selected encoding) -- confirm
// against the code that fills them.
61 : char fallback_encoding[MAX_VAR_LEN];
62 : char user_encoding[MAX_VAR_LEN];
63 : char encoding_string[MAX_VAR_LEN];
64 : bool is_debugging = false; // when true, the decoders print diagnostics
65 : bool want_raw_output = false; // not referenced in this part of the file
66 :
// One entry of a name translation table: 'from' is the name looked up,
// 'to' the name it is replaced with (see emacs_to_mime[]).
67 : struct conversion {
68 : const char *from;
69 : const char *to;
70 : };
71 :
72 : // The official list of MIME tags can be found at
73 : //
74 : // http://www.iana.org/assignments/character-sets
75 : //
76 : // For encodings which don't have a MIME tag we use GNU iconv's encoding
77 : // names (which also work with the portable GNU libiconv package). They
78 : // are marked with '*'.
79 : //
80 : // Encodings specific to XEmacs and Emacs are marked as such; no mark
81 : // means that they are used by both Emacs and XEmacs.
82 : //
83 : // Encodings marked with '--' are special to Emacs, XEmacs, or other
84 : // applications and shouldn't be used for data exchange.
85 : //
86 : // 'Not covered' means that the encoding can be handled neither by GNU
87 : // iconv nor by libiconv, or just one of them has support for it.
88 : //
89 : // A special case is VIQR encoding: Despite having a MIME tag, it is
90 : // missing in both libiconv 1.10 and iconv (coming with GNU libc 2.3.6).
91 : //
92 : // Finally, we add all aliases of GNU iconv for 'ascii', 'latin1', and
93 : // 'utf8' to catch those encoding names before iconv is called.
94 : //
95 : // Note that most entries are commented out -- only a small, (rather)
96 : // reliable and stable subset of encodings is recognized (for coding
97 : // tags) which are still in greater use today (January 2006). Most
98 : // notably, all Windows-specific encodings are not selected because they
99 : // lack stability: Microsoft has changed the mappings instead of
100 : // creating new versions.
101 : //
102 : // Please contact the groff list if you find the selection inadequate.
103 :
104 : static const conversion
105 : emacs_to_mime[] = {
106 : {"ascii", "US-ASCII"}, // Emacs
107 : {"big5", "Big5"},
108 : {"chinese-big5", "Big5"}, // Emacs
109 : {"chinese-euc", "GB2312"}, // XEmacs
110 : {"chinese-iso-8bit", "GB2312"}, // Emacs
111 : {"cn-big5", "Big5"},
112 : {"cn-gb", "GB2312"}, // Emacs
113 : {"cn-gb-2312", "GB2312"},
114 : {"cp878", "KOI8-R"}, // Emacs
115 : {"cp1047", "CP1047"}, // EBCDIC
116 : {"csascii", "US-ASCII"}, // alias
117 : {"csisolatin1", "ISO-8859-1"}, // alias
118 : {"cyrillic-iso-8bit", "ISO-8859-5"}, // Emacs
119 : {"cyrillic-koi8", "KOI8-R"}, // not KOI8!, Emacs
120 : {"euc-china", "GB2312"}, // Emacs
121 : {"euc-cn", "GB2312"}, // Emacs
122 : {"euc-japan", "EUC-JP"},
123 : {"euc-japan-1990", "EUC-JP"}, // Emacs
124 : {"euc-jp", "EUC-JP"},
125 : {"euc-korea", "EUC-KR"},
126 : {"euc-kr", "EUC-KR"},
127 : {"gb2312", "GB2312"},
128 : {"greek-iso-8bit", "ISO-8859-7"},
129 : {"iso-10646/utf8", "UTF-8"}, // alias
130 : {"iso-10646/utf-8", "UTF-8"}, // alias
131 : {"iso-8859-1", "ISO-8859-1"},
132 : {"iso-8859-13", "ISO-8859-13"}, // Emacs
133 : {"iso-8859-15", "ISO-8859-15"},
134 : {"iso-8859-2", "ISO-8859-2"},
135 : {"iso-8859-5", "ISO-8859-5"},
136 : {"iso-8859-7", "ISO-8859-7"},
137 : {"iso-8859-9", "ISO-8859-9"},
138 : {"iso-latin-1", "ISO-8859-1"},
139 : {"iso-latin-2", "ISO-8859-2"}, // Emacs
140 : {"iso-latin-5", "ISO-8859-9"}, // Emacs
141 : {"iso-latin-7", "ISO-8859-13"}, // Emacs
142 : {"iso-latin-9", "ISO-8859-15"}, // Emacs
143 : {"japanese-iso-8bit", "EUC-JP"}, // Emacs
144 : {"japanese-euc", "EUC-JP"}, // XEmacs
145 : {"jis8", "EUC-JP"}, // XEmacs
146 : {"koi8", "KOI8-R"}, // not KOI8!, Emacs
147 : {"koi8-r", "KOI8-R"},
148 : {"korean-euc", "EUC-KR"}, // XEmacs
149 : {"korean-iso-8bit", "EUC-KR"}, // Emacs
150 : {"latin1", "ISO-8859-1"}, // alias
151 : {"latin-0", "ISO-8859-15"}, // Emacs
152 : {"latin-1", "ISO-8859-1"}, // Emacs
153 : {"latin-2", "ISO-8859-2"}, // Emacs
154 : {"latin-5", "ISO-8859-9"}, // Emacs
155 : {"latin-7", "ISO-8859-13"}, // Emacs
156 : {"latin-9", "ISO-8859-15"}, // Emacs
157 : {"mule-utf-16", "UTF-16"}, // Emacs
158 : {"mule-utf-16be", "UTF-16BE"}, // Emacs
159 : {"mule-utf-16-be", "UTF-16BE"}, // Emacs
160 : {"mule-utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
161 : {"mule-utf-16le", "UTF-16LE"}, // Emacs
162 : {"mule-utf-16-le", "UTF-16LE"}, // Emacs
163 : {"mule-utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
164 : {"mule-utf-8", "UTF-8"}, // Emacs
165 : {"us-ascii", "US-ASCII"}, // Emacs
166 : {"utf8", "UTF-8"}, // alias
167 : {"utf-16", "UTF-16"}, // Emacs
168 : {"utf-16be", "UTF-16BE"}, // Emacs
169 : {"utf-16-be", "UTF-16BE"}, // Emacs
170 : {"utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
171 : {"utf-16-be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
172 : {"utf-16le", "UTF-16LE"}, // Emacs
173 : {"utf-16-le", "UTF-16LE"}, // Emacs
174 : {"utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
175 : {"utf-16-le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
176 : {"utf-8", "UTF-8"}, // Emacs
177 :
178 : // {"alternativnyj", ""}, // ?
179 : // {"arabic-iso-8bit", "ISO-8859-6"}, // Emacs
180 : // {"binary", ""}, // --
181 : // {"chinese-hz", "HZ-GB-2312"}, // Emacs
182 : // {"chinese-iso-7bit", "ISO-2022-CN"}, // Emacs
183 : // {"chinese-iso-8bit-with-esc", ""}, // --
184 : // {"compound-text", ""}, // --
185 : // {"compound-text-with-extension", ""}, // --
186 : // {"cp1125", "cp1125"}, // *
187 : // {"cp1250", "windows-1250"},// Emacs
188 : // {"cp1251", "windows-1251"},// Emacs
189 : // {"cp1252", "windows-1252"},// Emacs
190 : // {"cp1253", "windows-1253"},// Emacs
191 : // {"cp1254", "windows-1254"},// Emacs
192 : // {"cp1255", "windows-1255"},// Emacs
193 : // {"cp1256", "windows-1256"},// Emacs
194 : // {"cp1257", "windows-1257"},// Emacs
195 : // {"cp1258", "windows-1258"},// Emacs
196 : // {"cp437", "cp437"}, // Emacs
197 : // {"cp720", ""}, // not covered
198 : // {"cp737", "cp737"}, // *, Emacs
199 : // {"cp775", "cp775"}, // Emacs
200 : // {"cp850", "cp850"}, // Emacs
201 : // {"cp851", "cp851"}, // Emacs
202 : // {"cp852", "cp852"}, // Emacs
203 : // {"cp855", "cp855"}, // Emacs
204 : // {"cp857", "cp857"}, // Emacs
205 : // {"cp860", "cp860"}, // Emacs
206 : // {"cp861", "cp861"}, // Emacs
207 : // {"cp862", "cp862"}, // Emacs
208 : // {"cp863", "cp863"}, // Emacs
209 : // {"cp864", "cp864"}, // Emacs
210 : // {"cp865", "cp865"}, // Emacs
211 : // {"cp866", "cp866"}, // Emacs
212 : // {"cp866u", "cp1125"}, // *, Emacs
213 : // {"cp869", "cp869"}, // Emacs
214 : // {"cp874", "cp874"}, // *, Emacs
215 : // {"cp932", "cp932"}, // *, Emacs
216 : // {"cp936", "cp936"}, // Emacs
217 : // {"cp949", "cp949"}, // *, Emacs
218 : // {"cp950", "cp950"}, // *, Emacs
219 : // {"ctext", ""}, // --
220 : // {"ctext-no-compositions", ""}, // --
221 : // {"ctext-with-extensions", ""}, // --
222 : // {"cyrillic-alternativnyj", ""}, // ?, Emacs
223 : // {"cyrillic-iso-8bit-with-esc", ""}, // --
224 : // {"cyrillic-koi8-t", "KOI8-T"}, // *, Emacs
225 : // {"devanagari", ""}, // not covered
226 : // {"dos", ""}, // --
227 : // {"emacs-mule", ""}, // --
228 : // {"euc-jisx0213", "EUC-JISX0213"},// *, XEmacs?
229 : // {"euc-jisx0213-with-esc", ""}, // XEmacs?
230 : // {"euc-taiwan", "EUC-TW"}, // *, Emacs
231 : // {"euc-tw", "EUC-TW"}, // *, Emacs
232 : // {"georgian-ps", "GEORGIAN-PS"}, // *, Emacs
233 : // {"greek-iso-8bit-with-esc", ""}, // --
234 : // {"hebrew-iso-8bit", "ISO-8859-8"}, // Emacs
235 : // {"hebrew-iso-8bit-with-esc", ""}, // --
236 : // {"hz", "HZ-GB-2312"},
237 : // {"hz-gb-2312", "HZ-GB-2312"},
238 : // {"in-is13194", ""}, // not covered
239 : // {"in-is13194-devanagari", ""}, // not covered
240 : // {"in-is13194-with-esc", ""}, // --
241 : // {"iso-2022-7", ""}, // XEmacs?
242 : // {"iso-2022-7bit", ""}, // --
243 : // {"iso-2022-7bit-lock", ""}, // --
244 : // {"iso-2022-7bit-lock-ss2", ""}, // --
245 : // {"iso-2022-7bit-ss2", ""}, // --
246 : // {"iso-2022-8", ""}, // XEmacs?
247 : // {"iso-2022-8bit", ""}, // XEmacs?
248 : // {"iso-2022-8bit-lock", ""}, // XEmacs?
249 : // {"iso-2022-8bit-lock-ss2", ""}, // XEmacs?
250 : // {"iso-2022-8bit-ss2", ""}, // --
251 : // {"iso-2022-cjk", ""}, // --
252 : // {"iso-2022-cn", "ISO-2022-CN"}, // Emacs
253 : // {"iso-2022-cn-ext", "ISO-2022-CN-EXT"},// Emacs
254 : // {"iso-2022-int-1", ""}, // --
255 : // {"iso-2022-jp", "ISO-2022-JP"},
256 : // {"iso-2022-jp-1978-irv", "ISO-2022-JP"},
257 : // {"iso-2022-jp-2", "ISO-2022-JP-2"},
258 : // {"iso-2022-jp-3", "ISO-2022-JP-3"},// *, XEmacs?
259 : // {"iso-2022-jp-3-compatible", ""}, // XEmacs?
260 : // {"iso-2022-jp-3-strict", "ISO-2022-JP-3"},// *, XEmacs?
261 : // {"iso-2022-kr", "ISO-2022-KR"},
262 : // {"iso-2022-lock", ""}, // XEmacs?
263 : // {"iso-8859-10", "ISO-8859-10"}, // Emacs
264 : // {"iso-8859-11", "ISO-8859-11"}, // *, Emacs
265 : // {"iso-8859-14", "ISO-8859-14"}, // Emacs
266 : // {"iso-8859-16", "ISO-8859-16"},
267 : // {"iso-8859-3", "ISO-8859-3"},
268 : // {"iso-8859-4", "ISO-8859-4"},
269 : // {"iso-8859-6", "ISO-8859-6"},
270 : // {"iso-8859-8", "ISO-8859-8"},
271 : // {"iso-8859-8-e", "ISO-8859-8"},
272 : // {"iso-8859-8-i", "ISO-8859-8"}, // Emacs
273 : // {"iso-latin-10", "ISO-8859-16"}, // Emacs
274 : // {"iso-latin-1-with-esc", ""}, // --
275 : // {"iso-latin-2-with-esc", ""}, // --
276 : // {"iso-latin-3", "ISO-8859-3"}, // Emacs
277 : // {"iso-latin-3-with-esc", ""}, // --
278 : // {"iso-latin-4", "ISO-8859-4"}, // Emacs
279 : // {"iso-latin-4-with-esc", ""}, // --
280 : // {"iso-latin-5-with-esc", ""}, // --
281 : // {"iso-latin-6", "ISO-8859-10"}, // Emacs
282 : // {"iso-latin-8", "ISO-8859-14"}, // Emacs
283 : // {"iso-safe", ""}, // --
284 : // {"japanese-iso-7bit-1978-irv", "ISO-2022-JP"}, // Emacs
285 : // {"japanese-iso-8bit-with-esc", ""}, // --
286 : // {"japanese-shift-jis", "Shift_JIS"}, // Emacs
287 : // {"japanese-shift-jisx0213", ""}, // XEmacs?
288 : // {"jis7", "ISO-2022-JP"}, // Xemacs
289 : // {"junet", "ISO-2022-JP"},
290 : // {"koi8-t", "KOI8-T"}, // *, Emacs
291 : // {"koi8-u", "KOI8-U"}, // Emacs
292 : // {"korean-iso-7bit-lock", "ISO-2022-KR"},
293 : // {"korean-iso-8bit-with-esc", ""}, // --
294 : // {"lao", ""}, // not covered
295 : // {"lao-with-esc", ""}, // --
296 : // {"latin-10", "ISO-8859-16"}, // Emacs
297 : // {"latin-3", "ISO-8859-3"}, // Emacs
298 : // {"latin-4", "ISO-8859-4"}, // Emacs
299 : // {"latin-6", "ISO-8859-10"}, // Emacs
300 : // {"latin-8", "ISO-8859-14"}, // Emacs
301 : // {"mac", ""}, // --
302 : // {"mac-roman", "MACINTOSH"}, // Emacs
303 : // {"mik", ""}, // not covered
304 : // {"next", "NEXTSTEP"}, // *, Emacs
305 : // {"no-conversion", ""}, // --
306 : // {"old-jis", "ISO-2022-JP"},
307 : // {"pt154", "PT154"}, // Emacs
308 : // {"raw-text", ""}, // --
309 : // {"ruscii", "cp1125"}, // *, Emacs
310 : // {"shift-jis", "Shift_JIS"}, // XEmacs
311 : // {"shift_jis", "Shift_JIS"},
312 : // {"shift_jisx0213", "Shift_JISX0213"},// *, XEmacs?
313 : // {"sjis", "Shift_JIS"}, // Emacs
314 : // {"tcvn", "TCVN"}, // *, Emacs
315 : // {"tcvn-5712", "TCVN"}, // *, Emacs
316 : // {"thai-tis620", "TIS-620"},
317 : // {"thai-tis620-with-esc", ""}, // --
318 : // {"th-tis620", "TIS-620"},
319 : // {"tibetan", ""}, // not covered
320 : // {"tibetan-iso-8bit", ""}, // not covered
321 : // {"tibetan-iso-8bit-with-esc", ""}, // --
322 : // {"tis-620", "TIS-620"},
323 : // {"tis620", "TIS-620"},
324 : // {"undecided", ""}, // --
325 : // {"unix", ""}, // --
326 : // {"utf-7", "UTF-7"}, // Emacs
327 : // {"utf-7-safe", ""}, // XEmacs?
328 : // {"utf-8-ws", "UTF-8"}, // XEmacs?
329 : // {"vietnamese-tcvn", "TCVN"}, // *, Emacs
330 : // {"vietnamese-viqr", "VIQR"}, // not covered
331 : // {"vietnamese-viscii", "VISCII"},
332 : // {"vietnamese-vscii", ""}, // not covered
333 : // {"viqr", "VIQR"}, // not covered
334 : // {"viscii", "VISCII"},
335 : // {"vscii", ""}, // not covered
336 : // {"windows-037", ""}, // not covered
337 : // {"windows-10000", ""}, // not covered
338 : // {"windows-10001", ""}, // not covered
339 : // {"windows-10006", ""}, // not covered
340 : // {"windows-10007", ""}, // not covered
341 : // {"windows-10029", ""}, // not covered
342 : // {"windows-10079", ""}, // not covered
343 : // {"windows-10081", ""}, // not covered
344 : // {"windows-1026", ""}, // not covered
345 : // {"windows-1200", ""}, // not covered
346 : // {"windows-1250", "windows-1250"},
347 : // {"windows-1251", "windows-1251"},
348 : // {"windows-1252", "windows-1252"},
349 : // {"windows-1253", "windows-1253"},
350 : // {"windows-1254", "windows-1254"},
351 : // {"windows-1255", "windows-1255"},
352 : // {"windows-1256", "windows-1256"},
353 : // {"windows-1257", "windows-1257"},
354 : // {"windows-1258", "windows-1258"},
355 : // {"windows-1361", "cp1361"}, // *, XEmacs
356 : // {"windows-437", "cp437"}, // XEmacs
357 : // {"windows-500", ""}, // not covered
358 : // {"windows-708", ""}, // not covered
359 : // {"windows-709", ""}, // not covered
360 : // {"windows-710", ""}, // not covered
361 : // {"windows-720", ""}, // not covered
362 : // {"windows-737", "cp737"}, // *, XEmacs
363 : // {"windows-775", "cp775"}, // XEmacs
364 : // {"windows-850", "cp850"}, // XEmacs
365 : // {"windows-852", "cp852"}, // XEmacs
366 : // {"windows-855", "cp855"}, // XEmacs
367 : // {"windows-857", "cp857"}, // XEmacs
368 : // {"windows-860", "cp860"}, // XEmacs
369 : // {"windows-861", "cp861"}, // XEmacs
370 : // {"windows-862", "cp862"}, // XEmacs
371 : // {"windows-863", "cp863"}, // XEmacs
372 : // {"windows-864", "cp864"}, // XEmacs
373 : // {"windows-865", "cp865"}, // XEmacs
374 : // {"windows-866", "cp866"}, // XEmacs
375 : // {"windows-869", "cp869"}, // XEmacs
376 : // {"windows-874", "cp874"}, // XEmacs
377 : // {"windows-875", ""}, // not covered
378 : // {"windows-932", "cp932"}, // *, XEmacs
379 : // {"windows-936", "cp936"}, // XEmacs
380 : // {"windows-949", "cp949"}, // *, XEmacs
381 : // {"windows-950", "cp950"}, // *, XEmacs
382 : // {"x-ctext", ""}, // --
383 : // {"x-ctext-with-extensions", ""}, // --
384 :
385 : {0 /* nullptr */, 0 /* nullptr */},
386 : };
387 :
388 : // ---------------------------------------------------------
389 : // Convert encoding name from emacs to mime.
390 : // ---------------------------------------------------------
391 : char *
392 242 : emacs2mime(char *emacs_enc)
393 : {
394 242 : size_t emacs_enc_len = strlen(emacs_enc);
395 242 : if (emacs_enc_len > 4
396 196 : && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos"))
397 0 : emacs_enc[emacs_enc_len - 4] = 0;
398 242 : if (emacs_enc_len > 4
399 196 : && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac"))
400 0 : emacs_enc[emacs_enc_len - 4] = 0;
401 242 : if (emacs_enc_len > 5
402 8 : && !strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix"))
403 0 : emacs_enc[emacs_enc_len - 5] = 0;
404 16554 : for (const conversion *table = emacs_to_mime;
405 16554 : table->from != 0 /* nullptr */;
406 : table++)
407 16552 : if (!strcasecmp(emacs_enc, table->from))
408 240 : return const_cast<char *>(table->to);
409 2 : return emacs_enc;
410 : }
411 :
412 : // ---------------------------------------------------------
413 : // Print out Unicode entity if value is greater than 0x7F.
414 : // ---------------------------------------------------------
415 : inline void
416 5286475 : unicode_entity(int u)
417 : {
418 5286475 : if (u < 0x80)
419 5284963 : putchar(u);
420 : else {
421 : // Handle no-break space and soft hyphen specially--they are input
422 : // characters only, not glyphs. See groff_char(7).
423 1512 : if (u == 0xA0) {
424 0 : putchar('\\');
425 0 : putchar('~');
426 : }
427 1512 : else if (u == 0xAD) {
428 0 : putchar('\\');
429 0 : putchar('%');
430 : }
431 : else
432 1512 : printf("\\[u%04X]", u);
433 : }
434 5286475 : }
435 :
436 : // ---------------------------------------------------------
437 : // Conversion functions. All functions take 'data', which
438 : // normally holds the first two lines, and a file pointer.
439 : // ---------------------------------------------------------
440 :
441 : // Conversion from ISO-8859-1 (aka Latin-1) to Unicode.
442 : void
443 2 : conversion_latin1(FILE *fp, const string &data)
444 : {
445 2 : int len = data.length();
446 : const unsigned char *ptr
447 2 : = reinterpret_cast<const unsigned char *>(data.contents());
448 48 : for (int i = 0; i < len; i++)
449 46 : unicode_entity(ptr[i]);
450 2 : int c = -1;
451 2 : while ((c = getc(fp)) != EOF)
452 0 : unicode_entity(c);
453 2 : }
454 :
455 : // A future version of groff shall support UTF-8 natively.
456 : // In this case, the UTF-8 stuff here in this file will be
457 : // moved to the troff program.
458 :
// Incremental UTF-8 decoder.  Input bytes are fed one at a time to
// add(); each decoded code point is written with unicode_entity().
459 : struct utf8 {
460 : FILE *fp; // stored by the constructor; add(), invalid(), and incomplete() do not use it
461 : unsigned char s[6]; // bytes of the sequence currently being assembled
462 : enum {
463 : FIRST = 0,
464 : SECOND,
465 : THIRD,
466 : FOURTH,
467 : FIFTH,
468 : SIXTH
469 : } byte; // index into 's' where the next input byte is stored
470 : int expected_byte_count; // sequence length announced by the lead byte
471 : bool emit_invalid_utf8_warning; // cleared once the 'invalid' diagnostic was printed
472 : bool emit_incomplete_utf8_warning; // cleared once the 'incomplete' diagnostic was printed
473 : utf8(FILE *);
474 : ~utf8(); // reports a sequence still pending as incomplete
475 : void add(unsigned char); // feed one input byte
476 : void invalid(); // emit U+FFFD for an invalid sequence, restart
477 : void incomplete(); // emit U+FFFD for a truncated sequence, restart
478 : };
479 :
480 234 : utf8::utf8(FILE *f) : fp(f), byte(FIRST), expected_byte_count(1),
481 : emit_invalid_utf8_warning(true),
482 234 : emit_incomplete_utf8_warning(true)
483 : {
484 : // empty
485 234 : }
486 :
487 468 : utf8::~utf8()
488 : {
489 234 : if (byte != FIRST)
490 0 : incomplete();
491 234 : }
492 :
493 : inline void
494 5287981 : utf8::add(unsigned char c)
495 : {
496 5287981 : s[byte] = c;
497 5287981 : if (byte == FIRST) {
498 5286401 : if (c < 0x80)
499 5284890 : unicode_entity(c);
500 1511 : else if (c < 0xC0)
501 0 : invalid();
502 1511 : else if (c < 0xE0) {
503 1406 : expected_byte_count = 2;
504 1406 : byte = SECOND;
505 : }
506 105 : else if (c < 0xF0) {
507 96 : expected_byte_count = 3;
508 96 : byte = SECOND;
509 : }
510 9 : else if (c < 0xF8) {
511 9 : expected_byte_count = 4;
512 9 : byte = SECOND;
513 : }
514 0 : else if (c < 0xFC) {
515 0 : expected_byte_count = 5;
516 0 : byte = SECOND;
517 : }
518 0 : else if (c < 0xFE) {
519 0 : expected_byte_count = 6;
520 0 : byte = SECOND;
521 : }
522 : else
523 0 : invalid();
524 5286401 : return;
525 : }
526 1580 : if (c < 0x80 || c > 0xBF) {
527 48 : incomplete();
528 48 : add(c);
529 48 : return;
530 : }
531 1532 : switch (byte) {
532 0 : case FIRST:
533 : // can't happen
534 0 : break;
535 1463 : case SECOND:
536 1463 : if (expected_byte_count == 2) {
537 1394 : if (s[0] < 0xC2)
538 0 : invalid();
539 : else
540 1394 : unicode_entity(((s[0] & 0x1F) << 6)
541 1394 : | (s[1] ^ 0x80));
542 1394 : byte = FIRST;
543 : }
544 : else
545 69 : byte = THIRD;
546 1463 : break;
547 69 : case THIRD:
548 69 : if (expected_byte_count == 3) {
549 69 : if (!(s[0] >= 0xE1 || s[1] >= 0xA0))
550 0 : invalid();
551 : else
552 69 : unicode_entity(((s[0] & 0x1F) << 12)
553 69 : | ((s[1] ^ 0x80) << 6)
554 69 : | (s[2] ^ 0x80));
555 69 : byte = FIRST;
556 : }
557 : else
558 0 : byte = FOURTH;
559 69 : break;
560 0 : case FOURTH:
561 : // We reject everything greater than 0x10FFFF.
562 0 : if (expected_byte_count == 4) {
563 0 : if (!((s[0] >= 0xF1 || s[1] >= 0x90)
564 0 : && (s[0] < 0xF4 || (s[0] == 0xF4 && s[1] < 0x90))))
565 0 : invalid();
566 : else
567 0 : unicode_entity(((s[0] & 0x07) << 18)
568 0 : | ((s[1] ^ 0x80) << 12)
569 0 : | ((s[2] ^ 0x80) << 6)
570 0 : | (s[3] ^ 0x80));
571 0 : byte = FIRST;
572 : }
573 : else
574 0 : byte = FIFTH;
575 0 : break;
576 0 : case FIFTH:
577 0 : if (expected_byte_count == 5) {
578 0 : invalid();
579 0 : byte = FIRST;
580 : }
581 : else
582 0 : byte = SIXTH;
583 0 : break;
584 0 : case SIXTH:
585 0 : invalid();
586 0 : byte = FIRST;
587 0 : break;
588 : }
589 : }
590 :
591 : // We use fprintf(stderr) instead of libgroff's debug() because we need
592 : // to output longs, and libgroff's errprint() doesn't support that.
593 :
594 : void
595 0 : utf8::invalid()
596 : {
597 0 : if (is_debugging && emit_invalid_utf8_warning) {
598 0 : fprintf(stderr, " invalid UTF-8 sequence(s) in input stream:"
599 : " replacing each such sequence with 0xFFFD\n");
600 0 : emit_invalid_utf8_warning = false;
601 : }
602 0 : unicode_entity(0xFFFD);
603 0 : byte = FIRST;
604 0 : }
605 :
606 : void
607 48 : utf8::incomplete()
608 : {
609 48 : if (is_debugging && emit_incomplete_utf8_warning) {
610 0 : fprintf(stderr, " incomplete UTF-8 sequence(s) in input stream:"
611 : " replacing each such sequence with 0xFFFD\n");
612 0 : emit_incomplete_utf8_warning = false;
613 : }
614 48 : unicode_entity(0xFFFD);
615 48 : byte = FIRST;
616 48 : }
617 :
618 : // Conversion from UTF-8 to Unicode.
619 : void
620 234 : conversion_utf8(FILE *fp, const string &data)
621 : {
622 468 : utf8 u(fp);
623 234 : int len = data.length();
624 : const unsigned char *ptr
625 234 : = reinterpret_cast<const unsigned char *>(data.contents());
626 1117 : for (int i = 0; i < len; i++)
627 883 : u.add(ptr[i]);
628 234 : int c = -1;
629 5287284 : while ((c = getc(fp)) != EOF)
630 5287050 : u.add(c);
631 468 : return;
632 : }
633 :
634 : // Conversion from cp1047 (EBCDIC) to UTF-8.
635 : void
636 0 : conversion_cp1047(FILE *fp, const string &data)
637 : {
638 : static unsigned char cp1047[] = {
639 : 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, // 0x00
640 : 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
641 : 0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, // 0x10
642 : 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
643 : 0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, // 0x20
644 : 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
645 : 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, // 0x30
646 : 0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
647 : 0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5, // 0x40
648 : 0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
649 : 0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF, // 0x50
650 : 0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
651 : 0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5, // 0x60
652 : 0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
653 : 0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF, // 0x70
654 : 0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
655 : 0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, // 0x80
656 : 0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
657 : 0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, // 0x90
658 : 0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
659 : 0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, // 0xA0
660 : 0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
661 : 0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC, // 0xB0
662 : 0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
663 : 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, // 0xC0
664 : 0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
665 : 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, // 0xD0
666 : 0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
667 : 0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, // 0xE0
668 : 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
669 : 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, // 0xF0
670 : 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F,
671 : };
672 0 : int len = data.length();
673 : const unsigned char *ptr
674 0 : = reinterpret_cast<const unsigned char *>(data.contents());
675 0 : for (int i = 0; i < len; i++)
676 0 : unicode_entity(cp1047[ptr[i]]);
677 0 : int c = -1;
678 0 : while ((c = getc(fp)) != EOF)
679 0 : unicode_entity(cp1047[c]);
680 0 : }
681 :
682 : // Locale-sensible conversion.
683 : #if HAVE_ICONV
684 : void
685 6 : conversion_iconv(FILE *fp, const string &data, char *enc)
686 : {
687 6 : iconv_t handle = iconv_open(UNICODE, enc);
688 6 : if (handle == (iconv_t)-1) {
689 0 : if (EINVAL == errno) {
690 0 : error("character encoding '%1' not supported by iconv()", enc);
691 0 : return;
692 : }
693 0 : fatal("unable to convert character encoding: %1", strerror(errno));
694 : }
695 : char inbuf[BUFSIZ];
696 : int outbuf[BUFSIZ];
697 6 : char *outptr = reinterpret_cast<char *>(outbuf);
698 6 : size_t outbytes_left = BUFSIZ * sizeof (int);
699 : // Handle 'data'.
700 6 : char *inptr = const_cast<char *>(data.contents());
701 6 : size_t inbytes_left = data.length();
702 : char *limit;
703 15 : while (inbytes_left > 0) {
704 9 : size_t status = iconv(handle,
705 : const_cast<ICONV_CONST char **>(&inptr),
706 : &inbytes_left, &outptr, &outbytes_left);
707 9 : if (status == static_cast<size_t>(-1)) {
708 3 : if (EILSEQ == errno) {
709 : // Invalid byte sequence. XXX
710 3 : inptr++;
711 3 : inbytes_left--;
712 : }
713 0 : else if (E2BIG == errno) {
714 : // Output buffer is full.
715 0 : limit = reinterpret_cast<char *>(outbuf)
716 0 : + (BUFSIZ * sizeof (int)) - outbytes_left;
717 0 : for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
718 0 : unicode_entity(*ptr);
719 0 : memmove(outbuf, outptr, outbytes_left);
720 0 : outptr = reinterpret_cast<char *>(outbuf) + outbytes_left;
721 0 : outbytes_left = BUFSIZ * sizeof (int) - outbytes_left;
722 : }
723 0 : else if (EINVAL == errno) {
724 : // 'data' ends with partial input sequence.
725 0 : memcpy(inbuf, inptr, inbytes_left);
726 0 : break;
727 : }
728 : }
729 : }
730 : // Handle 'fp' and switch to 'inbuf'.
731 : size_t read_bytes;
732 6 : char *read_start = inbuf + inbytes_left;
733 11 : while ((read_bytes = fread(read_start, 1, (BUFSIZ - inbytes_left),
734 : fp))
735 11 : > 0) {
736 5 : inptr = inbuf;
737 5 : inbytes_left += read_bytes;
738 10 : while (inbytes_left > 0) {
739 5 : size_t status = iconv(handle,
740 : const_cast<ICONV_CONST char **>(&inptr),
741 : &inbytes_left, &outptr, &outbytes_left);
742 5 : if (status == (size_t)-1) {
743 0 : if (EILSEQ == errno) {
744 : // Invalid byte sequence. XXX
745 0 : inptr++;
746 0 : inbytes_left--;
747 : }
748 0 : else if (E2BIG == errno) {
749 : // Output buffer is full.
750 0 : limit = reinterpret_cast<char *>(outbuf)
751 0 : + (BUFSIZ * sizeof (int)) - outbytes_left;
752 0 : for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
753 0 : unicode_entity(*ptr);
754 0 : memmove(outbuf, outptr, outbytes_left);
755 0 : outptr = reinterpret_cast<char *>(outbuf) + outbytes_left;
756 0 : outbytes_left = (BUFSIZ * sizeof (int)) - outbytes_left;
757 : }
758 0 : else if (EINVAL == errno) {
759 : // 'inbuf' ends with partial input sequence.
760 0 : memmove(inbuf, inptr, inbytes_left);
761 0 : break;
762 : }
763 : }
764 : }
765 5 : read_start = inbuf + inbytes_left;
766 : }
767 6 : iconv_close(handle);
768 : // XXX use ferror?
769 6 : limit = reinterpret_cast<char *>(outbuf) + (BUFSIZ * sizeof (int))
770 6 : - outbytes_left;
771 34 : for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
772 28 : unicode_entity(*ptr);
773 : }
774 : #endif /* HAVE_ICONV */
775 :
// Byte order marks recognized by get_BOM() and the encoding name each
// of them implies.  get_BOM() uses the first entry that matches, so the
// four-byte UTF-32 marks have to come before the two-byte UTF-16 marks:
// "\xFF\xFE" is a prefix of the second UTF-32 mark.
776 : static struct bom_s {
777 : int len; // number of bytes in 'str'; needed because a mark may contain null bytes
778 : const char *str; // the mark itself
779 : const char *name; // encoding name returned by get_BOM()
780 : } BOM_table[] = {
781 : {4, "\x00\x00\xFE\xFF", "UTF-32"},
782 : {4, "\xFF\xFE\x00\x00", "UTF-32"},
783 : {3, "\xEF\xBB\xBF", "UTF-8"},
784 : {2, "\xFE\xFF", "UTF-16"},
785 : {2, "\xFF\xFE", "UTF-16"},
786 : };
787 :
788 : // ---------------------------------------------------------
789 : // Handle Byte Order Mark.
790 : //
791 : // Since we have a chicken-and-egg problem it's necessary
792 : // to handle the BOM manually if it is in the data stream.
793 : // As documented in the Unicode book it is very unlikely
794 : // that any normal text file (regardless of the encoding)
795 : // starts with the bytes which represent a BOM.
796 : //
797 : // Return the BOM in string 'BOM'; 'data' then starts with
798 : // the byte after the BOM. This function reads (at most)
799 : // four bytes from the data stream.
800 : //
801 : // Return encoding if a BOM is found, and a null pointer
802 : // otherwise.
803 : // ---------------------------------------------------------
804 : const char *
805 242 : get_BOM(FILE *fp, string &BOM, string &data)
806 : {
807 : // The BOM is U+FEFF. We have thus the following possible
808 : // representations.
809 : //
810 : // UTF-8: 0xEFBBBF
811 : // UTF-16: 0xFEFF or 0xFFFE
812 : // UTF-32: 0x0000FEFF or 0xFFFE0000
813 242 : const int BOM_table_len = countof(BOM_table);
814 : char BOM_string[4];
815 242 : const char *retval = 0 /* nullptr */;
816 : int len;
817 1160 : for (len = 0; len < 4; len++) {
818 931 : int c = getc(fp);
819 931 : if (c == EOF)
820 13 : break;
821 918 : BOM_string[len] = char(c);
822 : }
823 : int i;
824 1435 : for (i = 0; i < BOM_table_len; i++) {
825 1199 : if (BOM_table[i].len <= len
826 1136 : && memcmp(BOM_string, BOM_table[i].str, BOM_table[i].len) == 0)
827 6 : break;
828 : }
829 242 : int j = 0;
830 242 : if (i < BOM_table_len) {
831 23 : for (; j < BOM_table[i].len; j++)
832 17 : BOM += BOM_string[j];
833 6 : retval = BOM_table[i].name;
834 : }
835 1143 : for (; j < len; j++)
836 901 : data += BOM_string[j];
837 242 : return retval;
838 : }
839 :
840 : // ---------------------------------------------------------
841 : // Get first two lines from input stream.
842 : //
843 : // Return string (allocated with 'new') without zero bytes
844 : // or a null pointer in case no coding tag can occur in the
845 : // data (which is stored unmodified in 'data').
846 : // ---------------------------------------------------------
847 : char *
848 7 : get_tag_lines(FILE *fp, string &data)
849 : {
850 7 : int newline_count = 0;
851 7 : int c, prev = -1;
852 : // Handle CR, LF, and CRLF as line separators.
853 25 : for (int i = 0; i < data.length(); i++) {
854 18 : c = data[i];
855 18 : if (c == '\n' || c == '\r')
856 0 : newline_count++;
857 18 : if (c == '\n' && prev == '\r')
858 0 : newline_count--;
859 18 : prev = c;
860 : }
861 7 : if (newline_count > 1)
862 0 : return 0 /* nullptr */;
863 7 : bool emit_warning = true;
864 21 : for (int lines = newline_count; lines < 2; lines++) {
865 63 : while ((c = getc(fp)) != EOF) {
866 51 : if (c == '\0' && is_debugging && emit_warning) {
867 0 : warning("null byte(s) found in input stream:"
868 : " search for coding tag might return false result");
869 0 : emit_warning = false;
870 : }
871 51 : data += char(c);
872 51 : if (c == '\n' || c == '\r')
873 : break;
874 : }
875 : // Handle CR, LF, and CRLF as line separators.
876 14 : if (c == '\r') {
877 0 : c = getc(fp);
878 0 : if (c != EOF && c != '\n')
879 0 : ungetc(c, fp);
880 : else
881 0 : data += char(c);
882 : }
883 : }
884 7 : return data.extract();
885 : }
886 :
// ---------------------------------------------------------
// Tell whether the C string 's' begins with a roff comment.
//
// Recognized openings are a control character ('.' or "'"),
// optional spaces and tabs, and then \" or \#; or \# alone
// at the very start of the string.  A null pointer and an
// empty string are not comments.
// ---------------------------------------------------------
bool
is_comment_line(char *s)
{
  if (s == 0 /* nullptr */ || s[0] == '\0')
    return false;
  const char *cursor = s;
  switch (*cursor) {
  case '.':
  case '\'':
    // Skip the control character and any blanks following it.
    for (++cursor; *cursor == ' ' || *cursor == '\t'; ++cursor)
      ;
    if (cursor[0] != '\\')
      return false;
    // Looking one byte ahead is safe: cursor[0] is not the terminator.
    return cursor[1] == '"' || cursor[1] == '#';
  case '\\':
    // Without a control character only \# starts a comment.
    return cursor[1] == '#';
  default:
    return false;
  }
}
915 :
// ---------------------------------------------------------
// Parse one entry of a local variables list given as a C
// string of the form
//
//   <variable1>: <value1>; <variable2>: <value2>; ...
//
// Blanks (spaces and tabs) before the variable and after
// ':' are skipped.  Variable and value each end at the
// first blank or delimiter; whatever follows up to the next
// delimiter is ignored.
//
// '*variable' and '*value' are pointed at static buffers
// that are overwritten by every call; each holds at most
// MAX_VAR_LEN - 1 characters.  The value is the empty
// string if the entry has no ':'.
//
// Return the position just after the ';' ending this entry,
// or a null pointer if the end of the data was reached.
// ---------------------------------------------------------
char *
get_variable_value_pair(char *d1, char **variable, char **value)
{
  static char var[MAX_VAR_LEN], val[MAX_VAR_LEN];
  char *cursor = d1;
  int n;

  *variable = var;
  *value = val;
  // Leading blanks.
  for (; *cursor == ' ' || *cursor == '\t'; cursor++)
    ;
  // Variable name, silently truncated to the buffer size.
  for (n = 0;
       n < MAX_VAR_LEN - 1 && *cursor != '\0'
       && strchr(";: \t", *cursor) == 0 /* nullptr */;
       n++, cursor++)
    var[n] = *cursor;
  var[n] = '\0';
  // Advance to ':', ';', or the end of the data.
  cursor += strcspn(cursor, ":;");
  val[0] = '\0';
  switch (*cursor) {
  case '\0':
    return 0 /* nullptr */;
  case ';':
    // Entry without a value.
    return cursor + 1;
  default:
    break;
  }
  // '*cursor' is ':'; step over it and the blanks that follow.
  for (cursor++; *cursor == ' ' || *cursor == '\t'; cursor++)
    ;
  // Value, silently truncated to the buffer size.
  for (n = 0;
       n < MAX_VAR_LEN - 1 && *cursor != '\0'
       && strchr("; \t", *cursor) == 0 /* nullptr */;
       n++, cursor++)
    val[n] = *cursor;
  val[n] = '\0';
  // Advance to ';' or the end of the data.
  cursor += strcspn(cursor, ";");
  return (*cursor == ';') ? cursor + 1 : 0 /* nullptr */;
}
964 :
965 : // ---------------------------------------------------------
966 : // Check coding tag in the read buffer.
967 : //
968 : // We search for the following line:
969 : //
970 : // <comment> ... -*-<local variables list>-*-
971 : //
972 : // ('...' might be anything).
973 : //
974 : // <comment> can be one of the following syntax forms at the
975 : // beginning of the line:
976 : //
977 : // .\" .\# '\" '\# \#
978 : //
979 : // There can be whitespace after the leading '.' or "'".
980 : //
981 : // The local variables list must occur within the first
982 : // comment block at the very beginning of the data stream.
983 : //
984 : // Within the <local variables list>, we search for
985 : //
986 : // coding: <value>
987 : //
988 : // which specifies the coding system used for the data
989 : // stream.
990 : //
991 : // Return <value> if found, and a null pointer otherwise.
992 : //
993 : // Note that null bytes in the data are skipped before
994 : // applying the algorithm. This should work even with files
995 : // encoded as UTF-16 or UTF-32 (or its siblings) in most
996 : // cases.
997 : // ---------------------------------------------------------
998 : char *
999 7 : check_coding_tag(FILE *fp, string &data)
1000 : {
1001 7 : char *inbuf = get_tag_lines(fp, data);
1002 : char *lineend;
1003 7 : for (char *p = inbuf; is_comment_line(p); p = lineend + 1) {
1004 1 : if ((lineend = strchr(p, '\n')) == 0 /* nullptr */)
1005 1 : break;
1006 0 : *lineend = 0; // switch temporarily to '\0'
1007 0 : char *d1 = strstr(p, "-*-");
1008 0 : char *d2 = 0 /* nullptr */;
1009 0 : if (d1 != 0 /* nullptr */)
1010 0 : d2 = strstr(d1 + 3, "-*-");
1011 0 : *lineend = '\n'; // restore newline
1012 0 : if (!d1 || !d2)
1013 0 : continue;
1014 0 : *d2 = 0; // switch temporarily to '\0'
1015 0 : d1 += 3;
1016 0 : while (d1 != 0 /* nullptr */) {
1017 : char *variable, *value;
1018 0 : d1 = get_variable_value_pair(d1, &variable, &value);
1019 0 : if (!strcasecmp(variable, "coding")) {
1020 0 : *d2 = '-'; // restore '-'
1021 0 : free(inbuf);
1022 0 : return value;
1023 : }
1024 : }
1025 0 : *d2 = '-'; // restore '-'
1026 : }
1027 7 : free(inbuf);
1028 7 : return 0 /* nullptr */;
1029 : }
1030 :
// ---------------------------------------------------------
// Guess the encoding of the file behind 'fp' with uchardet.
//
// The whole file is read from its beginning and handed to
// uchardet; afterwards the stream is put back to the
// position it had on entry, whether or not detection
// succeeded.  Failing to restore the position is fatal.
//
// Return the detected charset name in storage obtained from
// malloc() (the caller frees it), or a null pointer if
// nothing was detected, an error occurred, or the program
// was built without uchardet support.
// ---------------------------------------------------------
char *
detect_file_encoding(FILE *fp)
{
#ifdef HAVE_UCHARDET
  uchardet_t ud = 0 /* nullptr */;
  struct stat stat_buf;
  size_t len, read_bytes;
  char *data = 0 /* nullptr */;
  int res;
  long current_position;
  const char *charset;
  char *ret = 0 /* nullptr */;

  current_position = ftell(fp);
  if (current_position < 0) {
    // The stream has not been moved yet, so there is nothing to undo.
    error("unable to get file position: %1", strerror(errno));
    return 0 /* nullptr */;
  }
  /* Due to BOM and tag detection, we are not at the beginning of the
     file. */
  rewind(fp);
  if (fstat(fileno(fp), &stat_buf) != 0) {
    error("unable to get file status: %1", strerror(errno));
    goto end;
  }
  len = stat_buf.st_size;
  if (is_debugging)
    fprintf(stderr, " len: %lu\n", (unsigned long)len);
  if (len == 0)
    goto end;
  data = static_cast<char *>(calloc(len, 1));
  if (data == 0 /* nullptr */) {
    error("unable to allocate memory: %1", strerror(errno));
    goto end;
  }
  read_bytes = fread(data, 1, len, fp);
  if (read_bytes == 0) {
    error("unable to read from file: %1", strerror(errno));
    goto end;
  }
  ud = uchardet_new();
  // Hand over only what was actually read; a short read must not make
  // the detector analyze the zero-filled remainder of the buffer.
  res = uchardet_handle_data(ud, data, read_bytes);
  if (res != 0) {
    debug(" uchardet_handle_data: error %1\n", res);
    goto end;
  }
  if (is_debugging)
    fprintf(stderr, " uchardet read: %lu bytes\n",
	    (unsigned long)read_bytes);
  uchardet_data_end(ud);
  charset = uchardet_get_charset(ud);
  if (is_debugging) {
    if (charset != 0 /* nullptr */)
      fprintf(stderr, " charset: %s\n", charset);
    else
      fprintf(stderr, " charset is NULL\n");
  }
  /* uchardet 0.0.1 could return an empty string instead of a null
   * pointer. */
  if ((charset != 0 /* nullptr */) && (*charset != '\0')) {
    // Copy the name: 'charset' is not used after 'ud' is deleted.
    ret = static_cast<char *>(malloc(strlen(charset) + 1));
    if (ret != 0 /* nullptr */)
      strcpy(ret, charset);
    else
      error("unable to allocate memory: %1", strerror(errno));
  }

end:
  if (ud != 0 /* nullptr */)
    uchardet_delete(ud);
  free(data);
  /* We rewind back to the original position on every path; otherwise
     the caller would convert already-consumed input a second time. */
  if (fseek(fp, current_position, SEEK_SET) != 0)
    fatal("unable to seek within file: %1", strerror(errno));

  return ret;
#else /* not HAVE_UCHARDET */
  return 0 /* nullptr */;
#endif /* not HAVE_UCHARDET */
}
1102 :
1103 : // ---------------------------------------------------------
1104 : // Process an input file. If `filename` is "-", read the
1105 : // standard input stream.
1106 : //
1107 : // Return Boolean indicating successful completion.
1108 : // ---------------------------------------------------------
1109 : bool
1110 242 : do_file(const char *filename)
1111 : {
1112 : FILE *fp;
1113 484 : string BOM, data;
1114 242 : bool is_seekable = false;
1115 484 : string reported_filename;
1116 :
1117 : // TODO: Consider moving some of this into a `quoted_file_name`
1118 : // function in libgroff.
1119 242 : if (strcmp(filename, "-") == 0) {
1120 40 : fp = stdin;
1121 40 : reported_filename = string("<standard input>");
1122 : }
1123 : else {
1124 202 : fp = fopen(filename, FOPEN_RB);
1125 202 : reported_filename = "'" + string(filename) + "'";
1126 : }
1127 242 : char *c_reported_filename = reported_filename.extract();
1128 242 : if (!fp) {
1129 0 : error("cannot open %1: %2", c_reported_filename, strerror(errno));
1130 0 : free(c_reported_filename);
1131 0 : return false;
1132 : }
1133 242 : if (is_debugging) {
1134 11 : fprintf(stderr, "processing %s\n", c_reported_filename);
1135 11 : fflush(stderr);
1136 : }
1137 242 : free(c_reported_filename);
1138 242 : if (fseek(fp, 0L, SEEK_SET) == 0)
1139 201 : is_seekable = true;
1140 : else {
1141 : SET_BINARY(fileno(fp));
1142 41 : if (is_debugging)
1143 11 : fprintf(stderr, " stream is not seekable: %s\n",
1144 11 : strerror(errno));
1145 : }
1146 242 : const char *BOM_encoding = get_BOM(fp, BOM, data);
1147 : // Determine the encoding.
1148 : char *encoding;
1149 242 : bool must_free_encoding = false;
1150 242 : if (user_encoding[0]) {
1151 230 : if (is_debugging) {
1152 1 : fprintf(stderr, " user-specified encoding '%s', "
1153 : "no search for coding tag\n",
1154 : user_encoding);
1155 1 : if (BOM_encoding && strcmp(BOM_encoding, user_encoding))
1156 1 : fprintf(stderr, " but BOM in data stream implies encoding"
1157 : " '%s'!\n", BOM_encoding);
1158 : }
1159 230 : encoding = static_cast<char *>(user_encoding);
1160 : }
1161 12 : else if (BOM_encoding != 0 /* nullptr */) {
1162 5 : if (is_debugging)
1163 5 : fprintf(stderr, " found BOM, no search for coding tag\n");
1164 5 : encoding = const_cast<char *>(BOM_encoding);
1165 : }
1166 : else {
1167 : // 'check_coding_tag' returns a pointer to a static array (or a null
1168 : // pointer).
1169 7 : char *file_encoding = check_coding_tag(fp, data);
1170 7 : if (!file_encoding) {
1171 7 : if (is_debugging)
1172 5 : fprintf(stderr, " no coding tag\n");
1173 7 : if (is_seekable)
1174 0 : file_encoding = detect_file_encoding(fp);
1175 7 : if (!file_encoding) {
1176 7 : if (is_debugging)
1177 5 : fprintf(stderr,
1178 : " could not detect encoding with uchardet\n");
1179 7 : file_encoding = fallback_encoding;
1180 : }
1181 : else
1182 0 : must_free_encoding = true;
1183 : }
1184 : else
1185 0 : if (is_debugging)
1186 0 : fprintf(stderr, " coding tag: '%s'\n", file_encoding);
1187 7 : encoding = file_encoding;
1188 : }
1189 242 : strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
1190 242 : encoding_string[MAX_VAR_LEN - 1] = '\0';
1191 242 : if (must_free_encoding)
1192 0 : free(encoding);
1193 242 : encoding = encoding_string;
1194 : // Translate from MIME/Emacs encoding names to locale encoding names.
1195 242 : encoding = emacs2mime(encoding_string);
1196 242 : if (encoding[0] == '\0') {
1197 0 : error("non-portable encoding '%1' not supported", encoding_string);
1198 0 : return false;
1199 : }
1200 242 : if (is_debugging)
1201 11 : fprintf(stderr, " encoding used: '%s'\n", encoding);
1202 242 : if (!want_raw_output) {
1203 484 : string fn(filename);
1204 242 : fn += '\0';
1205 242 : normalize_file_name_for_lf_request(fn);
1206 242 : (void) printf(".lf 1 %s%s\n", ('"' == filename[0]) ? "" : "\"",
1207 : fn.contents());
1208 : }
1209 242 : bool was_successful = true;
1210 : // Call converter (converters write to stdout).
1211 242 : if (!strcasecmp(encoding, "ISO-8859-1"))
1212 2 : conversion_latin1(fp, BOM + data);
1213 240 : else if (!strcasecmp(encoding, "UTF-8"))
1214 234 : conversion_utf8(fp, data);
1215 6 : else if (!strcasecmp(encoding, "cp1047"))
1216 0 : conversion_cp1047(fp, BOM + data);
1217 : else {
1218 : #if HAVE_ICONV
1219 6 : conversion_iconv(fp, BOM + data, encoding);
1220 : #else
1221 : error("encoding system '%1' not supported", encoding);
1222 : was_successful = false;
1223 : #endif /* HAVE_ICONV */
1224 : }
1225 242 : if (fp != stdin)
1226 202 : fclose(fp);
1227 242 : return was_successful;
1228 : }
1229 :
1230 : // ---------------------------------------------------------
1231 : // Print usage.
1232 : // ---------------------------------------------------------
1233 : void
1234 0 : usage(FILE *stream)
1235 : {
1236 0 : fprintf(stream,
1237 : "usage: %s [-dr] [-D fallback-encoding] [-e encoding] [file ...]\n"
1238 : "usage: %s {-v | --version}\n"
1239 : "usage: %s {-h | --help}\n",
1240 : program_name, program_name, program_name);
1241 0 : if (stdout == stream)
1242 0 : fprintf(stream,
1243 : "\n"
1244 : "Read each file, convert its encoded characters to a form GNU"
1245 : " troff(1)\n"
1246 : "can interpret, and send the result to the standard output stream.\n"
1247 : "The default fallback encoding is '%s'. See the preconv(1) manual"
1248 : " page.\n",
1249 : fallback_encoding);
1250 0 : }
1251 :
1252 : // ---------------------------------------------------------
1253 : // Main routine.
1254 : // ---------------------------------------------------------
1255 : int
1256 45 : main(int argc, char **argv)
1257 : {
1258 45 : program_name = argv[0];
1259 : // Determine the fallback encoding. This must be done before
1260 : // getopt() is called since the usage message shows the fallback
1261 : // encoding.
1262 45 : setlocale(LC_ALL, "");
1263 45 : char *locale = getlocale(LC_CTYPE);
1264 45 : if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX"))
1265 28 : strcpy(fallback_encoding, "latin1");
1266 : else {
1267 17 : strncpy(fallback_encoding, locale_charset(), MAX_VAR_LEN - 1);
1268 17 : fallback_encoding[MAX_VAR_LEN - 1] = 0;
1269 : }
1270 :
1271 45 : program_name = argv[0];
1272 : int opt;
1273 : static const struct option long_options[] = {
1274 : { "help", no_argument, 0 /* nullptr */, 'h' },
1275 : { "version", no_argument, 0 /* nullptr */, 'v' },
1276 : { 0 /* nullptr */, 0, 0 /* nullptr */, 0 }
1277 : };
1278 : // Parse the command-line options.
1279 89 : while ((opt = getopt_long(argc, argv, ":dD:e:hrv", long_options,
1280 : 0 /* nullptr */))
1281 89 : != EOF)
1282 45 : switch (opt) {
1283 1 : case 'v':
1284 1 : printf("GNU preconv (groff) version %s %s iconv support and %s"
1285 : " uchardet support\n",
1286 : Version_string,
1287 : #ifdef HAVE_ICONV
1288 : "with",
1289 : #else
1290 : "without",
1291 : #endif /* HAVE_ICONV */
1292 : #ifdef HAVE_UCHARDET
1293 : "with"
1294 : #else
1295 : "without"
1296 : #endif /* HAVE_UCHARDET */
1297 : );
1298 1 : exit(EXIT_SUCCESS);
1299 : break;
1300 11 : case 'd':
1301 11 : is_debugging = true;
1302 11 : break;
1303 32 : case 'e':
1304 32 : if (optarg != 0 /* nullptr */) {
1305 32 : strncpy(user_encoding, optarg, MAX_VAR_LEN - 1);
1306 32 : user_encoding[MAX_VAR_LEN - 1] = 0;
1307 : }
1308 : else
1309 0 : user_encoding[0] = 0;
1310 32 : break;
1311 1 : case 'D':
1312 1 : if (optarg != 0 /* nullptr */) {
1313 1 : strncpy(fallback_encoding, optarg, MAX_VAR_LEN - 1);
1314 1 : fallback_encoding[MAX_VAR_LEN - 1] = 0;
1315 : }
1316 1 : break;
1317 0 : case 'r':
1318 0 : want_raw_output = true;
1319 0 : break;
1320 0 : case 'h':
1321 0 : usage(stdout);
1322 0 : exit(EXIT_SUCCESS);
1323 : break;
1324 0 : case '?':
1325 0 : if (optopt != 0)
1326 0 : error("unrecognized command-line option '%1'", char(optopt));
1327 : else
1328 0 : error("unrecognized command-line option '%1'",
1329 0 : argv[(optind - 1)]);
1330 0 : usage(stderr);
1331 0 : exit(2);
1332 : break;
1333 0 : case ':':
1334 0 : error("command-line option '%1' requires an argument",
1335 0 : char(optopt));
1336 0 : usage(stderr);
1337 0 : exit(2);
1338 : break;
1339 0 : default:
1340 0 : assert(0 == "unhandled getopt_long return value");
1341 : }
1342 44 : int nbad = 0;
1343 44 : if (is_debugging)
1344 11 : fprintf(stderr, "fallback encoding: '%s'\n", fallback_encoding);
1345 44 : if (optind >= argc)
1346 20 : nbad += !do_file("-");
1347 : else
1348 246 : for (int i = optind; i < argc; i++)
1349 222 : nbad += !do_file(argv[i]);
1350 44 : if (ferror(stdout))
1351 0 : fatal("error status on standard output stream");
1352 44 : if (fflush(stdout) < 0)
1353 0 : fatal("cannot flush standard output stream: %1", strerror(errno));
1354 44 : return (nbad != 0);
1355 : }
1356 :
1357 : // Local Variables:
1358 : // fill-column: 72
1359 : // mode: C++
1360 : // End:
1361 : // vim: set cindent noexpandtab shiftwidth=2 textwidth=72:
|