LCOV - code coverage report
Current view: top level - preproc/preconv - preconv.cpp (source / functions) Hit Total Coverage
Test: GNU roff Lines: 274 504 54.4 %
Date: 2026-01-16 17:51:41 Functions: 15 20 75.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* Copyright (C) 2005-2025 Free Software Foundation, Inc.
       2             :      Written by Werner Lemberg (wl@gnu.org)
       3             : 
       4             : This file is part of groff, the GNU roff typesetting system.
       5             : 
       6             : groff is free software; you can redistribute it and/or modify it under
       7             : the terms of the GNU General Public License as published by the Free
       8             : Software Foundation, either version 3 of the License, or
       9             : (at your option) any later version.
      10             : 
      11             : groff is distributed in the hope that it will be useful, but WITHOUT ANY
      12             : WARRANTY; without even the implied warranty of MERCHANTABILITY or
      13             : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      14             : for more details.
      15             : 
      16             : You should have received a copy of the GNU General Public License
      17             : along with this program.  If not, see <http://www.gnu.org/licenses/>. */
      18             : 
      19             : #ifdef HAVE_CONFIG_H
      20             : #include <config.h>
      21             : #endif
      22             : 
      23             : #include <assert.h>
      24             : #include <errno.h>
      25             : #if HAVE_ICONV
      26             : # include <iconv.h> // iconv(), iconv_close(), iconv_open()
      27             : # ifdef WORDS_BIGENDIAN
      28             : #  define UNICODE "UTF-32BE"
      29             : # else
      30             : #  define UNICODE "UTF-32LE"
      31             : # endif
      32             : #endif
      33             : #include <locale.h> // setlocale()
      34             : #include <stdcountof.h>
      35             : #include <stdio.h> // EOF, FILE, fclose(), ferror(), fflush(), fileno(),
      36             :                    // fopen(), fprintf(), fread(), fseek(), ftell(),
      37             :                    // getc(), printf(), putchar(), rewind(), SEEK_SET,
      38             :                    // stderr, stdin, stdout, ungetc()
      39             : #include <stdlib.h> // calloc(), exit(), EXIT_SUCCESS, free(), malloc()
      40             : #include <string.h> // strerror(), strlen()
      41             : #include <sys/stat.h> // fstat(), stat
      42             : #ifdef HAVE_UCHARDET
      43             : #include <uchardet/uchardet.h>
      44             : #endif
      45             : 
      46             : #include <getopt.h> // getopt_long()
      47             : 
      48             : #include "lib.h"
      49             : 
      50             : #include "errarg.h"
      51             : #include "error.h"
      52             : #include "localcharset.h"
      53             : #include "nonposix.h"
      54             : #include "stringclass.h" // must precede lf.h
      55             : #include "lf.h"
      56             : 
      57             : #define MAX_VAR_LEN 100
      58             : 
      59             : extern "C" const char *Version_string;
      60             : 
      61             : char fallback_encoding[MAX_VAR_LEN]; // NOTE(review): presumably the name of the fallback encoding -- confirm
      62             : char user_encoding[MAX_VAR_LEN]; // NOTE(review): presumably the encoding name supplied by the user -- confirm
      63             : char encoding_string[MAX_VAR_LEN]; // NOTE(review): purpose not visible in this excerpt
      64             : bool is_debugging = false; // off unless set elsewhere
      65             : bool want_raw_output = false; // off unless set elsewhere
      66             : 
      67             : struct conversion { // one row of a name-translation table
      68             :   const char *from; // name to look up; emacs2mime() compares it ignoring case; null in the row that ends a table
      69             :   const char *to; // name handed back by emacs2mime() when 'from' matches
      70             : };
      71             : 
      72             : // The official list of MIME tags can be found at
      73             : //
      74             : //   http://www.iana.org/assignments/character-sets
      75             : //
      76             : // For encodings which don't have a MIME tag we use GNU iconv's encoding
      77             : // names (which also work with the portable GNU libiconv package).  They
      78             : // are marked with '*'.
      79             : //
      80             : // Encodings specific to XEmacs and Emacs are marked as such; no mark
      81             : // means that they are used by both Emacs and XEmacs.
      82             : //
      83             : // Encodings marked with '--' are special to Emacs, XEmacs, or other
      84             : // applications and shouldn't be used for data exchange.
      85             : //
      86             : // 'Not covered' means that the encoding can be handled neither by GNU
      87             : // iconv nor by libiconv, or that only one of them supports it.
      88             : //
      89             : // A special case is the VIQR encoding: despite having a MIME tag, it is
      90             : // missing from both libiconv 1.10 and iconv (as shipped with GNU libc 2.3.6).
      91             : //
      92             : // Finally, we add all aliases of GNU iconv for 'ascii', 'latin1', and
      93             : // 'utf8' to catch those encoding names before iconv is called.
      94             : //
      95             : // Note that most entries are commented out -- only a small, (rather)
      96             : // reliable and stable subset of encodings is recognized (for coding
      97             : // tags) which are still in greater use today (January 2006).  Most
      98             : // notably, all Windows-specific encodings are not selected because they
      99             : // lack stability: Microsoft has changed the mappings instead of
     100             : // creating new versions.
     101             : //
     102             : // Please contact the groff list if you find the selection inadequate.
     103             : 
     104             : static const conversion
     105             : emacs_to_mime[] = {
     106             :   {"ascii",                           "US-ASCII"},  // Emacs
     107             :   {"big5",                            "Big5"},
     108             :   {"chinese-big5",                    "Big5"},      // Emacs
     109             :   {"chinese-euc",                     "GB2312"},    // XEmacs
     110             :   {"chinese-iso-8bit",                        "GB2312"},    // Emacs
     111             :   {"cn-big5",                         "Big5"},
     112             :   {"cn-gb",                           "GB2312"},    // Emacs
     113             :   {"cn-gb-2312",                      "GB2312"},
     114             :   {"cp878",                           "KOI8-R"},    // Emacs
     115             :   {"cp1047",                          "CP1047"},    // EBCDIC
     116             :   {"csascii",                         "US-ASCII"},  // alias
     117             :   {"csisolatin1",                     "ISO-8859-1"},        // alias
     118             :   {"cyrillic-iso-8bit",                       "ISO-8859-5"},        // Emacs
     119             :   {"cyrillic-koi8",                   "KOI8-R"},    // not KOI8!, Emacs
     120             :   {"euc-china",                               "GB2312"},    // Emacs
     121             :   {"euc-cn",                          "GB2312"},    // Emacs
     122             :   {"euc-japan",                               "EUC-JP"},
     123             :   {"euc-japan-1990",                  "EUC-JP"},    // Emacs
     124             :   {"euc-jp",                          "EUC-JP"},
     125             :   {"euc-korea",                               "EUC-KR"},
     126             :   {"euc-kr",                          "EUC-KR"},
     127             :   {"gb2312",                          "GB2312"},
     128             :   {"greek-iso-8bit",                  "ISO-8859-7"},
     129             :   {"iso-10646/utf8",                  "UTF-8"},     // alias
     130             :   {"iso-10646/utf-8",                 "UTF-8"},     // alias
     131             :   {"iso-8859-1",                      "ISO-8859-1"},
     132             :   {"iso-8859-13",                     "ISO-8859-13"},       // Emacs
     133             :   {"iso-8859-15",                     "ISO-8859-15"},
     134             :   {"iso-8859-2",                      "ISO-8859-2"},
     135             :   {"iso-8859-5",                      "ISO-8859-5"},
     136             :   {"iso-8859-7",                      "ISO-8859-7"},
     137             :   {"iso-8859-9",                      "ISO-8859-9"},
     138             :   {"iso-latin-1",                     "ISO-8859-1"},
     139             :   {"iso-latin-2",                     "ISO-8859-2"},        // Emacs
     140             :   {"iso-latin-5",                     "ISO-8859-9"},        // Emacs
     141             :   {"iso-latin-7",                     "ISO-8859-13"},       // Emacs
     142             :   {"iso-latin-9",                     "ISO-8859-15"},       // Emacs
     143             :   {"japanese-iso-8bit",                       "EUC-JP"},    // Emacs
     144             :   {"japanese-euc",                    "EUC-JP"},    // XEmacs
     145             :   {"jis8",                            "EUC-JP"},    // XEmacs
     146             :   {"koi8",                            "KOI8-R"},    // not KOI8!, Emacs
     147             :   {"koi8-r",                          "KOI8-R"},
     148             :   {"korean-euc",                      "EUC-KR"},    // XEmacs
     149             :   {"korean-iso-8bit",                 "EUC-KR"},    // Emacs
     150             :   {"latin1",                          "ISO-8859-1"},  // alias
     151             :   {"latin-0",                         "ISO-8859-15"},       // Emacs
     152             :   {"latin-1",                         "ISO-8859-1"},        // Emacs
     153             :   {"latin-2",                         "ISO-8859-2"},        // Emacs
     154             :   {"latin-5",                         "ISO-8859-9"},        // Emacs
     155             :   {"latin-7",                         "ISO-8859-13"},       // Emacs
     156             :   {"latin-9",                         "ISO-8859-15"},       // Emacs
     157             :   {"mule-utf-16",                     "UTF-16"},    // Emacs
     158             :   {"mule-utf-16be",                   "UTF-16BE"},  // Emacs
     159             :   {"mule-utf-16-be",                  "UTF-16BE"},  // Emacs
     160             :   {"mule-utf-16be-with-signature",    "UTF-16"},    // Emacs, not UTF-16BE
     161             :   {"mule-utf-16le",                   "UTF-16LE"},  // Emacs
     162             :   {"mule-utf-16-le",                  "UTF-16LE"},  // Emacs
     163             :   {"mule-utf-16le-with-signature",    "UTF-16"},    // Emacs, not UTF-16LE
     164             :   {"mule-utf-8",                      "UTF-8"},     // Emacs
     165             :   {"us-ascii",                                "US-ASCII"},  // Emacs
     166             :   {"utf8",                            "UTF-8"},     // alias
     167             :   {"utf-16",                          "UTF-16"},    // Emacs
     168             :   {"utf-16be",                                "UTF-16BE"},  // Emacs
     169             :   {"utf-16-be",                               "UTF-16BE"},  // Emacs
     170             :   {"utf-16be-with-signature",         "UTF-16"},    // Emacs, not UTF-16BE
     171             :   {"utf-16-be-with-signature",                "UTF-16"},    // Emacs, not UTF-16BE
     172             :   {"utf-16le",                                "UTF-16LE"},  // Emacs
     173             :   {"utf-16-le",                               "UTF-16LE"},  // Emacs
     174             :   {"utf-16le-with-signature",         "UTF-16"},    // Emacs, not UTF-16LE
     175             :   {"utf-16-le-with-signature",                "UTF-16"},    // Emacs, not UTF-16LE
     176             :   {"utf-8",                           "UTF-8"},     // Emacs
     177             : 
     178             : //  {"alternativnyj",                 ""},          // ?
     179             : //  {"arabic-iso-8bit",                       "ISO-8859-6"},        // Emacs
     180             : //  {"binary",                                ""},          // --
     181             : //  {"chinese-hz",                    "HZ-GB-2312"},        // Emacs
     182             : //  {"chinese-iso-7bit",              "ISO-2022-CN"},       // Emacs
     183             : //  {"chinese-iso-8bit-with-esc",     ""},          // --
     184             : //  {"compound-text",                 ""},          // --
     185             : //  {"compound-text-with-extension",  ""},          // --
     186             : //  {"cp1125",                                "cp1125"},    // *
     187             : //  {"cp1250",                                "windows-1250"},// Emacs
     188             : //  {"cp1251",                                "windows-1251"},// Emacs
     189             : //  {"cp1252",                                "windows-1252"},// Emacs
     190             : //  {"cp1253",                                "windows-1253"},// Emacs
     191             : //  {"cp1254",                                "windows-1254"},// Emacs
     192             : //  {"cp1255",                                "windows-1255"},// Emacs
     193             : //  {"cp1256",                                "windows-1256"},// Emacs
     194             : //  {"cp1257",                                "windows-1257"},// Emacs
     195             : //  {"cp1258",                                "windows-1258"},// Emacs
     196             : //  {"cp437",                         "cp437"},     // Emacs
     197             : //  {"cp720",                         ""},          // not covered
     198             : //  {"cp737",                         "cp737"},     // *, Emacs
     199             : //  {"cp775",                         "cp775"},     // Emacs
     200             : //  {"cp850",                         "cp850"},     // Emacs
     201             : //  {"cp851",                         "cp851"},     // Emacs
     202             : //  {"cp852",                         "cp852"},     // Emacs
     203             : //  {"cp855",                         "cp855"},     // Emacs
     204             : //  {"cp857",                         "cp857"},     // Emacs
     205             : //  {"cp860",                         "cp860"},     // Emacs
     206             : //  {"cp861",                         "cp861"},     // Emacs
     207             : //  {"cp862",                         "cp862"},     // Emacs
     208             : //  {"cp863",                         "cp863"},     // Emacs
     209             : //  {"cp864",                         "cp864"},     // Emacs
     210             : //  {"cp865",                         "cp865"},     // Emacs
     211             : //  {"cp866",                         "cp866"},     // Emacs
     212             : //  {"cp866u",                                "cp1125"},    // *, Emacs
     213             : //  {"cp869",                         "cp869"},     // Emacs
     214             : //  {"cp874",                         "cp874"},     // *, Emacs
     215             : //  {"cp932",                         "cp932"},     // *, Emacs
     216             : //  {"cp936",                         "cp936"},     // Emacs
     217             : //  {"cp949",                         "cp949"},     // *, Emacs
     218             : //  {"cp950",                         "cp950"},     // *, Emacs
     219             : //  {"ctext",                         ""},          // --
     220             : //  {"ctext-no-compositions",         ""},          // --
     221             : //  {"ctext-with-extensions",         ""},          // --
     222             : //  {"cyrillic-alternativnyj",                ""},          // ?, Emacs
     223             : //  {"cyrillic-iso-8bit-with-esc",    ""},          // --
     224             : //  {"cyrillic-koi8-t",                       "KOI8-T"},    // *, Emacs
     225             : //  {"devanagari",                    ""},          // not covered
     226             : //  {"dos",                           ""},          // --
     227             : //  {"emacs-mule",                    ""},          // --
     228             : //  {"euc-jisx0213",                  "EUC-JISX0213"},// *, XEmacs?
     229             : //  {"euc-jisx0213-with-esc",         ""},          // XEmacs?
     230             : //  {"euc-taiwan",                    "EUC-TW"},    // *, Emacs
     231             : //  {"euc-tw",                                "EUC-TW"},    // *, Emacs
     232             : //  {"georgian-ps",                   "GEORGIAN-PS"},       // *, Emacs
     233             : //  {"greek-iso-8bit-with-esc",               ""},          // --
     234             : //  {"hebrew-iso-8bit",                       "ISO-8859-8"},        // Emacs
     235             : //  {"hebrew-iso-8bit-with-esc",      ""},          // --
     236             : //  {"hz",                            "HZ-GB-2312"},
     237             : //  {"hz-gb-2312",                    "HZ-GB-2312"},
     238             : //  {"in-is13194",                    ""},          // not covered
     239             : //  {"in-is13194-devanagari",         ""},          // not covered
     240             : //  {"in-is13194-with-esc",           ""},          // --
     241             : //  {"iso-2022-7",                    ""},          // XEmacs?
     242             : //  {"iso-2022-7bit",                 ""},          // --
     243             : //  {"iso-2022-7bit-lock",            ""},          // --
     244             : //  {"iso-2022-7bit-lock-ss2",                ""},          // --
     245             : //  {"iso-2022-7bit-ss2",             ""},          // --
     246             : //  {"iso-2022-8",                    ""},          // XEmacs?
     247             : //  {"iso-2022-8bit",                 ""},          // XEmacs?
     248             : //  {"iso-2022-8bit-lock",            ""},          // XEmacs?
     249             : //  {"iso-2022-8bit-lock-ss2",                ""},          // XEmacs?
     250             : //  {"iso-2022-8bit-ss2",             ""},          // --
     251             : //  {"iso-2022-cjk",                  ""},          // --
     252             : //  {"iso-2022-cn",                   "ISO-2022-CN"},       // Emacs
     253             : //  {"iso-2022-cn-ext",                       "ISO-2022-CN-EXT"},// Emacs
     254             : //  {"iso-2022-int-1",                        ""},          // --
     255             : //  {"iso-2022-jp",                   "ISO-2022-JP"},
     256             : //  {"iso-2022-jp-1978-irv",          "ISO-2022-JP"},
     257             : //  {"iso-2022-jp-2",                 "ISO-2022-JP-2"},
     258             : //  {"iso-2022-jp-3",                 "ISO-2022-JP-3"},// *, XEmacs?
     259             : //  {"iso-2022-jp-3-compatible",      ""},          // XEmacs?
     260             : //  {"iso-2022-jp-3-strict",          "ISO-2022-JP-3"},// *, XEmacs?
     261             : //  {"iso-2022-kr",                   "ISO-2022-KR"},
     262             : //  {"iso-2022-lock",                 ""},          // XEmacs?
     263             : //  {"iso-8859-10",                   "ISO-8859-10"},       // Emacs
     264             : //  {"iso-8859-11",                   "ISO-8859-11"},       // *, Emacs
     265             : //  {"iso-8859-14",                   "ISO-8859-14"},       // Emacs
     266             : //  {"iso-8859-16",                   "ISO-8859-16"},
     267             : //  {"iso-8859-3",                    "ISO-8859-3"},
     268             : //  {"iso-8859-4",                    "ISO-8859-4"},
     269             : //  {"iso-8859-6",                    "ISO-8859-6"},
     270             : //  {"iso-8859-8",                    "ISO-8859-8"},
     271             : //  {"iso-8859-8-e",                  "ISO-8859-8"},
     272             : //  {"iso-8859-8-i",                  "ISO-8859-8"},        // Emacs
     273             : //  {"iso-latin-10",                  "ISO-8859-16"},       // Emacs
     274             : //  {"iso-latin-1-with-esc",          ""},          // --
     275             : //  {"iso-latin-2-with-esc",          ""},          // --
     276             : //  {"iso-latin-3",                   "ISO-8859-3"},        // Emacs
     277             : //  {"iso-latin-3-with-esc",          ""},          // --
     278             : //  {"iso-latin-4",                   "ISO-8859-4"},        // Emacs
     279             : //  {"iso-latin-4-with-esc",          ""},          // --
     280             : //  {"iso-latin-5-with-esc",          ""},          // --
     281             : //  {"iso-latin-6",                   "ISO-8859-10"},       // Emacs
     282             : //  {"iso-latin-8",                   "ISO-8859-14"},       // Emacs
     283             : //  {"iso-safe",                              ""},          // --
     284             : //  {"japanese-iso-7bit-1978-irv",    "ISO-2022-JP"},       // Emacs
     285             : //  {"japanese-iso-8bit-with-esc",    ""},          // --
     286             : //  {"japanese-shift-jis",            "Shift_JIS"}, // Emacs
     287             : //  {"japanese-shift-jisx0213",               ""},          // XEmacs?
     288             : //  {"jis7",                          "ISO-2022-JP"},       // Xemacs
     289             : //  {"junet",                         "ISO-2022-JP"},
     290             : //  {"koi8-t",                                "KOI8-T"},    // *, Emacs
     291             : //  {"koi8-u",                                "KOI8-U"},    // Emacs
     292             : //  {"korean-iso-7bit-lock",          "ISO-2022-KR"},
     293             : //  {"korean-iso-8bit-with-esc",      ""},          // --
     294             : //  {"lao",                           ""},          // not covered
     295             : //  {"lao-with-esc",                  ""},          // --
     296             : //  {"latin-10",                      "ISO-8859-16"},       // Emacs
     297             : //  {"latin-3",                               "ISO-8859-3"},        // Emacs
     298             : //  {"latin-4",                               "ISO-8859-4"},        // Emacs
     299             : //  {"latin-6",                               "ISO-8859-10"},       // Emacs
     300             : //  {"latin-8",                               "ISO-8859-14"},       // Emacs
     301             : //  {"mac",                           ""},          // --
     302             : //  {"mac-roman",                     "MACINTOSH"}, // Emacs
     303             : //  {"mik",                           ""},          // not covered
     304             : //  {"next",                          "NEXTSTEP"},  // *, Emacs
     305             : //  {"no-conversion",                 ""},          // --
     306             : //  {"old-jis",                               "ISO-2022-JP"},
     307             : //  {"pt154",                         "PT154"},     // Emacs
     308             : //  {"raw-text",                      ""},          // --
     309             : //  {"ruscii",                                "cp1125"},    // *, Emacs
     310             : //  {"shift-jis",                     "Shift_JIS"}, // XEmacs
     311             : //  {"shift_jis",                     "Shift_JIS"},
     312             : //  {"shift_jisx0213",                        "Shift_JISX0213"},// *, XEmacs?
     313             : //  {"sjis",                          "Shift_JIS"}, // Emacs
     314             : //  {"tcvn",                          "TCVN"},      // *, Emacs
     315             : //  {"tcvn-5712",                     "TCVN"},      // *, Emacs
     316             : //  {"thai-tis620",                   "TIS-620"},
     317             : //  {"thai-tis620-with-esc",          ""},          // --
     318             : //  {"th-tis620",                     "TIS-620"},
     319             : //  {"tibetan",                               ""},          // not covered
     320             : //  {"tibetan-iso-8bit",              ""},          // not covered
     321             : //  {"tibetan-iso-8bit-with-esc",     ""},          // --
     322             : //  {"tis-620",                               "TIS-620"},
     323             : //  {"tis620",                                "TIS-620"},
     324             : //  {"undecided",                     ""},          // --
     325             : //  {"unix",                          ""},          // --
     326             : //  {"utf-7",                         "UTF-7"},     // Emacs
     327             : //  {"utf-7-safe",                    ""},          // XEmacs?
     328             : //  {"utf-8-ws",                      "UTF-8"},     // XEmacs?
     329             : //  {"vietnamese-tcvn",                       "TCVN"},      // *, Emacs
     330             : //  {"vietnamese-viqr",                       "VIQR"},      // not covered
     331             : //  {"vietnamese-viscii",             "VISCII"},
     332             : //  {"vietnamese-vscii",              ""},          // not covered
     333             : //  {"viqr",                          "VIQR"},      // not covered
     334             : //  {"viscii",                                "VISCII"},
     335             : //  {"vscii",                         ""},          // not covered
     336             : //  {"windows-037",                   ""},          // not covered
     337             : //  {"windows-10000",                 ""},          // not covered
     338             : //  {"windows-10001",                 ""},          // not covered
     339             : //  {"windows-10006",                 ""},          // not covered
     340             : //  {"windows-10007",                 ""},          // not covered
     341             : //  {"windows-10029",                 ""},          // not covered
     342             : //  {"windows-10079",                 ""},          // not covered
     343             : //  {"windows-10081",                 ""},          // not covered
     344             : //  {"windows-1026",                  ""},          // not covered
     345             : //  {"windows-1200",                  ""},          // not covered
     346             : //  {"windows-1250",                  "windows-1250"},
     347             : //  {"windows-1251",                  "windows-1251"},
     348             : //  {"windows-1252",                  "windows-1252"},
     349             : //  {"windows-1253",                  "windows-1253"},
     350             : //  {"windows-1254",                  "windows-1254"},
     351             : //  {"windows-1255",                  "windows-1255"},
     352             : //  {"windows-1256",                  "windows-1256"},
     353             : //  {"windows-1257",                  "windows-1257"},
     354             : //  {"windows-1258",                  "windows-1258"},
     355             : //  {"windows-1361",                  "cp1361"},    // *, XEmacs
     356             : //  {"windows-437",                   "cp437"},     // XEmacs
     357             : //  {"windows-500",                   ""},          // not covered
     358             : //  {"windows-708",                   ""},          // not covered
     359             : //  {"windows-709",                   ""},          // not covered
     360             : //  {"windows-710",                   ""},          // not covered
     361             : //  {"windows-720",                   ""},          // not covered
     362             : //  {"windows-737",                   "cp737"},     // *, XEmacs
     363             : //  {"windows-775",                   "cp775"},     // XEmacs
     364             : //  {"windows-850",                   "cp850"},     // XEmacs
     365             : //  {"windows-852",                   "cp852"},     // XEmacs
     366             : //  {"windows-855",                   "cp855"},     // XEmacs
     367             : //  {"windows-857",                   "cp857"},     // XEmacs
     368             : //  {"windows-860",                   "cp860"},     // XEmacs
     369             : //  {"windows-861",                   "cp861"},     // XEmacs
     370             : //  {"windows-862",                   "cp862"},     // XEmacs
     371             : //  {"windows-863",                   "cp863"},     // XEmacs
     372             : //  {"windows-864",                   "cp864"},     // XEmacs
     373             : //  {"windows-865",                   "cp865"},     // XEmacs
     374             : //  {"windows-866",                   "cp866"},     // XEmacs
     375             : //  {"windows-869",                   "cp869"},     // XEmacs
     376             : //  {"windows-874",                   "cp874"},     // XEmacs
     377             : //  {"windows-875",                   ""},          // not covered
     378             : //  {"windows-932",                   "cp932"},     // *, XEmacs
     379             : //  {"windows-936",                   "cp936"},     // XEmacs
     380             : //  {"windows-949",                   "cp949"},     // *, XEmacs
     381             : //  {"windows-950",                   "cp950"},     // *, XEmacs
     382             : //  {"x-ctext",                               ""},          // --
     383             : //  {"x-ctext-with-extensions",               ""},          // --
     384             : 
     385             :   {0 /* nullptr */,                             0 /* nullptr */},
     386             : };
     387             : 
     388             : // ---------------------------------------------------------
     389             : // Convert encoding name from emacs to mime.
                       //
                       // If 'emacs_enc' ends in "-dos", "-mac", or "-unix" (compared
                       // ignoring case) and is longer than that suffix, the suffix is
                       // cut off IN PLACE, so the caller's buffer is modified.  The
                       // resulting name is then looked up, again ignoring case, in
                       // emacs_to_mime[].
                       //
                       // Returns the 'to' member of the first matching table row.  That
                       // pointer refers to the table's constant data even though the
                       // return type is non-const.  If no row matches, returns
                       // 'emacs_enc' itself (possibly shortened as described above).
     390             : // ---------------------------------------------------------
     391             : char *
     392         242 : emacs2mime(char *emacs_enc)
     393             : {
     394         242 :   size_t emacs_enc_len = strlen(emacs_enc); // measured once; not refreshed after a suffix is cut
     395         242 :   if (emacs_enc_len > 4
     396         196 :       && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos"))
     397           0 :     emacs_enc[emacs_enc_len - 4] = 0; // terminate the string where the suffix began
                       // The two tests below still use the original length.  After a
                       // cut, the bytes they inspect begin at or one byte before the
                       // new terminator, so they cannot match a second suffix.
     398         242 :   if (emacs_enc_len > 4
     399         196 :       && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac"))
     400           0 :     emacs_enc[emacs_enc_len - 4] = 0;
     401         242 :   if (emacs_enc_len > 5
     402           8 :       && !strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix"))
     403           0 :     emacs_enc[emacs_enc_len - 5] = 0;
     404       16554 :   for (const conversion *table = emacs_to_mime;
     405       16554 :        table->from != 0 /* nullptr */; // a null 'from' marks the end of the table
     406             :        table++)
     407       16552 :     if (!strcasecmp(emacs_enc, table->from))
     408         240 :       return const_cast<char *>(table->to); // constness is cast away only to fit the return type
     409           2 :   return emacs_enc; // no table entry matched
     410             : }
     411             : 
     412             : // ---------------------------------------------------------
     413             : // Print out Unicode entity if value is greater than 0x7F.
                       //
                       // Writes 'u' to standard output.  Values below 0x80 are written
                       // as a single character with putchar().  0xA0 is written as the
                       // two characters '\' '~' and 0xAD as '\' '%'.  Every other value
                       // is written as "\[uXXXX]", where XXXX is 'u' in upper-case
                       // hexadecimal, zero-padded to at least four digits.
     414             : // ---------------------------------------------------------
     415             : inline void
     416     5286475 : unicode_entity(int u)
     417             : {
     418     5286475 :   if (u < 0x80)
     419     5284963 :     putchar(u); // written as is, without any escape
     420             :   else {
     421             :     // Handle no-break space and soft hyphen specially--they are input
     422             :     // characters only, not glyphs.  See groff_char(7).
     423        1512 :     if (u == 0xA0) {
     424           0 :       putchar('\\');
     425           0 :       putchar('~');
     426             :     }
     427        1512 :     else if (u == 0xAD) {
     428           0 :       putchar('\\');
     429           0 :       putchar('%');
     430             :     }
     431             :     else
     432        1512 :       printf("\\[u%04X]", u); // "%04X" pads to four digits; larger values print in full
     433             :   }
     434     5286475 : }
     435             : 
     436             : // ---------------------------------------------------------
     437             : // Conversion functions.  All functions take 'data', which
     438             : // normally holds the first two lines, and a file pointer.
     439             : // ---------------------------------------------------------
     440             : 
     441             : // Conversion from ISO-8859-1 (aka Latin-1) to Unicode.
     442             : void
     443           2 : conversion_latin1(FILE *fp, const string &data)
     444             : {
     445           2 :   int len = data.length();
     446             :   const unsigned char *ptr
     447           2 :     = reinterpret_cast<const unsigned char *>(data.contents());
     448          48 :   for (int i = 0; i < len; i++)
     449          46 :     unicode_entity(ptr[i]);
     450           2 :   int c = -1;
     451           2 :   while ((c = getc(fp)) != EOF)
     452           0 :     unicode_entity(c);
     453           2 : }
     454             : 
     455             : // A future version of groff shall support UTF-8 natively.
     456             : // In this case, the UTF-8 stuff here in this file will be
     457             : // moved to the troff program.
     458             : 
     459             : struct utf8 {
     460             :   FILE *fp;
     461             :   unsigned char s[6];
     462             :   enum {
     463             :     FIRST = 0,
     464             :     SECOND,
     465             :     THIRD,
     466             :     FOURTH,
     467             :     FIFTH,
     468             :     SIXTH
     469             :   } byte;
     470             :   int expected_byte_count;
     471             :   bool emit_invalid_utf8_warning;
     472             :   bool emit_incomplete_utf8_warning;
     473             :   utf8(FILE *);
     474             :   ~utf8();
     475             :   void add(unsigned char);
     476             :   void invalid();
     477             :   void incomplete();
     478             : };
     479             : 
     480         234 : utf8::utf8(FILE *f) : fp(f), byte(FIRST), expected_byte_count(1),
     481             :                       emit_invalid_utf8_warning(true),
     482         234 :                       emit_incomplete_utf8_warning(true)
     483             : {
     484             :   // empty
     485         234 : }
     486             : 
     487         468 : utf8::~utf8()
     488             : {
     489         234 :   if (byte != FIRST)
     490           0 :     incomplete();
     491         234 : }
     492             : 
     493             : inline void
     494     5287981 : utf8::add(unsigned char c)
     495             : {
     496     5287981 :   s[byte] = c;
     497     5287981 :   if (byte == FIRST) {
     498     5286401 :     if (c < 0x80)
     499     5284890 :       unicode_entity(c);
     500        1511 :     else if (c < 0xC0)
     501           0 :       invalid();
     502        1511 :     else if (c < 0xE0) {
     503        1406 :       expected_byte_count = 2;
     504        1406 :       byte = SECOND;
     505             :     }
     506         105 :     else if (c < 0xF0) {
     507          96 :       expected_byte_count = 3;
     508          96 :       byte = SECOND;
     509             :     }
     510           9 :     else if (c < 0xF8) {
     511           9 :       expected_byte_count = 4;
     512           9 :       byte = SECOND;
     513             :     }
     514           0 :     else if (c < 0xFC) {
     515           0 :       expected_byte_count = 5;
     516           0 :       byte = SECOND;
     517             :     }
     518           0 :     else if (c < 0xFE) {
     519           0 :       expected_byte_count = 6;
     520           0 :       byte = SECOND;
     521             :     }
     522             :     else
     523           0 :       invalid();
     524     5286401 :     return;
     525             :   }
     526        1580 :   if (c < 0x80 || c > 0xBF) {
     527          48 :     incomplete();
     528          48 :     add(c);
     529          48 :     return;
     530             :   }
     531        1532 :   switch (byte) {
     532           0 :   case FIRST:
     533             :     // can't happen
     534           0 :     break;
     535        1463 :   case SECOND:
     536        1463 :     if (expected_byte_count == 2) {
     537        1394 :       if (s[0] < 0xC2)
     538           0 :         invalid();
     539             :       else
     540        1394 :         unicode_entity(((s[0] & 0x1F) << 6)
     541        1394 :                        | (s[1] ^ 0x80));
     542        1394 :       byte = FIRST;
     543             :     }
     544             :     else
     545          69 :       byte = THIRD;
     546        1463 :     break;
     547          69 :   case THIRD:
     548          69 :     if (expected_byte_count == 3) {
     549          69 :       if (!(s[0] >= 0xE1 || s[1] >= 0xA0))
     550           0 :         invalid();
     551             :       else
     552          69 :         unicode_entity(((s[0] & 0x1F) << 12)
     553          69 :                        | ((s[1] ^ 0x80) << 6)
     554          69 :                        | (s[2] ^ 0x80));
     555          69 :       byte = FIRST;
     556             :     }
     557             :     else
     558           0 :       byte = FOURTH;
     559          69 :     break;
     560           0 :   case FOURTH:
     561             :     // We reject everything greater than 0x10FFFF.
     562           0 :     if (expected_byte_count == 4) {
     563           0 :       if (!((s[0] >= 0xF1 || s[1] >= 0x90)
     564           0 :             && (s[0] < 0xF4 || (s[0] == 0xF4 && s[1] < 0x90))))
     565           0 :         invalid();
     566             :       else
     567           0 :         unicode_entity(((s[0] & 0x07) << 18)
     568           0 :                        | ((s[1] ^ 0x80) << 12)
     569           0 :                        | ((s[2] ^ 0x80) << 6)
     570           0 :                        | (s[3] ^ 0x80));
     571           0 :       byte = FIRST;
     572             :     }
     573             :     else
     574           0 :       byte = FIFTH;
     575           0 :     break;
     576           0 :   case FIFTH:
     577           0 :     if (expected_byte_count == 5) {
     578           0 :       invalid();
     579           0 :       byte = FIRST;
     580             :     }
     581             :     else
     582           0 :       byte = SIXTH;
     583           0 :     break;
     584           0 :   case SIXTH:
     585           0 :     invalid();
     586           0 :     byte = FIRST;
     587           0 :     break;
     588             :   }
     589             : }
     590             : 
     591             : // We use fprintf(stderr) instead of libgroff's debug() because we need
     592             : // to output longs, and libgroff's errprint() doesn't support that.
     593             : 
     594             : void
     595           0 : utf8::invalid()
     596             : {
     597           0 :   if (is_debugging && emit_invalid_utf8_warning) {
     598           0 :     fprintf(stderr, "  invalid UTF-8 sequence(s) in input stream:"
     599             :                     " replacing each such sequence with 0xFFFD\n");
     600           0 :     emit_invalid_utf8_warning = false;
     601             :   }
     602           0 :   unicode_entity(0xFFFD);
     603           0 :   byte = FIRST;
     604           0 : }
     605             : 
     606             : void
     607          48 : utf8::incomplete()
     608             : {
     609          48 :   if (is_debugging && emit_incomplete_utf8_warning) {
     610           0 :     fprintf(stderr, "  incomplete UTF-8 sequence(s) in input stream:"
     611             :                     " replacing each such sequence with 0xFFFD\n");
     612           0 :     emit_incomplete_utf8_warning = false;
     613             :   }
     614          48 :   unicode_entity(0xFFFD);
     615          48 :   byte = FIRST;
     616          48 : }
     617             : 
     618             : // Conversion from UTF-8 to Unicode.
     619             : void
     620         234 : conversion_utf8(FILE *fp, const string &data)
     621             : {
     622         468 :   utf8 u(fp);
     623         234 :   int len = data.length();
     624             :   const unsigned char *ptr
     625         234 :     = reinterpret_cast<const unsigned char *>(data.contents());
     626        1117 :   for (int i = 0; i < len; i++)
     627         883 :     u.add(ptr[i]);
     628         234 :   int c = -1;
     629     5287284 :   while ((c = getc(fp)) != EOF)
     630     5287050 :     u.add(c);
     631         468 :   return;
     632             : }
     633             : 
     634             : // Conversion from cp1047 (EBCDIC) to UTF-8.
     635             : void
     636           0 : conversion_cp1047(FILE *fp, const string &data)
     637             : {
     638             :   static unsigned char cp1047[] = {
     639             :     0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F,     // 0x00
     640             :     0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
     641             :     0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87,     // 0x10
     642             :     0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
     643             :     0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B,     // 0x20
     644             :     0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
     645             :     0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,     // 0x30
     646             :     0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
     647             :     0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5,     // 0x40
     648             :     0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
     649             :     0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF,     // 0x50
     650             :     0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
     651             :     0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5,     // 0x60
     652             :     0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
     653             :     0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF,     // 0x70
     654             :     0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
     655             :     0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,     // 0x80
     656             :     0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
     657             :     0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,     // 0x90
     658             :     0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
     659             :     0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,     // 0xA0
     660             :     0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
     661             :     0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC,     // 0xB0
     662             :     0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
     663             :     0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,     // 0xC0
     664             :     0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
     665             :     0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,     // 0xD0
     666             :     0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
     667             :     0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,     // 0xE0
     668             :     0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
     669             :     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,     // 0xF0
     670             :     0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F,
     671             :   };
     672           0 :   int len = data.length();
     673             :   const unsigned char *ptr
     674           0 :     = reinterpret_cast<const unsigned char *>(data.contents());
     675           0 :   for (int i = 0; i < len; i++)
     676           0 :     unicode_entity(cp1047[ptr[i]]);
     677           0 :   int c = -1;
     678           0 :   while ((c = getc(fp)) != EOF)
     679           0 :     unicode_entity(cp1047[c]);
     680           0 : }
     681             : 
     682             : // Locale-sensible conversion.
     683             : #if HAVE_ICONV
     684             : void
     685           6 : conversion_iconv(FILE *fp, const string &data, char *enc)
     686             : {
     687           6 :   iconv_t handle = iconv_open(UNICODE, enc);
     688           6 :   if (handle == (iconv_t)-1) {
     689           0 :     if (EINVAL == errno) {
     690           0 :       error("character encoding '%1' not supported by iconv()", enc);
     691           0 :       return;
     692             :     }
     693           0 :     fatal("unable to convert character encoding: %1", strerror(errno));
     694             :   }
     695             :   char inbuf[BUFSIZ];
     696             :   int outbuf[BUFSIZ];
     697           6 :   char *outptr = reinterpret_cast<char *>(outbuf);
     698           6 :   size_t outbytes_left = BUFSIZ * sizeof (int);
     699             :   // Handle 'data'.
     700           6 :   char *inptr = const_cast<char *>(data.contents());
     701           6 :   size_t inbytes_left = data.length();
     702             :   char *limit;
     703          15 :   while (inbytes_left > 0) {
     704           9 :     size_t status = iconv(handle,
     705             :                           const_cast<ICONV_CONST char **>(&inptr),
     706             :                           &inbytes_left, &outptr, &outbytes_left);
     707           9 :     if (status == static_cast<size_t>(-1)) {
     708           3 :       if (EILSEQ == errno) {
     709             :         // Invalid byte sequence.  XXX
     710           3 :         inptr++;
     711           3 :         inbytes_left--;
     712             :       }
     713           0 :       else if (E2BIG == errno) {
     714             :         // Output buffer is full.
     715           0 :         limit = reinterpret_cast<char *>(outbuf)
     716           0 :           + (BUFSIZ * sizeof (int)) - outbytes_left;
     717           0 :         for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
     718           0 :           unicode_entity(*ptr);
     719           0 :         memmove(outbuf, outptr, outbytes_left);
     720           0 :         outptr = reinterpret_cast<char *>(outbuf) + outbytes_left;
     721           0 :         outbytes_left = BUFSIZ * sizeof (int) - outbytes_left;
     722             :       }
     723           0 :       else if (EINVAL == errno) {
     724             :         // 'data' ends with partial input sequence.
     725           0 :         memcpy(inbuf, inptr, inbytes_left);
     726           0 :         break;
     727             :       }
     728             :     }
     729             :   }
     730             :   // Handle 'fp' and switch to 'inbuf'.
     731             :   size_t read_bytes;
     732           6 :   char *read_start = inbuf + inbytes_left;
     733          11 :   while ((read_bytes = fread(read_start, 1, (BUFSIZ - inbytes_left),
     734             :                              fp))
     735          11 :          > 0) {
     736           5 :     inptr = inbuf;
     737           5 :     inbytes_left += read_bytes;
     738          10 :     while (inbytes_left > 0) {
     739           5 :       size_t status = iconv(handle,
     740             :                             const_cast<ICONV_CONST char **>(&inptr),
     741             :                             &inbytes_left, &outptr, &outbytes_left);
     742           5 :       if (status == (size_t)-1) {
     743           0 :         if (EILSEQ == errno) {
     744             :           // Invalid byte sequence.  XXX
     745           0 :           inptr++;
     746           0 :           inbytes_left--;
     747             :         }
     748           0 :         else if (E2BIG == errno) {
     749             :           // Output buffer is full.
     750           0 :           limit = reinterpret_cast<char *>(outbuf)
     751           0 :             + (BUFSIZ * sizeof (int)) - outbytes_left;
     752           0 :           for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
     753           0 :             unicode_entity(*ptr);
     754           0 :           memmove(outbuf, outptr, outbytes_left);
     755           0 :           outptr = reinterpret_cast<char *>(outbuf) + outbytes_left;
     756           0 :           outbytes_left = (BUFSIZ * sizeof (int)) - outbytes_left;
     757             :         }
     758           0 :         else if (EINVAL == errno) {
     759             :           // 'inbuf' ends with partial input sequence.
     760           0 :           memmove(inbuf, inptr, inbytes_left);
     761           0 :           break;
     762             :         }
     763             :       }
     764             :     }
     765           5 :     read_start = inbuf + inbytes_left;
     766             :   }
     767           6 :   iconv_close(handle);
     768             :   // XXX use ferror?
     769           6 :   limit = reinterpret_cast<char *>(outbuf) + (BUFSIZ * sizeof (int))
     770           6 :     - outbytes_left;
     771          34 :   for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
     772          28 :     unicode_entity(*ptr);
     773             : }
     774             : #endif /* HAVE_ICONV */
     775             : 
     776             : static struct bom_s {
     777             :   int len;
     778             :   const char *str;
     779             :   const char *name;
     780             : } BOM_table[] = {
     781             :   {4, "\x00\x00\xFE\xFF", "UTF-32"},
     782             :   {4, "\xFF\xFE\x00\x00", "UTF-32"},
     783             :   {3, "\xEF\xBB\xBF", "UTF-8"},
     784             :   {2, "\xFE\xFF", "UTF-16"},
     785             :   {2, "\xFF\xFE", "UTF-16"},
     786             : };
     787             : 
     788             : // ---------------------------------------------------------
     789             : // Handle Byte Order Mark.
     790             : //
     791             : // Since we have a chicken-and-egg problem it's necessary
     792             : // to handle the BOM manually if it is in the data stream.
     793             : // As documented in the Unicode book it is very unlikely
     794             : // that any normal text file (regardless of the encoding)
     795             : // starts with the bytes which represent a BOM.
     796             : //
     797             : // Return the BOM in string 'BOM'; 'data' then starts with
     798             : // the byte after the BOM.  This function reads (at most)
     799             : // four bytes from the data stream.
     800             : //
     801             : // Return encoding if a BOM is found, and a null pointer
     802             : // otherwise.
     803             : // ---------------------------------------------------------
     804             : const char *
     805         242 : get_BOM(FILE *fp, string &BOM, string &data)
     806             : {
     807             :   // The BOM is U+FEFF.  We have thus the following possible
     808             :   // representations.
     809             :   //
     810             :   //   UTF-8: 0xEFBBBF
     811             :   //   UTF-16: 0xFEFF or 0xFFFE
     812             :   //   UTF-32: 0x0000FEFF or 0xFFFE0000
     813         242 :   const int BOM_table_len = countof(BOM_table);
     814             :   char BOM_string[4];
     815         242 :   const char *retval = 0 /* nullptr */;
     816             :   int len;
     817        1160 :   for (len = 0; len < 4; len++) {
     818         931 :     int c = getc(fp);
     819         931 :     if (c == EOF)
     820          13 :       break;
     821         918 :     BOM_string[len] = char(c);
     822             :   }
     823             :   int i;
     824        1435 :   for (i = 0; i < BOM_table_len; i++) {
     825        1199 :     if (BOM_table[i].len <= len
     826        1136 :         && memcmp(BOM_string, BOM_table[i].str, BOM_table[i].len) == 0)
     827           6 :       break;
     828             :   }
     829         242 :   int j = 0;
     830         242 :   if (i < BOM_table_len) {
     831          23 :     for (; j < BOM_table[i].len; j++)
     832          17 :       BOM += BOM_string[j];
     833           6 :     retval = BOM_table[i].name;
     834             :   }
     835        1143 :   for (; j < len; j++)
     836         901 :     data += BOM_string[j];
     837         242 :   return retval;
     838             : }
     839             : 
     840             : // ---------------------------------------------------------
     841             : // Get first two lines from input stream.
     842             : //
     843             : // Return string (allocated with 'new') without zero bytes
     844             : // or a null pointer in case no coding tag can occur in the
     845             : // data (which is stored unmodified in 'data').
     846             : // ---------------------------------------------------------
     847             : char *
     848           7 : get_tag_lines(FILE *fp, string &data)
     849             : {
     850           7 :   int newline_count = 0;
     851           7 :   int c, prev = -1;
     852             :   // Handle CR, LF, and CRLF as line separators.
     853          25 :   for (int i = 0; i < data.length(); i++) {
     854          18 :     c = data[i];
     855          18 :     if (c == '\n' || c == '\r')
     856           0 :       newline_count++;
     857          18 :     if (c == '\n' && prev == '\r')
     858           0 :       newline_count--;
     859          18 :     prev = c;
     860             :   }
     861           7 :   if (newline_count > 1)
     862           0 :     return 0 /* nullptr */;
     863           7 :   bool emit_warning = true;
     864          21 :   for (int lines = newline_count; lines < 2; lines++) {
     865          63 :     while ((c = getc(fp)) != EOF) {
     866          51 :       if (c == '\0' && is_debugging && emit_warning) {
     867           0 :         warning("null byte(s) found in input stream:"
     868             :                 " search for coding tag might return false result");
     869           0 :         emit_warning = false;
     870             :       }
     871          51 :       data += char(c);
     872          51 :       if (c == '\n' || c == '\r')
     873             :         break;
     874             :     }
     875             :     // Handle CR, LF, and CRLF as line separators.
     876          14 :     if (c == '\r') {
     877           0 :       c = getc(fp);
     878           0 :       if (c != EOF && c != '\n')
     879           0 :         ungetc(c, fp);
     880             :       else
     881           0 :         data += char(c);
     882             :     }
     883             :   }
     884           7 :   return data.extract();
     885             : }
     886             : 
     887             : // ---------------------------------------------------------
     888             : // Indicate whether C string starts with a comment.
     889             : // ---------------------------------------------------------
     890             : bool
     891           7 : is_comment_line(char *s)
     892             : {
     893           7 :   if (!s || !*s)
     894           2 :     return false;
     895           5 :   if (*s == '.' || *s == '\'')
     896             :   {
     897           1 :     s++;
     898           1 :     while (*s == ' ' || *s == '\t')
     899           0 :       s++;
     900           1 :     if (*s && *s == '\\')
     901             :     {
     902           1 :       s++;
     903           1 :       if (*s == '"' || *s == '#')
     904           1 :         return true;
     905             :     }
     906             :   }
     907           4 :   else if (*s == '\\')
     908             :   {
     909           1 :     s++;
     910           1 :     if (*s == '#')
     911           0 :       return true;
     912             :   }
     913           4 :   return false;
     914             : }
     915             : 
     916             : // ---------------------------------------------------------
     917             : // Get a value/variable pair from a local variables list
     918             : // in a C string which look like this:
     919             : //
     920             : //   <variable1>: <value1>; <variable2>: <value2>; ...
     921             : //
     922             : // Leading and trailing blanks are ignored.  There might be
     923             : // more than one blank after ':' and ';'.
     924             : //
     925             : // Return position of next value/variable pair or a null
     926             : // pointer if at end of data.
     927             : // ---------------------------------------------------------
     928             : char *
     929           0 : get_variable_value_pair(char *d1, char **variable, char **value)
     930             : {
     931             :   static char var[MAX_VAR_LEN], val[MAX_VAR_LEN];
     932           0 :   *variable = var;
     933           0 :   *value = val;
     934           0 :   while (*d1 == ' ' || *d1 == '\t')
     935           0 :     d1++;
     936             :   // Get variable.
     937           0 :   int l = 0;
     938           0 :   while (l < MAX_VAR_LEN - 1 && *d1 && !strchr(";: \t", *d1))
     939           0 :     var[l++] = *(d1++);
     940           0 :   var[l] = 0;
     941             :   // Skip everything until ':', ';', or end of data.
     942           0 :   while (*d1 && *d1 != ':' && *d1 != ';')
     943           0 :     d1++;
     944           0 :   val[0] = 0;
     945           0 :   if (!*d1)
     946           0 :     return 0 /* nullptr */;
     947           0 :   if (*d1 == ';')
     948           0 :     return d1 + 1;
     949           0 :   d1++;
     950           0 :   while (*d1 == ' ' || *d1 == '\t')
     951           0 :     d1++;
     952             :   // Get value.
     953           0 :   l = 0;
     954           0 :   while (l < MAX_VAR_LEN - 1 && *d1 && !strchr("; \t", *d1))
     955           0 :     val[l++] = *(d1++);
     956           0 :   val[l] = 0;
     957             :   // Skip everything until ';' or end of data.
     958           0 :   while (*d1 && *d1 != ';')
     959           0 :     d1++;
     960           0 :   if (*d1 == ';')
     961           0 :     return d1 + 1;
     962           0 :   return 0 /* nullptr */;
     963             : }
     964             : 
     965             : // ---------------------------------------------------------
     966             : // Check coding tag in the read buffer.
     967             : //
     968             : // We search for the following line:
     969             : //
     970             : //   <comment> ... -*-<local variables list>-*-
     971             : //
     972             : // ('...' might be anything).
     973             : //
     974             : // <comment> can be one of the following syntax forms at the
     975             : // beginning of the line:
     976             : //
     977             : //   .\"   .\#   '\"   '\#   \#
     978             : //
     979             : // There can be whitespace after the leading '.' or "'".
     980             : //
     981             : // The local variables list must occur within the first
     982             : // comment block at the very beginning of the data stream.
     983             : //
     984             : // Within the <local variables list>, we search for
     985             : //
     986             : //   coding: <value>
     987             : //
     988             : // which specifies the coding system used for the data
     989             : // stream.
     990             : //
     991             : // Return <value> if found, and a null pointer otherwise.
     992             : //
     993             : // Note that null bytes in the data are skipped before
     994             : // applying the algorithm.  This should work even with files
     995             : // encoded as UTF-16 or UTF-32 (or its siblings) in most
     996             : // cases.
     997             : // ---------------------------------------------------------
     998             : char *
     999           7 : check_coding_tag(FILE *fp, string &data)
    1000             : {
    1001           7 :   char *inbuf = get_tag_lines(fp, data);
    1002             :   char *lineend;
    1003           7 :   for (char *p = inbuf; is_comment_line(p); p = lineend + 1) {
    1004           1 :     if ((lineend = strchr(p, '\n')) == 0 /* nullptr */)
    1005           1 :       break;
    1006           0 :     *lineend = 0;               // switch temporarily to '\0'
    1007           0 :     char *d1 = strstr(p, "-*-");
    1008           0 :     char *d2 = 0 /* nullptr */;
    1009           0 :     if (d1 != 0 /* nullptr */)
    1010           0 :       d2 = strstr(d1 + 3, "-*-");
    1011           0 :     *lineend = '\n';            // restore newline
    1012           0 :     if (!d1 || !d2)
    1013           0 :       continue;
    1014           0 :     *d2 = 0;                    // switch temporarily to '\0'
    1015           0 :     d1 += 3;
    1016           0 :     while (d1 != 0 /* nullptr */) {
    1017             :       char *variable, *value;
    1018           0 :       d1 = get_variable_value_pair(d1, &variable, &value);
    1019           0 :       if (!strcasecmp(variable, "coding")) {
    1020           0 :         *d2 = '-';              // restore '-'
    1021           0 :         free(inbuf);
    1022           0 :         return value;
    1023             :       }
    1024             :     }
    1025           0 :     *d2 = '-';                  // restore '-'
    1026             :   }
    1027           7 :   free(inbuf);
    1028           7 :   return 0 /* nullptr */;
    1029             : }
    1030             : 
    1031             : char *
    1032           0 : detect_file_encoding(FILE *fp)
    1033             : {
    1034             : #ifdef HAVE_UCHARDET
    1035           0 :   uchardet_t ud = 0 /* nullptr */;
    1036             :   struct stat stat_buf;
    1037             :   size_t len, read_bytes;
    1038           0 :   char *data = 0 /* nullptr */;
    1039             :   int res, current_position;
    1040             :   const char *charset;
    1041           0 :   char *ret = 0 /* nullptr */;
    1042             : 
    1043           0 :   current_position = ftell(fp);
    1044             :   /* Due to BOM and tag detection, we are not at the beginning of the
    1045             :      file. */
    1046           0 :   rewind(fp);
    1047           0 :   if (fstat(fileno(fp), &stat_buf) != 0) {
    1048           0 :     error("unable to get file status: %1", strerror(errno));
    1049           0 :     goto end;
    1050             :   }
    1051           0 :   len = stat_buf.st_size;
    1052           0 :   if (is_debugging)
    1053           0 :     fprintf(stderr, "  len: %lu\n", (unsigned long)len);
    1054           0 :   if (len == 0)
    1055           0 :     goto end;
    1056           0 :   data = static_cast<char *>(calloc(len, 1));
    1057           0 :   read_bytes = fread(data, 1, len, fp);
    1058           0 :   if (read_bytes == 0) {
    1059           0 :     error("unable to read from file: %1", strerror(errno));
    1060           0 :     goto end;
    1061             :   }
    1062             :   /* We rewind back to the original position */
    1063           0 :   if (fseek(fp, current_position, SEEK_SET) != 0) {
    1064           0 :     fatal("unable to seek within file: %1", strerror(errno));
    1065           0 :     goto end;
    1066             :   }
    1067           0 :   ud = uchardet_new();
    1068           0 :   res = uchardet_handle_data(ud, data, len);
    1069           0 :   if (res != 0) {
    1070           0 :     debug("  uchardet_handle_data: error %1\n", res);
    1071           0 :     goto end;
    1072             :   }
    1073           0 :   if (is_debugging)
    1074           0 :     fprintf(stderr, "  uchardet read: %lu bytes\n",
    1075             :             (unsigned long)read_bytes);
    1076           0 :   uchardet_data_end(ud);
    1077           0 :   charset = uchardet_get_charset(ud);
    1078           0 :   if (is_debugging) {
    1079           0 :     if (charset != 0 /* nullptr */)
    1080           0 :        fprintf(stderr, "  charset: %s\n", charset);
    1081             :     else
    1082           0 :        fprintf(stderr, "  charset is NULL\n");
    1083             :   }
    1084             :   /* uchardet 0.0.1 could return an empty string instead of a null
    1085             :    * pointer. */
    1086           0 :   if ((charset != 0 /* nullptr */) && (*charset != '\0')) {
    1087           0 :     ret = static_cast<char *>(malloc(strlen(charset) + 1));
    1088           0 :     strcpy(ret, charset);
    1089             :   }
    1090             : 
    1091           0 : end:
    1092           0 :   if (ud != 0 /* nullptr */)
    1093           0 :      uchardet_delete(ud);
    1094           0 :   if (data != 0 /* nullptr */)
    1095           0 :      free(data);
    1096             : 
    1097           0 :   return ret;
    1098             : #else /* not HAVE_UCHARDET */
    1099             :   return 0 /* nullptr */;
    1100             : #endif /* not HAVE_UCHARDET */
    1101             : }
    1102             : 
    1103             : // ---------------------------------------------------------
    1104             : // Process an input file.  If `filename` is "-", read the
    1105             : // standard input stream.
    1106             : //
    1107             : // Return Boolean indicating successful completion.
    1108             : // ---------------------------------------------------------
    1109             : bool
    1110         242 : do_file(const char *filename)
    1111             : {
    1112             :   FILE *fp;
    1113         484 :   string BOM, data;
    1114         242 :   bool is_seekable = false;
    1115         484 :   string reported_filename;
    1116             : 
    1117             :   // TODO: Consider moving some of this into a `quoted_file_name`
    1118             :   // function in libgroff.
    1119         242 :   if (strcmp(filename, "-") == 0) {
    1120          40 :     fp = stdin;
    1121          40 :     reported_filename = string("<standard input>");
    1122             :   }
    1123             :   else {
    1124         202 :     fp = fopen(filename, FOPEN_RB);
    1125         202 :     reported_filename = "'" + string(filename) + "'";
    1126             :   }
    1127         242 :   char *c_reported_filename = reported_filename.extract();
    1128         242 :   if (!fp) {
    1129           0 :     error("cannot open %1: %2", c_reported_filename, strerror(errno));
    1130           0 :     free(c_reported_filename);
    1131           0 :     return false;
    1132             :   }
    1133         242 :   if (is_debugging) {
    1134          11 :     fprintf(stderr, "processing %s\n", c_reported_filename);
    1135          11 :     fflush(stderr);
    1136             :   }
    1137         242 :   free(c_reported_filename);
    1138         242 :   if (fseek(fp, 0L, SEEK_SET) == 0)
    1139         201 :     is_seekable = true;
    1140             :   else {
    1141             :     SET_BINARY(fileno(fp));
    1142          41 :     if (is_debugging)
    1143          11 :       fprintf(stderr, "  stream is not seekable: %s\n",
    1144          11 :               strerror(errno));
    1145             :   }
    1146         242 :   const char *BOM_encoding = get_BOM(fp, BOM, data);
    1147             :   // Determine the encoding.
    1148             :   char *encoding;
    1149         242 :   bool must_free_encoding = false;
    1150         242 :   if (user_encoding[0]) {
    1151         230 :     if (is_debugging) {
    1152           1 :       fprintf(stderr, "  user-specified encoding '%s', "
    1153             :                       "no search for coding tag\n",
    1154             :                       user_encoding);
    1155           1 :       if (BOM_encoding && strcmp(BOM_encoding, user_encoding))
    1156           1 :         fprintf(stderr, "  but BOM in data stream implies encoding"
    1157             :                 " '%s'!\n", BOM_encoding);
    1158             :     }
    1159         230 :     encoding = static_cast<char *>(user_encoding);
    1160             :   }
    1161          12 :   else if (BOM_encoding != 0 /* nullptr */) {
    1162           5 :     if (is_debugging)
    1163           5 :       fprintf(stderr, "  found BOM, no search for coding tag\n");
    1164           5 :     encoding = const_cast<char *>(BOM_encoding);
    1165             :   }
    1166             :   else {
    1167             :     // 'check_coding_tag' returns a pointer to a static array (or a null
    1168             :     // pointer).
    1169           7 :     char *file_encoding = check_coding_tag(fp, data);
    1170           7 :     if (!file_encoding) {
    1171           7 :       if (is_debugging)
    1172           5 :         fprintf(stderr, "  no coding tag\n");
    1173           7 :       if (is_seekable)
    1174           0 :          file_encoding = detect_file_encoding(fp);
    1175           7 :       if (!file_encoding) {
    1176           7 :         if (is_debugging)
    1177           5 :           fprintf(stderr,
    1178             :                   "  could not detect encoding with uchardet\n");
    1179           7 :         file_encoding = fallback_encoding;
    1180             :       }
    1181             :       else
    1182           0 :         must_free_encoding = true;
    1183             :     }
    1184             :     else
    1185           0 :       if (is_debugging)
    1186           0 :         fprintf(stderr, "  coding tag: '%s'\n", file_encoding);
    1187           7 :     encoding = file_encoding;
    1188             :   }
    1189         242 :   strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
    1190         242 :   encoding_string[MAX_VAR_LEN - 1] = '\0';
    1191         242 :   if (must_free_encoding)
    1192           0 :     free(encoding);
    1193         242 :   encoding = encoding_string;
    1194             :   // Translate from MIME/Emacs encoding names to locale encoding names.
    1195         242 :   encoding = emacs2mime(encoding_string);
    1196         242 :   if (encoding[0] == '\0') {
    1197           0 :     error("non-portable encoding '%1' not supported", encoding_string);
    1198           0 :     return false;
    1199             :   }
    1200         242 :   if (is_debugging)
    1201          11 :     fprintf(stderr, "  encoding used: '%s'\n", encoding);
    1202         242 :   if (!want_raw_output) {
    1203         484 :     string fn(filename);
    1204         242 :     fn += '\0';
    1205         242 :     normalize_file_name_for_lf_request(fn);
    1206         242 :     (void) printf(".lf 1 %s%s\n", ('"' == filename[0]) ? "" : "\"",
    1207             :                   fn.contents());
    1208             :   }
    1209         242 :   bool was_successful = true;
    1210             :   // Call converter (converters write to stdout).
    1211         242 :   if (!strcasecmp(encoding, "ISO-8859-1"))
    1212           2 :     conversion_latin1(fp, BOM + data);
    1213         240 :   else if (!strcasecmp(encoding, "UTF-8"))
    1214         234 :     conversion_utf8(fp, data);
    1215           6 :   else if (!strcasecmp(encoding, "cp1047"))
    1216           0 :     conversion_cp1047(fp, BOM + data);
    1217             :   else {
    1218             : #if HAVE_ICONV
    1219           6 :     conversion_iconv(fp, BOM + data, encoding);
    1220             : #else
    1221             :     error("encoding system '%1' not supported", encoding);
    1222             :     was_successful = false;
    1223             : #endif /* HAVE_ICONV */
    1224             :   }
    1225         242 :   if (fp != stdin)
    1226         202 :     fclose(fp);
    1227         242 :   return was_successful;
    1228             : }
    1229             : 
    1230             : // ---------------------------------------------------------
    1231             : // Print usage.
    1232             : // ---------------------------------------------------------
    1233             : void
    1234           0 : usage(FILE *stream)
    1235             : {
    1236           0 :   fprintf(stream,
    1237             : "usage: %s [-dr] [-D fallback-encoding] [-e encoding] [file ...]\n"
    1238             : "usage: %s {-v | --version}\n"
    1239             : "usage: %s {-h | --help}\n",
    1240             :           program_name, program_name, program_name);
    1241           0 :   if (stdout == stream)
    1242           0 :     fprintf(stream,
    1243             : "\n"
    1244             : "Read each file, convert its encoded characters to a form GNU"
    1245             : " troff(1)\n"
    1246             : "can interpret, and send the result to the standard output stream.\n"
    1247             : "The default fallback encoding is '%s'.  See the preconv(1) manual"
    1248             : " page.\n",
    1249             :           fallback_encoding);
    1250           0 : }
    1251             : 
    1252             : // ---------------------------------------------------------
    1253             : // Main routine.
    1254             : // ---------------------------------------------------------
    1255             : int
    1256          45 : main(int argc, char **argv)
    1257             : {
    1258          45 :   program_name = argv[0];
    1259             :   // Determine the fallback encoding.  This must be done before
    1260             :   // getopt() is called since the usage message shows the fallback
    1261             :   // encoding.
    1262          45 :   setlocale(LC_ALL, "");
    1263          45 :   char *locale = getlocale(LC_CTYPE);
    1264          45 :   if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX"))
    1265          28 :     strcpy(fallback_encoding, "latin1");
    1266             :   else {
    1267          17 :     strncpy(fallback_encoding, locale_charset(), MAX_VAR_LEN - 1);
    1268          17 :     fallback_encoding[MAX_VAR_LEN - 1] = 0;
    1269             :   }
    1270             : 
    1271          45 :   program_name = argv[0];
    1272             :   int opt;
    1273             :   static const struct option long_options[] = {
    1274             :     { "help", no_argument, 0 /* nullptr */, 'h' },
    1275             :     { "version", no_argument, 0 /* nullptr */, 'v' },
    1276             :     { 0 /* nullptr */, 0, 0 /* nullptr */, 0 }
    1277             :   };
    1278             :   // Parse the command-line options.
    1279          89 :   while ((opt = getopt_long(argc, argv, ":dD:e:hrv", long_options,
    1280             :                             0 /* nullptr */))
    1281          89 :          != EOF)
    1282          45 :     switch (opt) {
    1283           1 :     case 'v':
    1284           1 :       printf("GNU preconv (groff) version %s %s iconv support and %s"
    1285             :              " uchardet support\n",
    1286             :              Version_string,
    1287             : #ifdef HAVE_ICONV
    1288             :              "with",
    1289             : #else
    1290             :              "without",
    1291             : #endif /* HAVE_ICONV */
    1292             : #ifdef HAVE_UCHARDET
    1293             :              "with"
    1294             : #else
    1295             :              "without"
    1296             : #endif /* HAVE_UCHARDET */
    1297             :             );
    1298           1 :       exit(EXIT_SUCCESS);
    1299             :       break;
    1300          11 :     case 'd':
    1301          11 :       is_debugging = true;
    1302          11 :       break;
    1303          32 :     case 'e':
    1304          32 :       if (optarg != 0 /* nullptr */) {
    1305          32 :         strncpy(user_encoding, optarg, MAX_VAR_LEN - 1);
    1306          32 :         user_encoding[MAX_VAR_LEN - 1] = 0;
    1307             :       }
    1308             :       else
    1309           0 :         user_encoding[0] = 0;
    1310          32 :       break;
    1311           1 :     case 'D':
    1312           1 :       if (optarg != 0 /* nullptr */) {
    1313           1 :         strncpy(fallback_encoding, optarg, MAX_VAR_LEN - 1);
    1314           1 :         fallback_encoding[MAX_VAR_LEN - 1] = 0;
    1315             :       }
    1316           1 :       break;
    1317           0 :     case 'r':
    1318           0 :       want_raw_output = true;
    1319           0 :       break;
    1320           0 :     case 'h':
    1321           0 :       usage(stdout);
    1322           0 :       exit(EXIT_SUCCESS);
    1323             :       break;
    1324           0 :     case '?':
    1325           0 :       if (optopt != 0)
    1326           0 :         error("unrecognized command-line option '%1'", char(optopt));
    1327             :       else
    1328           0 :         error("unrecognized command-line option '%1'",
    1329           0 :               argv[(optind - 1)]);
    1330           0 :       usage(stderr);
    1331           0 :       exit(2);
    1332             :       break;
    1333           0 :     case ':':
    1334           0 :       error("command-line option '%1' requires an argument",
    1335           0 :            char(optopt));
    1336           0 :       usage(stderr);
    1337           0 :       exit(2);
    1338             :       break;
    1339           0 :     default:
    1340           0 :       assert(0 == "unhandled getopt_long return value");
    1341             :     }
    1342          44 :   int nbad = 0;
    1343          44 :   if (is_debugging)
    1344          11 :     fprintf(stderr, "fallback encoding: '%s'\n", fallback_encoding);
    1345          44 :   if (optind >= argc)
    1346          20 :     nbad += !do_file("-");
    1347             :   else
    1348         246 :     for (int i = optind; i < argc; i++)
    1349         222 :       nbad += !do_file(argv[i]);
    1350          44 :   if (ferror(stdout))
    1351           0 :     fatal("error status on standard output stream");
    1352          44 :   if (fflush(stdout) < 0)
    1353           0 :     fatal("cannot flush standard output stream: %1", strerror(errno));
    1354          44 :   return (nbad != 0);
    1355             : }
    1356             : 
    1357             : // Local Variables:
    1358             : // fill-column: 72
    1359             : // mode: C++
    1360             : // End:
    1361             : // vim: set cindent noexpandtab shiftwidth=2 textwidth=72:

Generated by: LCOV version 1.14