LCOV - code coverage report
Current view: top level - libs/libgroff - unicode.cpp (source / functions) Hit Total Coverage
Test: GNU roff Lines: 42 70 60.0 %
Date: 2026-01-16 17:51:41 Functions: 2 2 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* Copyright 2002-2024 Free Software Foundation, Inc.
       2             :      Written by Werner Lemberg <wl@gnu.org>
       3             : 
       4             : This file is part of groff, the GNU roff typesetting system.
       5             : 
       6             : groff is free software; you can redistribute it and/or modify it under
       7             : the terms of the GNU General Public License as published by the Free
       8             : Software Foundation, either version 3 of the License, or
       9             : (at your option) any later version.
      10             : 
      11             : groff is distributed in the hope that it will be useful, but WITHOUT ANY
      12             : WARRANTY; without even the implied warranty of MERCHANTABILITY or
      13             : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      14             : for more details.
      15             : 
      16             : You should have received a copy of the GNU General Public License
      17             : along with this program.  If not, see <http://www.gnu.org/licenses/>. */
      18             : 
      19             : #ifdef HAVE_CONFIG_H
      20             : #include <config.h>
      21             : #endif
      22             : 
      23             : #include "lib.h"
      24             : 
      25             : #include "cset.h"
      26             : #include "stringclass.h"
      27             : #include "unicode.h"
      28             : 
      29    27673409 : const char *valid_unicode_code_sequence(const char *u, char *errbuf)
      30             : {
      31    27673409 :   if (errbuf != 0 /* nullptr */)
      32       31215 :     (void) memset(errbuf, '\0', ERRBUFSZ);
      33    27673409 :   if (*u != 'u') {
      34    27521345 :     if (errbuf != 0 /* nullptr */)
      35           0 :       snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
      36             :                " lacks 'u' as first character");
      37    27521345 :     return 0 /* nullptr */;
      38             :   }
      39      152064 :   const char *p = ++u;
      40             :   for (;;) {
      41      171953 :     int val = 0;
      42      171953 :     const char *start = p;
      43             :     for (;;) {
      44             :       // only uppercase hex digits allowed
      45      687226 :       if (!csxdigit(*p)) {
      46         186 :         if (errbuf != 0 /* nullptr */)
      47           0 :           snprintf(errbuf, ERRBUFSZ, "Unicode special character"
      48           0 :                    " sequence has non-hexadecimal digit '%c'", *p);
      49         186 :         return 0 /* nullptr */;
      50             :       }
      51      687040 :       if (csdigit(*p))
      52      633588 :         val = val*0x10 + (*p-'0');
      53       53452 :       else if (csupper(*p))
      54       53301 :         val = val*0x10 + (*p-'A'+10);
      55         151 :       else if ((*p >= 'a') && (*p <= 'f')) {
      56         151 :         if (errbuf != 0 /* nullptr */)
      57           0 :           snprintf(errbuf, ERRBUFSZ, "Unicode special character"
      58             :                 " sequence must use uppercase hexadecimal digit, not"
      59           0 :                 " '%c'", *p);
      60         151 :         return 0 /* nullptr */;
      61             :       }
      62             :       else {
      63           0 :         assert(0 == "unhandled hexadecimal digit character");
      64             :         return 0 /* nullptr */;
      65             :       }
      66             :       // biggest Unicode value is U+10FFFF
      67      686889 :       if (val > 0x10FFFF) {
      68           0 :         if (errbuf != 0 /* nullptr */)
      69           0 :           snprintf(errbuf, ERRBUFSZ, "Unicode special character code"
      70             :                    " point %04X is out of range (0000..10FFFF)", val);
      71           0 :         return 0 /* nullptr */;
      72             :       }
      73      686889 :       p++;
      74      686889 :       if (*p == '\0' || *p == '_')
      75             :         break;
      76             :     }
      77             :     // surrogates not allowed
      78      171616 :     if ((val >= 0xD800 && val <= 0xDBFF)
      79      171616 :         || (val >= 0xDC00 && val <= 0xDFFF)) {
      80           0 :       if (errbuf != 0 /* nullptr */)
      81           0 :         snprintf(errbuf, ERRBUFSZ, "Unicode special character code"
      82             :                  " point %04X is a surrogate", val);
      83           0 :       return 0 /* nullptr */;
      84             :     }
      85      171616 :     const ptrdiff_t width = p - start;
      86      171616 :     if (width < 4) {
      87          45 :       if (errbuf != 0 /* nullptr */)
      88           1 :         snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
      89             :                  " must be 4..6 digits");
      90          45 :       return 0 /* nullptr */;
      91             :     }
      92      171571 :     else if ((width > 4) && ('0' == *u)) {
      93           0 :       if (errbuf != 0 /* nullptr */)
      94           0 :         snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
      95             :                  " %s has invalid leading zero(es)", u);
      96           0 :       return 0 /* nullptr */;
      97             :     }
      98      171571 :     if (*p == '\0')
      99      151682 :       break;
     100       19889 :     p++;
     101       19889 :   }
     102      151682 :   return u;
     103             : }
     104             : 
     105             : // TODO: Does gnulib have a function that does this?
     106           5 : char *to_utf8_string(unsigned int ch)
     107             : {
     108             :   static char buf[16];
     109             : 
     110           5 :   if (ch < 0x80)
     111           0 :     sprintf(buf, "%c", (ch & 0xff));
     112           5 :   else if (ch < 0x800)
     113           5 :     sprintf(buf, "%c%c",
     114           5 :       0xc0 + ((ch >>  6) & 0x1f),
     115           5 :       0x80 + ((ch      ) & 0x3f));
     116           0 :   else if ((ch < 0xD800) || ((ch > 0xDFFF) && (ch < 0x10000)))
     117           0 :     sprintf(buf, "%c%c%c",
     118           0 :       0xe0 + ((ch >> 12) & 0x0f),
     119           0 :       0x80 + ((ch >>  6) & 0x3f),
     120           0 :       0x80 + ((ch      ) & 0x3f));
     121           0 :   else if ((ch > 0xFFFF) && (ch < 0x120000))
     122           0 :     sprintf(buf, "%c%c%c%c",
     123           0 :       0xf0 + ((ch >> 18) & 0x07),
     124           0 :       0x80 + ((ch >> 12) & 0x3f),
     125           0 :       0x80 + ((ch >>  6) & 0x3f),
     126           0 :       0x80 + ((ch      ) & 0x3f));
     127             :   else
     128           0 :     sprintf(buf, "&#x%X;", ch);
     129           5 :   return buf;
     130             : }
     131             : 
     132             : // Local Variables:
     133             : // fill-column: 72
     134             : // mode: C++
     135             : // End:
     136             : // vim: set cindent noexpandtab shiftwidth=2 textwidth=72:

Generated by: LCOV version 1.14