Line data Source code
1 : /* Copyright 2002-2024 Free Software Foundation, Inc.
2 : Written by Werner Lemberg <wl@gnu.org>
3 :
4 : This file is part of groff, the GNU roff typesetting system.
5 :
6 : groff is free software; you can redistribute it and/or modify it under
7 : the terms of the GNU General Public License as published by the Free
8 : Software Foundation, either version 3 of the License, or
9 : (at your option) any later version.
10 :
11 : groff is distributed in the hope that it will be useful, but WITHOUT ANY
12 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 :
19 : #ifdef HAVE_CONFIG_H
20 : #include <config.h>
21 : #endif
22 :
23 : #include "lib.h"
24 :
25 : #include "cset.h"
26 : #include "stringclass.h"
27 : #include "unicode.h"
28 :
29 27673409 : const char *valid_unicode_code_sequence(const char *u, char *errbuf)
30 : {
31 27673409 : if (errbuf != 0 /* nullptr */)
32 31215 : (void) memset(errbuf, '\0', ERRBUFSZ);
33 27673409 : if (*u != 'u') {
34 27521345 : if (errbuf != 0 /* nullptr */)
35 0 : snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
36 : " lacks 'u' as first character");
37 27521345 : return 0 /* nullptr */;
38 : }
39 152064 : const char *p = ++u;
40 : for (;;) {
41 171953 : int val = 0;
42 171953 : const char *start = p;
43 : for (;;) {
44 : // only uppercase hex digits allowed
45 687226 : if (!csxdigit(*p)) {
46 186 : if (errbuf != 0 /* nullptr */)
47 0 : snprintf(errbuf, ERRBUFSZ, "Unicode special character"
48 0 : " sequence has non-hexadecimal digit '%c'", *p);
49 186 : return 0 /* nullptr */;
50 : }
51 687040 : if (csdigit(*p))
52 633588 : val = val*0x10 + (*p-'0');
53 53452 : else if (csupper(*p))
54 53301 : val = val*0x10 + (*p-'A'+10);
55 151 : else if ((*p >= 'a') && (*p <= 'f')) {
56 151 : if (errbuf != 0 /* nullptr */)
57 0 : snprintf(errbuf, ERRBUFSZ, "Unicode special character"
58 : " sequence must use uppercase hexadecimal digit, not"
59 0 : " '%c'", *p);
60 151 : return 0 /* nullptr */;
61 : }
62 : else {
63 0 : assert(0 == "unhandled hexadecimal digit character");
64 : return 0 /* nullptr */;
65 : }
66 : // biggest Unicode value is U+10FFFF
67 686889 : if (val > 0x10FFFF) {
68 0 : if (errbuf != 0 /* nullptr */)
69 0 : snprintf(errbuf, ERRBUFSZ, "Unicode special character code"
70 : " point %04X is out of range (0000..10FFFF)", val);
71 0 : return 0 /* nullptr */;
72 : }
73 686889 : p++;
74 686889 : if (*p == '\0' || *p == '_')
75 : break;
76 : }
77 : // surrogates not allowed
78 171616 : if ((val >= 0xD800 && val <= 0xDBFF)
79 171616 : || (val >= 0xDC00 && val <= 0xDFFF)) {
80 0 : if (errbuf != 0 /* nullptr */)
81 0 : snprintf(errbuf, ERRBUFSZ, "Unicode special character code"
82 : " point %04X is a surrogate", val);
83 0 : return 0 /* nullptr */;
84 : }
85 171616 : const ptrdiff_t width = p - start;
86 171616 : if (width < 4) {
87 45 : if (errbuf != 0 /* nullptr */)
88 1 : snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
89 : " must be 4..6 digits");
90 45 : return 0 /* nullptr */;
91 : }
92 171571 : else if ((width > 4) && ('0' == *u)) {
93 0 : if (errbuf != 0 /* nullptr */)
94 0 : snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
95 : " %s has invalid leading zero(es)", u);
96 0 : return 0 /* nullptr */;
97 : }
98 171571 : if (*p == '\0')
99 151682 : break;
100 19889 : p++;
101 19889 : }
102 151682 : return u;
103 : }
104 :
105 : // TODO: Does gnulib have a function that does this?
// Encode the code point `ch` as a NUL-terminated UTF-8 byte sequence.
//
// Values that cannot be encoded, namely surrogates (U+D800..U+DFFF) and
// anything above U+10FFFF, are instead rendered as a hexadecimal
// numeric character reference of the form "&#xHHHH;".
//
// Returns a pointer to a static buffer that is overwritten by the next
// call; the caller must copy the result if it needs to keep it.
char *to_utf8_string(unsigned int ch)
{
  static char buf[16];
  char *q = buf;

  if (ch < 0x80)
    *q++ = (char) ch;
  else if (ch < 0x800) {
    *q++ = (char) (0xc0 + ((ch >>  6) & 0x1f));
    *q++ = (char) (0x80 + ((ch      ) & 0x3f));
  }
  else if ((ch < 0xD800) || ((ch > 0xDFFF) && (ch < 0x10000))) {
    *q++ = (char) (0xe0 + ((ch >> 12) & 0x0f));
    *q++ = (char) (0x80 + ((ch >>  6) & 0x3f));
    *q++ = (char) (0x80 + ((ch      ) & 0x3f));
  }
  // The largest Unicode code point is U+10FFFF; anything beyond it has
  // no valid UTF-8 representation and takes the fallback below.
  else if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) {
    *q++ = (char) (0xf0 + ((ch >> 18) & 0x07));
    *q++ = (char) (0x80 + ((ch >> 12) & 0x3f));
    *q++ = (char) (0x80 + ((ch >>  6) & 0x3f));
    *q++ = (char) (0x80 + ((ch      ) & 0x3f));
  }
  else {
    // Worst case "&#xFFFFFFFF;" needs 13 bytes including the NUL, which
    // fits in `buf`; snprintf bounds the write regardless.
    snprintf(buf, sizeof buf, "&#x%X;", ch);
    return buf;
  }
  *q = '\0';
  return buf;
}
131 :
132 : // Local Variables:
133 : // fill-column: 72
134 : // mode: C++
135 : // End:
136 : // vim: set cindent noexpandtab shiftwidth=2 textwidth=72:
|