1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007, 2009-2021 Free Software Foundation, Inc.
3
4 This file is free software.
5 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
6 You can redistribute it and/or modify it under either
7 - the terms of the GNU Lesser General Public License as published
8 by the Free Software Foundation; either version 3, or (at your
9 option) any later version, or
10 - the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option)
12 any later version, or
13 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
14
15 This file is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Lesser General Public License and the GNU General Public License
19 for more details.
20
21 You should have received a copy of the GNU Lesser General Public
22 License and of the GNU General Public License along with this
23 program. If not, see <https://www.gnu.org/licenses/>. */
24
25 #include <config.h>
26
27 /* Specification. */
28 #include "uniname.h"
29
30 #include <assert.h>
31 #include <stdbool.h>
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <string.h>
35
36 #include "attribute.h"
37
/* Number of elements of the array A.  A must be a true array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
39
40
41 /* Table of Unicode character names, derived from UnicodeData.txt.
42 This table is generated in a way to minimize the memory footprint:
43 1. its compiled size is small (less than 350 KB),
44 2. it resides entirely in the text or read-only data segment of the
45 executable or shared library: the table contains only immediate
46 integers, no pointers, and the functions don't do heap allocation.
47 */
48 #include "uninames.h"
49 /* It contains:
50 static const char unicode_name_words[36303] = ...;
51 #define UNICODE_CHARNAME_NUM_WORDS 6260
52 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
53 #define UNICODE_CHARNAME_WORD_HANGUL 3902
54 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
55 #define UNICODE_CHARNAME_WORD_CJK 417
56 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
57 static const uint16_t unicode_names[68940] = ...;
58 static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
59 static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
60 #define UNICODE_CHARNAME_MAX_LENGTH 83
61 #define UNICODE_CHARNAME_MAX_WORDS 13
62 static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
63 */
64
65 /* Returns the word with a given index. */
66 static const char *
67 unicode_name_word (unsigned int index, unsigned int *lengthp)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
68 {
69 unsigned int i1;
70 unsigned int i2;
71
72 assert (index < UNICODE_CHARNAME_NUM_WORDS);
73
74 /* Binary search for i with
75 unicode_name_by_length[i].ind_offset <= index
76 and
77 index < unicode_name_by_length[i+1].ind_offset
78 */
79
80 i1 = 0;
81 i2 = SIZEOF (unicode_name_by_length) - 1;
82 while (i2 - i1 > 1)
83 {
84 unsigned int i = (i1 + i2) >> 1;
85 if (unicode_name_by_length[i].ind_offset <= index)
86 i1 = i;
87 else
88 i2 = i;
89 }
90 unsigned int i = i1;
91 assert (unicode_name_by_length[i].ind_offset <= index
92 && index < unicode_name_by_length[i+1].ind_offset);
93 *lengthp = i;
94 return &unicode_name_words[unicode_name_by_length[i].extra_offset
95 + (index-unicode_name_by_length[i].ind_offset)*i];
96 }
97
98 /* Looks up the index of a word. */
99 static int
100 unicode_name_word_lookup (const char *word, size_t length)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
101 {
102 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
103 {
104 /* Binary search among the words of given length. */
105 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
106 unsigned int i0 = unicode_name_by_length[length].ind_offset;
107 unsigned int i1 = i0;
108 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
109 while (i2 - i1 > 0)
110 {
111 unsigned int i = (i1 + i2) >> 1;
112 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
113 const char *w = word;
114 unsigned int n = length;
115 for (;;)
116 {
117 if (*p < *w)
118 {
119 if (i1 == i)
120 return -1;
121 /* Note here: i1 < i < i2. */
122 i1 = i;
123 break;
124 }
125 if (*p > *w)
126 {
127 /* Note here: i1 <= i < i2. */
128 i2 = i;
129 break;
130 }
131 p++; w++; n--;
132 if (n == 0)
133 return i;
134 }
135 }
136 }
137 return -1;
138 }
139
140 #define UNINAME_INVALID_INDEX UINT16_MAX
141
142 /* Looks up the internal index of a Unicode character. */
143 static uint16_t
144 unicode_code_to_index (ucs4_t c)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
145 {
146 /* Binary search in unicode_ranges. */
147 unsigned int i1 = 0;
148 unsigned int i2 = SIZEOF (unicode_ranges);
149
150 for (;;)
151 {
152 unsigned int i = (i1 + i2) >> 1;
153 ucs4_t start_code =
154 unicode_ranges[i].index + unicode_ranges[i].gap;
155 ucs4_t end_code =
156 start_code + unicode_ranges[i].length - 1;
157
158 if (start_code <= c && c <= end_code)
159 return c - unicode_ranges[i].gap;
160
161 if (end_code < c)
162 {
163 if (i1 == i)
164 break;
165 /* Note here: i1 < i < i2. */
166 i1 = i;
167 }
168 else if (c < start_code)
169 {
170 if (i2 == i)
171 break;
172 /* Note here: i1 <= i < i2. */
173 i2 = i;
174 }
175 }
176 return UNINAME_INVALID_INDEX;
177 }
178
179 /* Looks up the codepoint of a Unicode character, from the given
180 internal index. */
181 static ucs4_t
182 unicode_index_to_code (uint16_t index)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
183 {
184 /* Binary search in unicode_ranges. */
185 unsigned int i1 = 0;
186 unsigned int i2 = SIZEOF (unicode_ranges);
187
188 for (;;)
189 {
190 unsigned int i = (i1 + i2) >> 1;
191 uint16_t start_index = unicode_ranges[i].index;
192 uint16_t end_index = start_index + unicode_ranges[i].length - 1;
193
194 if (start_index <= index && index <= end_index)
195 return index + unicode_ranges[i].gap;
196
197 if (end_index < index)
198 {
199 if (i1 == i)
200 break;
201 /* Note here: i1 < i < i2. */
202 i1 = i;
203 }
204 else if (index < start_index)
205 {
206 if (i2 == i)
207 break;
208 /* Note here: i1 <= i < i2. */
209 i2 = i;
210 }
211 }
212 return UNINAME_INVALID;
213 }
214
215
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.
   Each table holds the short names of one kind of jamo.  The position in
   the table is the jamo index obtained when decomposing a syllable:
     syllable = 0xAC00 + (initial * 21 + medial) * 28 + final.  */

/* Short names of the 19 initial consonants.  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};

/* Short names of the 21 medial vowels.  */
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};

/* Short names of the 28 final consonants; entry 0 means "no final".
   Entry 5 (U+11AC) has the short name "NJ" in the Unicode jamo short name
   list, not "NI".  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};
233
234 /* Looks up the name of a Unicode character, in uppercase ASCII.
235 Returns the filled buf, or NULL if the character does not have a name. */
236 char *
237 unicode_character_name (ucs4_t c, char *buf)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
238 {
239 if (c >= 0xAC00 && c <= 0xD7A3)
240 {
241 /* Special case for Hangul syllables. Keeps the tables small. */
242 char *ptr;
243 unsigned int tmp;
244 unsigned int index1;
245 unsigned int index2;
246 unsigned int index3;
247 const char *q;
248
249 /* buf needs to have at least 16 + 7 bytes here. */
250 memcpy (buf, "HANGUL SYLLABLE ", 16);
251 ptr = buf + 16;
252
253 tmp = c - 0xAC00;
254 index3 = tmp % 28; tmp = tmp / 28;
255 index2 = tmp % 21; tmp = tmp / 21;
256 index1 = tmp;
257
258 q = jamo_initial_short_name[index1];
259 while (*q != '\0')
260 *ptr++ = *q++;
261 q = jamo_medial_short_name[index2];
262 while (*q != '\0')
263 *ptr++ = *q++;
264 q = jamo_final_short_name[index3];
265 while (*q != '\0')
266 *ptr++ = *q++;
267 *ptr = '\0';
268 return buf;
269 }
270 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
271 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
272 {
273 /* Special case for CJK compatibility ideographs. Keeps the tables
274 small. */
275 char *ptr;
276 int i;
277
278 /* buf needs to have at least 28 + 5 bytes here. */
279 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
280 ptr = buf + 28;
281
282 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
283 {
284 unsigned int x = (c >> i) & 0xf;
285 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
286 }
287 *ptr = '\0';
288 return buf;
289 }
290 else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
291 {
292 /* Special case for variation selectors. Keeps the tables
293 small. */
294
295 /* buf needs to have at least 19 + 3 bytes here. */
296 sprintf (buf, "VARIATION SELECTOR-%d",
297 c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
298 return buf;
299 }
300 else
301 {
302 uint16_t index = unicode_code_to_index (c);
303 const uint16_t *words = NULL;
304
305 if (index != UNINAME_INVALID_INDEX)
306 {
307 /* Binary search in unicode_code_to_name. */
308 unsigned int i1 = 0;
309 unsigned int i2 = SIZEOF (unicode_index_to_name);
310 for (;;)
311 {
312 unsigned int i = (i1 + i2) >> 1;
313 if (unicode_index_to_name[i].index == index)
314 {
315 words = &unicode_names[unicode_index_to_name[i].name];
316 break;
317 }
318 else if (unicode_index_to_name[i].index < index)
319 {
320 if (i1 == i)
321 {
322 words = NULL;
323 break;
324 }
325 /* Note here: i1 < i < i2. */
326 i1 = i;
327 }
328 else if (unicode_index_to_name[i].index > index)
329 {
330 if (i2 == i)
331 {
332 words = NULL;
333 break;
334 }
335 /* Note here: i1 <= i < i2. */
336 i2 = i;
337 }
338 }
339 }
340 if (words != NULL)
341 {
342 /* Found it in unicode_index_to_name. Now concatenate the words. */
343 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
344 char *ptr = buf;
345 for (;;)
346 {
347 unsigned int wordlen;
348 const char *word = unicode_name_word (*words>>1, &wordlen);
349 do
350 *ptr++ = *word++;
351 while (--wordlen > 0);
352 if ((*words & 1) == 0)
353 break;
354 *ptr++ = ' ';
355 words++;
356 }
357 *ptr = '\0';
358 return buf;
359 }
360 return NULL;
361 }
362 }
363
364 /* Looks up the Unicode character with a given name, in upper- or lowercase
365 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
366 ucs4_t
367 unicode_name_character (const char *name)
/* ![[previous]](../icons/left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
368 {
369 size_t len = strlen (name);
370 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
371 {
372 /* Test for "word1 word2 ..." syntax. */
373 char buf[UNICODE_CHARNAME_MAX_LENGTH];
374 char *ptr = buf;
375 for (;;)
376 {
377 char c = *name++;
378 if (!(c >= ' ' && c <= '~'))
379 break;
380 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
381 if (--len == 0)
382 goto filled_buf;
383 }
384 if (false)
385 filled_buf:
386 {
387 {
388 /* Special case for variation selector aliases. Keeps the
389 tables small. */
390 const char *p1 = buf;
391 if (ptr >= buf + 3 && *p1++ == 'V')
392 {
393 if (*p1++ == 'S')
394 {
395 if (*p1 != '0')
396 {
397 unsigned int c = 0;
398 for (;;)
399 {
400 if (*p1 >= '0' && *p1 <= '9')
401 c += (*p1 - '0');
402 p1++;
403 if (p1 == ptr)
404 {
405 if (c >= 1 && c <= 16)
406 return c - 1 + 0xFE00;
407 else if (c >= 17 && c <= 256)
408 return c - 17 + 0xE0100;
409 else
410 break;
411 }
412 c = c * 10;
413 }
414 }
415 }
416 }
417 }
418 {
419 /* Convert the constituents to uint16_t words. */
420 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
421 uint16_t *wordptr = words;
422 {
423 const char *p1 = buf;
424 for (;;)
425 {
426 {
427 int word;
428 const char *p2 = p1;
429 while (p2 < ptr && *p2 != ' ')
430 p2++;
431 word = unicode_name_word_lookup (p1, p2 - p1);
432 if (word < 0)
433 break;
434 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
435 break;
436 *wordptr++ = word;
437 if (p2 == ptr)
438 goto filled_words;
439 p1 = p2 + 1;
440 }
441 /* Special case for Hangul syllables. Keeps the tables small. */
442 if (wordptr == &words[2]
443 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
444 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
445 {
446 /* Split the last word [p1..ptr) into three parts:
447 1) [BCDGHJKMNPRST]
448 2) [AEIOUWY]
449 3) [BCDGHIJKLMNPST]
450 */
451 const char *p2;
452 const char *p3;
453 const char *p4;
454
455 p2 = p1;
456 while (p2 < ptr
457 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
458 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
459 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
460 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
461 || *p2 == 'T'))
462 p2++;
463 p3 = p2;
464 while (p3 < ptr
465 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
466 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
467 || *p3 == 'Y'))
468 p3++;
469 p4 = p3;
470 while (p4 < ptr
471 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
472 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
473 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
474 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
475 || *p4 == 'S' || *p4 == 'T'))
476 p4++;
477 if (p4 == ptr)
478 {
479 size_t n1 = p2 - p1;
480 size_t n2 = p3 - p2;
481 size_t n3 = p4 - p3;
482
483 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
484 {
485 unsigned int index1;
486
487 for (index1 = 0; index1 < 19; index1++)
488 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
489 && jamo_initial_short_name[index1][n1] == '\0')
490 {
491 unsigned int index2;
492
493 for (index2 = 0; index2 < 21; index2++)
494 if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
495 && jamo_medial_short_name[index2][n2] == '\0')
496 {
497 unsigned int index3;
498
499 for (index3 = 0; index3 < 28; index3++)
500 if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
501 && jamo_final_short_name[index3][n3] == '\0')
502 {
503 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
504 }
505 break;
506 }
507 break;
508 }
509 }
510 }
511 }
512 /* Special case for CJK compatibility ideographs. Keeps the
513 tables small. */
514 if (wordptr == &words[2]
515 && words[0] == UNICODE_CHARNAME_WORD_CJK
516 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
517 && p1 + 14 <= ptr
518 && p1 + 15 >= ptr
519 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
520 {
521 const char *p2 = p1 + 10;
522
523 if (*p2 != '0')
524 {
525 unsigned int c = 0;
526
527 for (;;)
528 {
529 if (*p2 >= '0' && *p2 <= '9')
530 c += (*p2 - '0');
531 else if (*p2 >= 'A' && *p2 <= 'F')
532 c += (*p2 - 'A' + 10);
533 else
534 break;
535 p2++;
536 if (p2 == ptr)
537 {
538 if ((c >= 0xF900 && c <= 0xFA2D)
539 || (c >= 0xFA30 && c <= 0xFA6A)
540 || (c >= 0xFA70 && c <= 0xFAD9)
541 || (c >= 0x2F800 && c <= 0x2FA1D))
542 return c;
543 else
544 break;
545 }
546 c = c << 4;
547 }
548 }
549 }
550 /* Special case for variation selectors. Keeps the
551 tables small. */
552 if (wordptr == &words[1]
553 && words[0] == UNICODE_CHARNAME_WORD_VARIATION
554 && p1 + 10 <= ptr
555 && p1 + 12 >= ptr
556 && memcmp (p1, "SELECTOR-", 9) == 0)
557 {
558 const char *p2 = p1 + 9;
559
560 if (*p2 != '0')
561 {
562 unsigned int c = 0;
563
564 for (;;)
565 {
566 if (*p2 >= '0' && *p2 <= '9')
567 c += (*p2 - '0');
568 p2++;
569 if (p2 == ptr)
570 {
571 if (c >= 1 && c <= 16)
572 return c - 1 + 0xFE00;
573 else if (c >= 17 && c <= 256)
574 return c - 17 + 0xE0100;
575 else
576 break;
577 }
578 c = c * 10;
579 }
580 }
581 }
582 }
583 }
584 if (false)
585 filled_words:
586 {
587 /* Multiply by 2, to simplify later comparisons. */
588 size_t words_length = wordptr - words;
589 {
590 size_t i = words_length - 1;
591 words[i] = 2 * words[i];
592 for (; i > 0; )
593 {
594 --i;
595 words[i] = 2 * words[i] + 1;
596 }
597 }
598 /* Binary search in unicode_name_to_index. */
599 {
600 unsigned int i1 = 0;
601 unsigned int i2 = SIZEOF (unicode_name_to_index);
602 for (;;)
603 {
604 unsigned int i = (i1 + i2) >> 1;
605 const uint16_t *w = words;
606 const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
607 size_t n = words_length;
608 for (;;)
609 {
610 if (*p < *w)
611 {
612 if (i1 == i)
613 goto name_not_found;
614 /* Note here: i1 < i < i2. */
615 i1 = i;
616 break;
617 }
618 else if (*p > *w)
619 {
620 if (i2 == i)
621 goto name_not_found;
622 /* Note here: i1 <= i < i2. */
623 i2 = i;
624 break;
625 }
626 p++; w++; n--;
627 if (n == 0)
628 return unicode_index_to_code (unicode_name_to_index[i].index);
629 }
630 }
631 }
632 name_not_found: ;
633 }
634 }
635 }
636 }
637 return UNINAME_INVALID;
638 }