root/maint/gnulib/lib/uniname/uniname.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. unicode_name_word
  2. unicode_name_word_lookup
  3. unicode_code_to_index
  4. unicode_index_to_code
  5. unicode_character_name
  6. unicode_name_character

   1 /* Association between Unicode characters and their names.
   2    Copyright (C) 2000-2002, 2005-2007, 2009-2021 Free Software Foundation, Inc.
   3 
   4    This file is free software.
   5    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   6    You can redistribute it and/or modify it under either
   7      - the terms of the GNU Lesser General Public License as published
   8        by the Free Software Foundation; either version 3, or (at your
   9        option) any later version, or
  10      - the terms of the GNU General Public License as published by the
  11        Free Software Foundation; either version 2, or (at your option)
  12        any later version, or
  13      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  14 
  15    This file is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    Lesser General Public License and the GNU General Public License
  19    for more details.
  20 
  21    You should have received a copy of the GNU Lesser General Public
  22    License and of the GNU General Public License along with this
  23    program.  If not, see <https://www.gnu.org/licenses/>.  */
  24 
  25 #include <config.h>
  26 
  27 /* Specification.  */
  28 #include "uniname.h"
  29 
  30 #include <assert.h>
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <string.h>
  35 
  36 #include "attribute.h"
  37 
  38 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  39 
  40 
  41 /* Table of Unicode character names, derived from UnicodeData.txt.
  42    This table is generated in a way to minimize the memory footprint:
  43      1. its compiled size is small (less than 350 KB),
  44      2. it resides entirely in the text or read-only data segment of the
  45         executable or shared library: the table contains only immediate
  46         integers, no pointers, and the functions don't do heap allocation.
  47  */
  48 #include "uninames.h"
  49 /* It contains:
  50   static const char unicode_name_words[36303] = ...;
  51   #define UNICODE_CHARNAME_NUM_WORDS 6260
  52   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  53   #define UNICODE_CHARNAME_WORD_HANGUL 3902
  54   #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
  55   #define UNICODE_CHARNAME_WORD_CJK 417
  56   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
  57   static const uint16_t unicode_names[68940] = ...;
  58   static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
  59   static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
  60   #define UNICODE_CHARNAME_MAX_LENGTH 83
  61   #define UNICODE_CHARNAME_MAX_WORDS 13
  62   static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
  63 */
  64 
  65 /* Returns the word with a given index.  */
  66 static const char *
  67 unicode_name_word (unsigned int index, unsigned int *lengthp)
     /* [previous][next][first][last][top][bottom][index][help] */
  68 {
  69   unsigned int i1;
  70   unsigned int i2;
  71 
  72   assert (index < UNICODE_CHARNAME_NUM_WORDS);
  73 
  74   /* Binary search for i with
  75        unicode_name_by_length[i].ind_offset <= index
  76      and
  77        index < unicode_name_by_length[i+1].ind_offset
  78    */
  79 
  80   i1 = 0;
  81   i2 = SIZEOF (unicode_name_by_length) - 1;
  82   while (i2 - i1 > 1)
  83     {
  84       unsigned int i = (i1 + i2) >> 1;
  85       if (unicode_name_by_length[i].ind_offset <= index)
  86         i1 = i;
  87       else
  88         i2 = i;
  89     }
  90   unsigned int i = i1;
  91   assert (unicode_name_by_length[i].ind_offset <= index
  92           && index < unicode_name_by_length[i+1].ind_offset);
  93   *lengthp = i;
  94   return &unicode_name_words[unicode_name_by_length[i].extra_offset
  95                              + (index-unicode_name_by_length[i].ind_offset)*i];
  96 }
  97 
  98 /* Looks up the index of a word.  */
  99 static int
 100 unicode_name_word_lookup (const char *word, size_t length)
     /* [previous][next][first][last][top][bottom][index][help] */
 101 {
 102   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
 103     {
 104       /* Binary search among the words of given length.  */
 105       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
 106       unsigned int i0 = unicode_name_by_length[length].ind_offset;
 107       unsigned int i1 = i0;
 108       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
 109       while (i2 - i1 > 0)
 110         {
 111           unsigned int i = (i1 + i2) >> 1;
 112           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
 113           const char *w = word;
 114           unsigned int n = length;
 115           for (;;)
 116             {
 117               if (*p < *w)
 118                 {
 119                   if (i1 == i)
 120                     return -1;
 121                   /* Note here: i1 < i < i2.  */
 122                   i1 = i;
 123                   break;
 124                 }
 125               if (*p > *w)
 126                 {
 127                   /* Note here: i1 <= i < i2.  */
 128                   i2 = i;
 129                   break;
 130                 }
 131               p++; w++; n--;
 132               if (n == 0)
 133                 return i;
 134             }
 135         }
 136     }
 137   return -1;
 138 }
 139 
 140 #define UNINAME_INVALID_INDEX UINT16_MAX
 141 
 142 /* Looks up the internal index of a Unicode character.  */
 143 static uint16_t
 144 unicode_code_to_index (ucs4_t c)
     /* [previous][next][first][last][top][bottom][index][help] */
 145 {
 146   /* Binary search in unicode_ranges.  */
 147   unsigned int i1 = 0;
 148   unsigned int i2 = SIZEOF (unicode_ranges);
 149 
 150   for (;;)
 151     {
 152       unsigned int i = (i1 + i2) >> 1;
 153       ucs4_t start_code =
 154         unicode_ranges[i].index + unicode_ranges[i].gap;
 155       ucs4_t end_code =
 156         start_code + unicode_ranges[i].length - 1;
 157 
 158       if (start_code <= c && c <= end_code)
 159         return c - unicode_ranges[i].gap;
 160 
 161       if (end_code < c)
 162         {
 163           if (i1 == i)
 164             break;
 165           /* Note here: i1 < i < i2.  */
 166           i1 = i;
 167         }
 168       else if (c < start_code)
 169         {
 170           if (i2 == i)
 171             break;
 172           /* Note here: i1 <= i < i2.  */
 173           i2 = i;
 174         }
 175     }
 176   return UNINAME_INVALID_INDEX;
 177 }
 178 
 179 /* Looks up the codepoint of a Unicode character, from the given
 180    internal index.  */
 181 static ucs4_t
 182 unicode_index_to_code (uint16_t index)
     /* [previous][next][first][last][top][bottom][index][help] */
 183 {
 184   /* Binary search in unicode_ranges.  */
 185   unsigned int i1 = 0;
 186   unsigned int i2 = SIZEOF (unicode_ranges);
 187 
 188   for (;;)
 189     {
 190       unsigned int i = (i1 + i2) >> 1;
 191       uint16_t start_index = unicode_ranges[i].index;
 192       uint16_t end_index = start_index + unicode_ranges[i].length - 1;
 193 
 194       if (start_index <= index && index <= end_index)
 195         return index + unicode_ranges[i].gap;
 196 
 197       if (end_index < index)
 198         {
 199           if (i1 == i)
 200             break;
 201           /* Note here: i1 < i < i2.  */
 202           i1 = i;
 203         }
 204       else if (index < start_index)
 205         {
 206           if (i2 == i)
 207             break;
 208           /* Note here: i1 <= i < i2.  */
 209           i2 = i;
 210         }
 211     }
 212   return UNINAME_INVALID;
 213 }
 214 
 215 
 216 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 217    sections 3.11 and 4.4.  */
 218 static const char jamo_initial_short_name[19][3] =
 219 {
 220   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
 221   "C", "K", "T", "P", "H"
 222 };
 223 static const char jamo_medial_short_name[21][4] =
 224 {
 225   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
 226   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 227 };
 228 static const char jamo_final_short_name[28][3] =
 229 {
 230   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
 231   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 232 };
 233 
 234 /* Looks up the name of a Unicode character, in uppercase ASCII.
 235    Returns the filled buf, or NULL if the character does not have a name.  */
 236 char *
 237 unicode_character_name (ucs4_t c, char *buf)
     /* [previous][next][first][last][top][bottom][index][help] */
 238 {
 239   if (c >= 0xAC00 && c <= 0xD7A3)
 240     {
 241       /* Special case for Hangul syllables. Keeps the tables small.  */
 242       char *ptr;
 243       unsigned int tmp;
 244       unsigned int index1;
 245       unsigned int index2;
 246       unsigned int index3;
 247       const char *q;
 248 
 249       /* buf needs to have at least 16 + 7 bytes here.  */
 250       memcpy (buf, "HANGUL SYLLABLE ", 16);
 251       ptr = buf + 16;
 252 
 253       tmp = c - 0xAC00;
 254       index3 = tmp % 28; tmp = tmp / 28;
 255       index2 = tmp % 21; tmp = tmp / 21;
 256       index1 = tmp;
 257 
 258       q = jamo_initial_short_name[index1];
 259       while (*q != '\0')
 260         *ptr++ = *q++;
 261       q = jamo_medial_short_name[index2];
 262       while (*q != '\0')
 263         *ptr++ = *q++;
 264       q = jamo_final_short_name[index3];
 265       while (*q != '\0')
 266         *ptr++ = *q++;
 267       *ptr = '\0';
 268       return buf;
 269     }
 270   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
 271            || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
 272     {
 273       /* Special case for CJK compatibility ideographs. Keeps the tables
 274          small.  */
 275       char *ptr;
 276       int i;
 277 
 278       /* buf needs to have at least 28 + 5 bytes here.  */
 279       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
 280       ptr = buf + 28;
 281 
 282       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
 283         {
 284           unsigned int x = (c >> i) & 0xf;
 285           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
 286         }
 287       *ptr = '\0';
 288       return buf;
 289     }
 290   else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
 291     {
 292       /* Special case for variation selectors. Keeps the tables
 293          small.  */
 294 
 295       /* buf needs to have at least 19 + 3 bytes here.  */
 296       sprintf (buf, "VARIATION SELECTOR-%d",
 297                c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
 298       return buf;
 299     }
 300   else
 301     {
 302       uint16_t index = unicode_code_to_index (c);
 303       const uint16_t *words = NULL;
 304 
 305       if (index != UNINAME_INVALID_INDEX)
 306         {
 307           /* Binary search in unicode_code_to_name.  */
 308           unsigned int i1 = 0;
 309           unsigned int i2 = SIZEOF (unicode_index_to_name);
 310           for (;;)
 311             {
 312               unsigned int i = (i1 + i2) >> 1;
 313               if (unicode_index_to_name[i].index == index)
 314                 {
 315                   words = &unicode_names[unicode_index_to_name[i].name];
 316                   break;
 317                 }
 318               else if (unicode_index_to_name[i].index < index)
 319                 {
 320                   if (i1 == i)
 321                     {
 322                       words = NULL;
 323                       break;
 324                     }
 325                   /* Note here: i1 < i < i2.  */
 326                   i1 = i;
 327                 }
 328               else if (unicode_index_to_name[i].index > index)
 329                 {
 330                   if (i2 == i)
 331                     {
 332                       words = NULL;
 333                       break;
 334                     }
 335                   /* Note here: i1 <= i < i2.  */
 336                   i2 = i;
 337                 }
 338             }
 339         }
 340       if (words != NULL)
 341         {
 342           /* Found it in unicode_index_to_name. Now concatenate the words.  */
 343           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
 344           char *ptr = buf;
 345           for (;;)
 346             {
 347               unsigned int wordlen;
 348               const char *word = unicode_name_word (*words>>1, &wordlen);
 349               do
 350                 *ptr++ = *word++;
 351               while (--wordlen > 0);
 352               if ((*words & 1) == 0)
 353                 break;
 354               *ptr++ = ' ';
 355               words++;
 356             }
 357           *ptr = '\0';
 358           return buf;
 359         }
 360       return NULL;
 361     }
 362 }
 363 
 364 /* Looks up the Unicode character with a given name, in upper- or lowercase
 365    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 366 ucs4_t
 367 unicode_name_character (const char *name)
     /* [previous][next][first][last][top][bottom][index][help] */
 368 {
 369   size_t len = strlen (name);
 370   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
 371     {
 372       /* Test for "word1 word2 ..." syntax.  */
 373       char buf[UNICODE_CHARNAME_MAX_LENGTH];
 374       char *ptr = buf;
 375       for (;;)
 376         {
 377           char c = *name++;
 378           if (!(c >= ' ' && c <= '~'))
 379             break;
 380           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 381           if (--len == 0)
 382             goto filled_buf;
 383         }
 384       if (false)
 385       filled_buf:
 386         {
 387           {
 388             /* Special case for variation selector aliases. Keeps the
 389                tables small.  */
 390             const char *p1 = buf;
 391             if (ptr >= buf + 3 && *p1++ == 'V')
 392               {
 393                 if (*p1++ == 'S')
 394                   {
 395                     if (*p1 != '0')
 396                       {
 397                         unsigned int c = 0;
 398                         for (;;)
 399                           {
 400                             if (*p1 >= '0' && *p1 <= '9')
 401                               c += (*p1 - '0');
 402                             p1++;
 403                             if (p1 == ptr)
 404                               {
 405                                 if (c >= 1 && c <= 16)
 406                                   return c - 1 + 0xFE00;
 407                                 else if (c >= 17 && c <= 256)
 408                                   return c - 17 + 0xE0100;
 409                                 else
 410                                   break;
 411                               }
 412                             c = c * 10;
 413                           }
 414                       }
 415                   }
 416               }
 417           }
 418           {
 419             /* Convert the constituents to uint16_t words.  */
 420             uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
 421             uint16_t *wordptr = words;
 422             {
 423               const char *p1 = buf;
 424               for (;;)
 425                 {
 426                   {
 427                     int word;
 428                     const char *p2 = p1;
 429                     while (p2 < ptr && *p2 != ' ')
 430                       p2++;
 431                     word = unicode_name_word_lookup (p1, p2 - p1);
 432                     if (word < 0)
 433                       break;
 434                     if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
 435                       break;
 436                     *wordptr++ = word;
 437                     if (p2 == ptr)
 438                       goto filled_words;
 439                     p1 = p2 + 1;
 440                   }
 441                   /* Special case for Hangul syllables. Keeps the tables small. */
 442                   if (wordptr == &words[2]
 443                       && words[0] == UNICODE_CHARNAME_WORD_HANGUL
 444                       && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
 445                     {
 446                       /* Split the last word [p1..ptr) into three parts:
 447                            1) [BCDGHJKMNPRST]
 448                            2) [AEIOUWY]
 449                            3) [BCDGHIJKLMNPST]
 450                        */
 451                       const char *p2;
 452                       const char *p3;
 453                       const char *p4;
 454 
 455                       p2 = p1;
 456                       while (p2 < ptr
 457                              && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
 458                                  || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
 459                                  || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
 460                                  || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
 461                                  || *p2 == 'T'))
 462                         p2++;
 463                       p3 = p2;
 464                       while (p3 < ptr
 465                              && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
 466                                  || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
 467                                  || *p3 == 'Y'))
 468                         p3++;
 469                       p4 = p3;
 470                       while (p4 < ptr
 471                              && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
 472                                  || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
 473                                  || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
 474                                  || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
 475                                  || *p4 == 'S' || *p4 == 'T'))
 476                         p4++;
 477                       if (p4 == ptr)
 478                         {
 479                           size_t n1 = p2 - p1;
 480                           size_t n2 = p3 - p2;
 481                           size_t n3 = p4 - p3;
 482 
 483                           if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
 484                             {
 485                               unsigned int index1;
 486 
 487                               for (index1 = 0; index1 < 19; index1++)
 488                                 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
 489                                     && jamo_initial_short_name[index1][n1] == '\0')
 490                                   {
 491                                     unsigned int index2;
 492 
 493                                     for (index2 = 0; index2 < 21; index2++)
 494                                       if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
 495                                           && jamo_medial_short_name[index2][n2] == '\0')
 496                                         {
 497                                           unsigned int index3;
 498 
 499                                           for (index3 = 0; index3 < 28; index3++)
 500                                             if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
 501                                                 && jamo_final_short_name[index3][n3] == '\0')
 502                                               {
 503                                                 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
 504                                               }
 505                                           break;
 506                                         }
 507                                     break;
 508                                   }
 509                             }
 510                         }
 511                     }
 512                   /* Special case for CJK compatibility ideographs. Keeps the
 513                      tables small.  */
 514                   if (wordptr == &words[2]
 515                       && words[0] == UNICODE_CHARNAME_WORD_CJK
 516                       && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
 517                       && p1 + 14 <= ptr
 518                       && p1 + 15 >= ptr
 519                       && memcmp (p1, "IDEOGRAPH-", 10) == 0)
 520                     {
 521                       const char *p2 = p1 + 10;
 522 
 523                       if (*p2 != '0')
 524                         {
 525                           unsigned int c = 0;
 526 
 527                           for (;;)
 528                             {
 529                               if (*p2 >= '0' && *p2 <= '9')
 530                                 c += (*p2 - '0');
 531                               else if (*p2 >= 'A' && *p2 <= 'F')
 532                                 c += (*p2 - 'A' + 10);
 533                               else
 534                                 break;
 535                               p2++;
 536                               if (p2 == ptr)
 537                                 {
 538                                   if ((c >= 0xF900 && c <= 0xFA2D)
 539                                       || (c >= 0xFA30 && c <= 0xFA6A)
 540                                       || (c >= 0xFA70 && c <= 0xFAD9)
 541                                       || (c >= 0x2F800 && c <= 0x2FA1D))
 542                                     return c;
 543                                   else
 544                                     break;
 545                                 }
 546                               c = c << 4;
 547                             }
 548                         }
 549                     }
 550                   /* Special case for variation selectors. Keeps the
 551                      tables small.  */
 552                   if (wordptr == &words[1]
 553                       && words[0] == UNICODE_CHARNAME_WORD_VARIATION
 554                       && p1 + 10 <= ptr
 555                       && p1 + 12 >= ptr
 556                       && memcmp (p1, "SELECTOR-", 9) == 0)
 557                     {
 558                       const char *p2 = p1 + 9;
 559 
 560                       if (*p2 != '0')
 561                         {
 562                           unsigned int c = 0;
 563 
 564                           for (;;)
 565                             {
 566                               if (*p2 >= '0' && *p2 <= '9')
 567                                 c += (*p2 - '0');
 568                               p2++;
 569                               if (p2 == ptr)
 570                                 {
 571                                   if (c >= 1 && c <= 16)
 572                                     return c - 1 + 0xFE00;
 573                                   else if (c >= 17 && c <= 256)
 574                                     return c - 17 + 0xE0100;
 575                                   else
 576                                     break;
 577                                 }
 578                               c = c * 10;
 579                             }
 580                         }
 581                     }
 582                 }
 583             }
 584             if (false)
 585             filled_words:
 586               {
 587                 /* Multiply by 2, to simplify later comparisons.  */
 588                 size_t words_length = wordptr - words;
 589                 {
 590                   size_t i = words_length - 1;
 591                   words[i] = 2 * words[i];
 592                   for (; i > 0; )
 593                     {
 594                       --i;
 595                       words[i] = 2 * words[i] + 1;
 596                     }
 597                 }
 598                 /* Binary search in unicode_name_to_index.  */
 599                 {
 600                   unsigned int i1 = 0;
 601                   unsigned int i2 = SIZEOF (unicode_name_to_index);
 602                   for (;;)
 603                     {
 604                       unsigned int i = (i1 + i2) >> 1;
 605                       const uint16_t *w = words;
 606                       const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
 607                       size_t n = words_length;
 608                       for (;;)
 609                         {
 610                           if (*p < *w)
 611                             {
 612                               if (i1 == i)
 613                                 goto name_not_found;
 614                               /* Note here: i1 < i < i2.  */
 615                               i1 = i;
 616                               break;
 617                             }
 618                           else if (*p > *w)
 619                             {
 620                               if (i2 == i)
 621                                 goto name_not_found;
 622                               /* Note here: i1 <= i < i2.  */
 623                               i2 = i;
 624                               break;
 625                             }
 626                           p++; w++; n--;
 627                           if (n == 0)
 628                             return unicode_index_to_code (unicode_name_to_index[i].index);
 629                         }
 630                     }
 631                 }
 632               name_not_found: ;
 633               }
 634           }
 635         }
 636     }
 637   return UNINAME_INVALID;
 638 }

/* [previous][next][first][last][top][bottom][index][help] */