root/maint/gnulib/lib/uninorm/canonical-decomposition.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. uc_canonical_decomposition

   1 /* Canonical decomposition of Unicode characters.
   2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4 
   5    This file is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU Lesser General Public License as
   7    published by the Free Software Foundation; either version 2.1 of the
   8    License, or (at your option) any later version.
   9 
  10    This file is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU Lesser General Public License for more details.
  14 
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17 
  18 #include <config.h>
  19 
  20 /* Specification.  */
  21 #include "uninorm.h"
  22 
  23 #include <stdlib.h>
  24 
  25 #include "uninorm/decomposition-table.h"
  26 
  27 int
  28 uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition)
     /* [previous][next][first][last][top][bottom][index][help] */
  29 {
  30   if (uc >= 0xAC00 && uc < 0xD7A4)
  31     {
  32       /* Hangul syllable.  See Unicode standard, chapter 3, section
  33          "Hangul Syllable Decomposition",  See also the clarification at
  34          <https://www.unicode.org/versions/Unicode5.1.0/>, section
  35          "Clarification of Hangul Jamo Handling".  */
  36       unsigned int t;
  37 
  38       uc -= 0xAC00;
  39       t = uc % 28;
  40 
  41       if (t == 0)
  42         {
  43           unsigned int v, l;
  44 
  45           uc = uc / 28;
  46           v = uc % 21;
  47           l = uc / 21;
  48 
  49           decomposition[0] = 0x1100 + l;
  50           decomposition[1] = 0x1161 + v;
  51           return 2;
  52         }
  53       else
  54         {
  55 #if 1 /* Return the pairwise decomposition, not the full decomposition.  */
  56           decomposition[0] = 0xAC00 + uc - t; /* = 0xAC00 + (l * 21 + v) * 28; */
  57           decomposition[1] = 0x11A7 + t;
  58           return 2;
  59 #else
  60           unsigned int v, l;
  61 
  62           uc = uc / 28;
  63           v = uc % 21;
  64           l = uc / 21;
  65 
  66           decomposition[0] = 0x1100 + l;
  67           decomposition[1] = 0x1161 + v;
  68           decomposition[2] = 0x11A7 + t;
  69           return 3;
  70 #endif
  71         }
  72     }
  73   else if (uc < 0x110000)
  74     {
  75       unsigned short entry = decomp_index (uc);
  76       /* An entry of (unsigned short)(-1) denotes an absent entry.
  77          Otherwise, bit 15 of the entry tells whether the decomposition
  78          is a canonical one.  */
  79       if (entry < 0x8000)
  80         {
  81           const unsigned char *p;
  82           unsigned int element;
  83           unsigned int length;
  84 
  85           p = &gl_uninorm_decomp_chars_table[3 * entry];
  86           element = (p[0] << 16) | (p[1] << 8) | p[2];
  87           /* The first element has 5 bits for the decomposition type.  */
  88           if (((element >> 18) & 0x1f) != UC_DECOMP_CANONICAL)
  89             abort ();
  90           length = 1;
  91           for (;;)
  92             {
  93               /* Every element has an 18 bits wide Unicode code point.  */
  94               *decomposition = element & 0x3ffff;
  95               /* Bit 23 tells whether there are more elements,  */
  96               if ((element & (1 << 23)) == 0)
  97                 break;
  98               p += 3;
  99               element = (p[0] << 16) | (p[1] << 8) | p[2];
 100               decomposition++;
 101               length++;
 102             }
 103           return length;
 104         }
 105     }
 106   return -1;
 107 }

/* [previous][next][first][last][top][bottom][index][help] */