root/maint/gnulib/lib/uninorm/u-normalize-internal.h

/* [previous][next][first][last][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. FUNC

   1 /* Decomposition and composition of Unicode strings.
   2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4 
   5    This file is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU Lesser General Public License as
   7    published by the Free Software Foundation; either version 2.1 of the
   8    License, or (at your option) any later version.
   9 
  10    This file is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU Lesser General Public License for more details.
  14 
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17 
  18 UNIT *
  19 FUNC (uninorm_t nf, const UNIT *s, size_t n,
     /* [previous][next][first][last][top][bottom][index][help] */
  20       UNIT *resultbuf, size_t *lengthp)
  21 {
  22   int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
  23   ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
  24 
  25   /* The result being accumulated.  */
  26   UNIT *result;
  27   size_t length;
  28   size_t allocated;
  29   /* The buffer for sorting.  */
  30   #define SORTBUF_PREALLOCATED 64
  31   struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
  32   struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
  33   size_t sortbuf_allocated;
  34   size_t sortbuf_count;
  35 
  36   /* Initialize the accumulator.  */
  37   if (resultbuf == NULL)
  38     {
  39       result = NULL;
  40       allocated = 0;
  41     }
  42   else
  43     {
  44       result = resultbuf;
  45       allocated = *lengthp;
  46     }
  47   length = 0;
  48 
  49   /* Initialize the buffer for sorting.  */
  50   sortbuf = sortbuf_preallocated;
  51   sortbuf_allocated = SORTBUF_PREALLOCATED;
  52   sortbuf_count = 0;
  53 
  54   {
  55     const UNIT *s_end = s + n;
  56 
  57     for (;;)
  58       {
  59         int count;
  60         ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
  61         int decomposed_count;
  62         int i;
  63 
  64         if (s < s_end)
  65           {
  66             /* Fetch the next character.  */
  67             count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
  68             decomposed_count = 1;
  69 
  70             /* Decompose it, recursively.
  71                It would be possible to precompute the recursive decomposition
  72                and store it in a table.  But this would significantly increase
  73                the size of the decomposition tables, because for example for
  74                U+1FC1 the recursive canonical decomposition and the recursive
  75                compatibility decomposition are different.  */
  76             {
  77               int curr;
  78 
  79               for (curr = 0; curr < decomposed_count; )
  80                 {
  81                   /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
  82                      all elements are atomic.  */
  83                   ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
  84                   int curr_decomposed_count;
  85 
  86                   curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
  87                   if (curr_decomposed_count >= 0)
  88                     {
  89                       /* Move curr_decomposed[0..curr_decomposed_count-1] over
  90                          decomposed[curr], making room.  It's not worth using
  91                          memcpy() here, since the counts are so small.  */
  92                       int shift = curr_decomposed_count - 1;
  93 
  94                       if (shift < 0)
  95                         abort ();
  96                       if (shift > 0)
  97                         {
  98                           int j;
  99 
 100                           decomposed_count += shift;
 101                           if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
 102                             abort ();
 103                           for (j = decomposed_count - 1 - shift; j > curr; j--)
 104                             decomposed[j + shift] = decomposed[j];
 105                         }
 106                       for (; shift >= 0; shift--)
 107                         decomposed[curr + shift] = curr_decomposed[shift];
 108                     }
 109                   else
 110                     {
 111                       /* decomposed[curr] is atomic.  */
 112                       curr++;
 113                     }
 114                 }
 115             }
 116           }
 117         else
 118           {
 119             count = 0;
 120             decomposed_count = 0;
 121           }
 122 
 123         i = 0;
 124         for (;;)
 125           {
 126             ucs4_t uc;
 127             int ccc;
 128 
 129             if (s < s_end)
 130               {
 131                 /* Fetch the next character from the decomposition.  */
 132                 if (i == decomposed_count)
 133                   break;
 134                 uc = decomposed[i];
 135                 ccc = uc_combining_class (uc);
 136               }
 137             else
 138               {
 139                 /* End of string reached.  */
 140                 uc = 0;
 141                 ccc = 0;
 142               }
 143 
 144             if (ccc == 0)
 145               {
 146                 size_t j;
 147 
 148                 /* Apply the canonical ordering algorithm to the accumulated
 149                    sequence of characters.  */
 150                 if (sortbuf_count > 1)
 151                   gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
 152                                                            sortbuf + sortbuf_count);
 153 
 154                 if (composer != NULL)
 155                   {
 156                     /* Attempt to combine decomposed characters, as specified
 157                        in the Unicode Standard Annex #15 "Unicode Normalization
 158                        Forms".  We need to check
 159                          1. whether the first accumulated character is a
 160                             "starter" (i.e. has ccc = 0).  This is usually the
 161                             case.  But when the string starts with a
 162                             non-starter, the sortbuf also starts with a
 163                             non-starter.  Btw, this check could also be
 164                             omitted, because the composition table has only
 165                             entries (code1, code2) for which code1 is a
 166                             starter; if the first accumulated character is not
 167                             a starter, no lookup will succeed.
 168                          2. If the sortbuf has more than one character, check
 169                             for each of these characters that are not "blocked"
 170                             from the starter (i.e. have a ccc that is higher
 171                             than the ccc of the previous character) whether it
 172                             can be combined with the first character.
 173                          3. If only one character is left in sortbuf, check
 174                             whether it can be combined with the next character
 175                             (also a starter).  */
 176                     if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
 177                       {
 178                         for (j = 1; j < sortbuf_count; )
 179                           {
 180                             if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
 181                               {
 182                                 ucs4_t combined =
 183                                   composer (sortbuf[0].code, sortbuf[j].code);
 184                                 if (combined)
 185                                   {
 186                                     size_t k;
 187 
 188                                     sortbuf[0].code = combined;
 189                                     /* sortbuf[0].ccc = 0, still valid.  */
 190                                     for (k = j + 1; k < sortbuf_count; k++)
 191                                       sortbuf[k - 1] = sortbuf[k];
 192                                     sortbuf_count--;
 193                                     continue;
 194                                   }
 195                               }
 196                             j++;
 197                           }
 198                         if (s < s_end && sortbuf_count == 1)
 199                           {
 200                             ucs4_t combined =
 201                               composer (sortbuf[0].code, uc);
 202                             if (combined)
 203                               {
 204                                 uc = combined;
 205                                 ccc = 0;
 206                                 /* uc could be further combined with subsequent
 207                                    characters.  So don't put it into sortbuf[0] in
 208                                    this round, only in the next round.  */
 209                                 sortbuf_count = 0;
 210                               }
 211                           }
 212                       }
 213                   }
 214 
 215                 for (j = 0; j < sortbuf_count; j++)
 216                   {
 217                     ucs4_t muc = sortbuf[j].code;
 218 
 219                     /* Append muc to the result accumulator.  */
 220                     if (length < allocated)
 221                       {
 222                         int ret =
 223                           U_UCTOMB (result + length, muc, allocated - length);
 224                         if (ret == -1)
 225                           {
 226                             errno = EINVAL;
 227                             goto fail;
 228                           }
 229                         if (ret >= 0)
 230                           {
 231                             length += ret;
 232                             goto done_appending;
 233                           }
 234                       }
 235                     {
 236                       size_t old_allocated = allocated;
 237                       size_t new_allocated = 2 * old_allocated;
 238                       if (new_allocated < 64)
 239                         new_allocated = 64;
 240                       if (new_allocated < old_allocated) /* integer overflow? */
 241                         abort ();
 242                       {
 243                         UNIT *larger_result;
 244                         if (result == NULL)
 245                           {
 246                             larger_result =
 247                               (UNIT *) malloc (new_allocated * sizeof (UNIT));
 248                             if (larger_result == NULL)
 249                               {
 250                                 errno = ENOMEM;
 251                                 goto fail;
 252                               }
 253                           }
 254                         else if (result == resultbuf)
 255                           {
 256                             larger_result =
 257                               (UNIT *) malloc (new_allocated * sizeof (UNIT));
 258                             if (larger_result == NULL)
 259                               {
 260                                 errno = ENOMEM;
 261                                 goto fail;
 262                               }
 263                             U_CPY (larger_result, resultbuf, length);
 264                           }
 265                         else
 266                           {
 267                             larger_result =
 268                               (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
 269                             if (larger_result == NULL)
 270                               {
 271                                 errno = ENOMEM;
 272                                 goto fail;
 273                               }
 274                           }
 275                         result = larger_result;
 276                         allocated = new_allocated;
 277                         {
 278                           int ret =
 279                             U_UCTOMB (result + length, muc, allocated - length);
 280                           if (ret == -1)
 281                             {
 282                               errno = EINVAL;
 283                               goto fail;
 284                             }
 285                           if (ret < 0)
 286                             abort ();
 287                           length += ret;
 288                           goto done_appending;
 289                         }
 290                       }
 291                     }
 292                    done_appending: ;
 293                   }
 294 
 295                 /* sortbuf is now empty.  */
 296                 sortbuf_count = 0;
 297               }
 298 
 299             if (!(s < s_end))
 300               /* End of string reached.  */
 301               break;
 302 
 303             /* Append (uc, ccc) to sortbuf.  */
 304             if (sortbuf_count == sortbuf_allocated)
 305               {
 306                 struct ucs4_with_ccc *new_sortbuf;
 307 
 308                 sortbuf_allocated = 2 * sortbuf_allocated;
 309                 if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
 310                   abort ();
 311                 new_sortbuf =
 312                   (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
 313                 if (new_sortbuf == NULL)
 314                   {
 315                     errno = ENOMEM;
 316                     goto fail;
 317                   }
 318                 memcpy (new_sortbuf, sortbuf,
 319                         sortbuf_count * sizeof (struct ucs4_with_ccc));
 320                 if (sortbuf != sortbuf_preallocated)
 321                   free (sortbuf);
 322                 sortbuf = new_sortbuf;
 323               }
 324             sortbuf[sortbuf_count].code = uc;
 325             sortbuf[sortbuf_count].ccc = ccc;
 326             sortbuf_count++;
 327 
 328             i++;
 329           }
 330 
 331         if (!(s < s_end))
 332           /* End of string reached.  */
 333           break;
 334 
 335         s += count;
 336       }
 337   }
 338 
 339   if (length == 0)
 340     {
 341       if (result == NULL)
 342         {
 343           /* Return a non-NULL value.  NULL means error.  */
 344           result = (UNIT *) malloc (1);
 345           if (result == NULL)
 346             {
 347               errno = ENOMEM;
 348               goto fail;
 349             }
 350         }
 351     }
 352   else if (result != resultbuf && length < allocated)
 353     {
 354       /* Shrink the allocated memory if possible.  */
 355       UNIT *memory;
 356 
 357       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
 358       if (memory != NULL)
 359         result = memory;
 360     }
 361 
 362   if (sortbuf_count > 0)
 363     abort ();
 364   if (sortbuf != sortbuf_preallocated)
 365     free (sortbuf);
 366 
 367   *lengthp = length;
 368   return result;
 369 
 370  fail:
 371   {
 372     int saved_errno = errno;
 373     if (sortbuf != sortbuf_preallocated)
 374       free (sortbuf);
 375     if (result != resultbuf)
 376       free (result);
 377     errno = saved_errno;
 378   }
 379   return NULL;
 380 }

/* [previous][next][first][last][top][bottom][index][help] */