root/maint/gnulib/lib/uninorm/uninorm-filter.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. uninorm_filter_create
  2. uninorm_filter_write
  3. uninorm_filter_flush
  4. uninorm_filter_free

   1 /* Stream-based normalization of Unicode strings.
   2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4 
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation; either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15 
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21 
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25 
  26 #include <config.h>
  27 
  28 /* Specification.  */
  29 #include "uninorm.h"
  30 
  31 #include <errno.h>
  32 #include <stddef.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 
  36 #include "unictype.h"
  37 #include "normalize-internal.h"
  38 #include "uninorm/decompose-internal.h"
  39 
  40 
  41 struct uninorm_filter
  42 {
  43   /* Characteristics of the normalization form.  */
  44   int (*decomposer) (ucs4_t uc, ucs4_t *decomposition);
  45   ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2);
  46 
  47   /* The encapsulated stream.  */
  48   int (*stream_func) (void *stream_data, ucs4_t uc);
  49   void *stream_data;
  50 
  51   /* The buffer for sorting.  */
  52   #define SORTBUF_PREALLOCATED 64
  53   struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
  54   struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
  55   size_t sortbuf_allocated;
  56   size_t sortbuf_count;
  57 };
  58 
  59 struct uninorm_filter *
  60 uninorm_filter_create (uninorm_t nf,
     /* [previous][next][first][last][top][bottom][index][help] */
  61                        int (*stream_func) (void *stream_data, ucs4_t uc),
  62                        void *stream_data)
  63 {
  64   struct uninorm_filter *filter =
  65     (struct uninorm_filter *) malloc (sizeof (struct uninorm_filter));
  66 
  67   if (filter == NULL)
  68     /* errno is ENOMEM. */
  69     return NULL;
  70 
  71   filter->decomposer = nf->decomposer;
  72   filter->composer = nf->composer;
  73   filter->stream_func = stream_func;
  74   filter->stream_data = stream_data;
  75   filter->sortbuf = filter->sortbuf_preallocated;
  76   filter->sortbuf_allocated = SORTBUF_PREALLOCATED;
  77   filter->sortbuf_count = 0;
  78 
  79   return filter;
  80 }
  81 
  82 int
  83 uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc_arg)
     /* [previous][next][first][last][top][bottom][index][help] */
  84 {
  85   ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
  86   int decomposed_count;
  87 
  88   /* Accept the next character.  */
  89   decomposed[0] = uc_arg;
  90   decomposed_count = 1;
  91 
  92   /* Decompose it, recursively.
  93      It would be possible to precompute the recursive decomposition
  94      and store it in a table.  But this would significantly increase
  95      the size of the decomposition tables, because for example for
  96      U+1FC1 the recursive canonical decomposition and the recursive
  97      compatibility decomposition are different.  */
  98   {
  99     int curr;
 100 
 101     for (curr = 0; curr < decomposed_count; )
 102       {
 103         /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
 104            all elements are atomic.  */
 105         ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
 106         int curr_decomposed_count;
 107 
 108         curr_decomposed_count =
 109           filter->decomposer (decomposed[curr], curr_decomposed);
 110         if (curr_decomposed_count >= 0)
 111           {
 112             /* Move curr_decomposed[0..curr_decomposed_count-1] over
 113                decomposed[curr], making room.  It's not worth using
 114                memcpy() here, since the counts are so small.  */
 115             int shift = curr_decomposed_count - 1;
 116 
 117             if (shift < 0)
 118               abort ();
 119             if (shift > 0)
 120               {
 121                 int j;
 122 
 123                 decomposed_count += shift;
 124                 if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
 125                   abort ();
 126                 for (j = decomposed_count - 1 - shift; j > curr; j--)
 127                   decomposed[j + shift] = decomposed[j];
 128               }
 129             for (; shift >= 0; shift--)
 130               decomposed[curr + shift] = curr_decomposed[shift];
 131           }
 132         else
 133           {
 134             /* decomposed[curr] is atomic.  */
 135             curr++;
 136           }
 137       }
 138   }
 139 
 140   {
 141     /* Cache sortbuf and sortbuf_count in local register variables.  */
 142     struct ucs4_with_ccc *sortbuf = filter->sortbuf;
 143     size_t sortbuf_count = filter->sortbuf_count;
 144     int i;
 145 
 146     for (i = 0; i < decomposed_count; i++)
 147       {
 148         /* Fetch the next character from the decomposition.  */
 149         ucs4_t uc = decomposed[i];
 150         int ccc = uc_combining_class (uc);
 151 
 152         if (ccc == 0)
 153           {
 154             size_t j;
 155 
 156             /* Apply the canonical ordering algorithm to the accumulated
 157                sequence of characters.  */
 158             if (sortbuf_count > 1)
 159               gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
 160                                                        sortbuf + sortbuf_count);
 161 
 162             if (filter->composer != NULL)
 163               {
 164                 /* Attempt to combine decomposed characters, as specified
 165                    in the Unicode Standard Annex #15 "Unicode Normalization
 166                    Forms".  We need to check
 167                      1. whether the first accumulated character is a
 168                         "starter" (i.e. has ccc = 0).  This is usually the
 169                         case.  But when the string starts with a
 170                         non-starter, the sortbuf also starts with a
 171                         non-starter.  Btw, this check could also be
 172                         omitted, because the composition table has only
 173                         entries (code1, code2) for which code1 is a
 174                         starter; if the first accumulated character is not
 175                         a starter, no lookup will succeed.
 176                      2. If the sortbuf has more than one character, check
 177                         for each of these characters that are not "blocked"
 178                         from the starter (i.e. have a ccc that is higher
 179                         than the ccc of the previous character) whether it
 180                         can be combined with the first character.
 181                      3. If only one character is left in sortbuf, check
 182                         whether it can be combined with the next character
 183                         (also a starter).  */
 184                 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
 185                   {
 186                     for (j = 1; j < sortbuf_count; )
 187                       {
 188                         if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
 189                           {
 190                             ucs4_t combined =
 191                               filter->composer (sortbuf[0].code, sortbuf[j].code);
 192                             if (combined)
 193                               {
 194                                 size_t k;
 195 
 196                                 sortbuf[0].code = combined;
 197                                 /* sortbuf[0].ccc = 0, still valid.  */
 198                                 for (k = j + 1; k < sortbuf_count; k++)
 199                                   sortbuf[k - 1] = sortbuf[k];
 200                                 sortbuf_count--;
 201                                 continue;
 202                               }
 203                           }
 204                         j++;
 205                       }
 206                     if (sortbuf_count == 1)
 207                       {
 208                         ucs4_t combined =
 209                           filter->composer (sortbuf[0].code, uc);
 210                         if (combined)
 211                           {
 212                             uc = combined;
 213                             ccc = 0;
 214                             /* uc could be further combined with subsequent
 215                                characters.  So don't put it into sortbuf[0] in
 216                                this round, only in the next round.  */
 217                             sortbuf_count = 0;
 218                           }
 219                       }
 220                   }
 221               }
 222 
 223             for (j = 0; j < sortbuf_count; j++)
 224               {
 225                 ucs4_t muc = sortbuf[j].code;
 226 
 227                 /* Output muc to the encapsulated stream.  */
 228                 int ret = filter->stream_func (filter->stream_data, muc);
 229                 if (ret < 0)
 230                   {
 231                     /* errno is set here.  */
 232                     filter->sortbuf_count = 0;
 233                     return -1;
 234                   }
 235               }
 236 
 237             /* sortbuf is now empty.  */
 238             sortbuf_count = 0;
 239           }
 240 
 241         /* Append (uc, ccc) to sortbuf.  */
 242         if (sortbuf_count == filter->sortbuf_allocated)
 243           {
 244             struct ucs4_with_ccc *new_sortbuf;
 245 
 246             filter->sortbuf_allocated = 2 * filter->sortbuf_allocated;
 247             if (filter->sortbuf_allocated < sortbuf_count) /* integer overflow? */
 248               abort ();
 249             new_sortbuf =
 250               (struct ucs4_with_ccc *)
 251               malloc (2 * filter->sortbuf_allocated * sizeof (struct ucs4_with_ccc));
 252             if (new_sortbuf == NULL)
 253               {
 254                 /* errno is ENOMEM. */
 255                 filter->sortbuf_count = sortbuf_count;
 256                 return -1;
 257               }
 258             memcpy (new_sortbuf, filter->sortbuf,
 259                     sortbuf_count * sizeof (struct ucs4_with_ccc));
 260             if (filter->sortbuf != filter->sortbuf_preallocated)
 261               free (filter->sortbuf);
 262             filter->sortbuf = new_sortbuf;
 263             /* Update cache of filter->sortbuf.  */
 264             sortbuf = filter->sortbuf;
 265           }
 266         sortbuf[sortbuf_count].code = uc;
 267         sortbuf[sortbuf_count].ccc = ccc;
 268         sortbuf_count++;
 269       }
 270 
 271     filter->sortbuf_count = sortbuf_count;
 272   }
 273 
 274   return 0;
 275 }
 276 
 277 /* Bring data buffered in the filter to its destination, the encapsulated
 278    stream.
 279    Return 0 if successful, or -1 with errno set upon failure.
 280    Note! If after calling this function, additional characters are written
 281    into the filter, the resulting character sequence in the encapsulated stream
 282    will not necessarily be normalized.  */
 283 int
 284 uninorm_filter_flush (struct uninorm_filter *filter)
     /* [previous][next][first][last][top][bottom][index][help] */
 285 {
 286   /* Cache sortbuf and sortbuf_count in local register variables.  */
 287   struct ucs4_with_ccc * const sortbuf = filter->sortbuf;
 288   size_t sortbuf_count = filter->sortbuf_count;
 289   size_t j;
 290 
 291   /* Apply the canonical ordering algorithm to the accumulated
 292      sequence of characters.  */
 293   if (sortbuf_count > 1)
 294     gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
 295                                              sortbuf + sortbuf_count);
 296 
 297   if (filter->composer != NULL)
 298     {
 299       /* Attempt to combine decomposed characters, as specified
 300          in the Unicode Standard Annex #15 "Unicode Normalization
 301          Forms".  We need to check
 302            1. whether the first accumulated character is a
 303               "starter" (i.e. has ccc = 0).  This is usually the
 304               case.  But when the string starts with a
 305               non-starter, the sortbuf also starts with a
 306               non-starter.  Btw, this check could also be
 307               omitted, because the composition table has only
 308               entries (code1, code2) for which code1 is a
 309               starter; if the first accumulated character is not
 310               a starter, no lookup will succeed.
 311            2. If the sortbuf has more than one character, check
 312               for each of these characters that are not "blocked"
 313               from the starter (i.e. have a ccc that is higher
 314               than the ccc of the previous character) whether it
 315               can be combined with the first character.
 316            3. If only one character is left in sortbuf, check
 317               whether it can be combined with the next character
 318               (also a starter).  */
 319       if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
 320         {
 321           for (j = 1; j < sortbuf_count; )
 322             {
 323               if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
 324                 {
 325                   ucs4_t combined =
 326                     filter->composer (sortbuf[0].code, sortbuf[j].code);
 327                   if (combined)
 328                     {
 329                       size_t k;
 330 
 331                       sortbuf[0].code = combined;
 332                       /* sortbuf[0].ccc = 0, still valid.  */
 333                       for (k = j + 1; k < sortbuf_count; k++)
 334                         sortbuf[k - 1] = sortbuf[k];
 335                       sortbuf_count--;
 336                       continue;
 337                     }
 338                 }
 339               j++;
 340             }
 341         }
 342     }
 343 
 344   for (j = 0; j < sortbuf_count; j++)
 345     {
 346       ucs4_t muc = sortbuf[j].code;
 347 
 348       /* Output muc to the encapsulated stream.  */
 349       int ret = filter->stream_func (filter->stream_data, muc);
 350       if (ret < 0)
 351         {
 352           /* errno is set here.  */
 353           filter->sortbuf_count = 0;
 354           return -1;
 355         }
 356     }
 357 
 358   /* sortbuf is now empty.  */
 359   filter->sortbuf_count = 0;
 360 
 361   return 0;
 362 }
 363 
 364 /* Bring data buffered in the filter to its destination, the encapsulated
 365    stream, then close and free the filter.
 366    Return 0 if successful, or -1 with errno set upon failure.  */
 367 int
 368 uninorm_filter_free (struct uninorm_filter *filter)
     /* [previous][next][first][last][top][bottom][index][help] */
 369 {
 370   int ret = uninorm_filter_flush (filter);
 371 
 372   if (ret < 0)
 373     /* errno is set here.  */
 374     return -1;
 375 
 376   if (filter->sortbuf_count > 0)
 377     abort ();
 378   if (filter->sortbuf != filter->sortbuf_preallocated)
 379     free (filter->sortbuf);
 380   free (filter);
 381 
 382   return 0;
 383 }

/* [previous][next][first][last][top][bottom][index][help] */