root/maint/gnulib/lib/unicase/u-casemap.h

/* [previous][next][first][last][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes the following definitions.
  1. FUNC

   1 /* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
   2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4 
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation; either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15 
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21 
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25 
  26 UNIT *
  27 FUNC (const UNIT *s, size_t n,
     /* [previous][next][first][last][top][bottom][index][help] */
  28       casing_prefix_context_t prefix_context,
  29       casing_suffix_context_t suffix_context,
  30       const char *iso639_language,
  31       ucs4_t (*single_character_map) (ucs4_t),
  32       size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
  33       uninorm_t nf,
  34       UNIT *resultbuf, size_t *lengthp)
  35 {
  36   /* The result being accumulated.  */
  37   UNIT *result;
  38   size_t length;
  39   size_t allocated;
  40 
  41   /* Initialize the accumulator.  */
  42   if (nf != NULL || resultbuf == NULL)
  43     {
  44       result = NULL;
  45       allocated = 0;
  46     }
  47   else
  48     {
  49       result = resultbuf;
  50       allocated = *lengthp;
  51     }
  52   length = 0;
  53 
  54   {
  55     const UNIT *s_end = s + n;
  56 
  57     /* Helper for evaluating the FINAL_SIGMA condition:
  58        Last character that was not case-ignorable.  */
  59     ucs4_t last_char_except_ignorable =
  60       prefix_context.last_char_except_ignorable;
  61 
  62     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
  63        Last character that was of combining class 230 ("Above") or 0.  */
  64     ucs4_t last_char_normal_or_above =
  65       prefix_context.last_char_normal_or_above;
  66 
  67     while (s < s_end)
  68       {
  69         ucs4_t uc;
  70         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
  71 
  72         ucs4_t mapped_uc[3];
  73         unsigned int mapped_count;
  74 
  75         if (uc < 0x10000)
  76           {
  77             /* Look first in the special-casing table.  */
  78             char code[3];
  79 
  80             code[0] = (uc >> 8) & 0xff;
  81             code[1] = uc & 0xff;
  82 
  83             for (code[2] = 0; ; code[2]++)
  84               {
  85                 const struct special_casing_rule *rule =
  86                   gl_unicase_special_lookup (code, 3);
  87 
  88                 if (rule == NULL)
  89                   break;
  90 
  91                 /* Test if the condition applies.  */
  92                 /* Does the language apply?  */
  93                 if (rule->language[0] == '\0'
  94                     || (iso639_language != NULL
  95                         && iso639_language[0] == rule->language[0]
  96                         && iso639_language[1] == rule->language[1]))
  97                   {
  98                     /* Does the context apply?  */
  99                     int context = rule->context;
 100                     bool applies;
 101 
 102                     if (context < 0)
 103                       context = - context;
 104                     switch (context)
 105                       {
 106                       case SCC_ALWAYS:
 107                         applies = true;
 108                         break;
 109 
 110                       case SCC_FINAL_SIGMA:
 111                         /* "Before" condition: preceded by a sequence
 112                            consisting of a cased letter and a case-ignorable
 113                            sequence.
 114                            "After" condition: not followed by a sequence
 115                            consisting of a case-ignorable sequence and then a
 116                            cased letter.  */
 117                         /* Test the "before" condition.  */
 118                         applies = uc_is_cased (last_char_except_ignorable);
 119                         /* Test the "after" condition.  */
 120                         if (applies)
 121                           {
 122                             const UNIT *s2 = s + count;
 123                             for (;;)
 124                               {
 125                                 if (s2 < s_end)
 126                                   {
 127                                     ucs4_t uc2;
 128                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 129                                     /* Our uc_is_case_ignorable function is
 130                                        known to return false for all cased
 131                                        characters.  So we can call
 132                                        uc_is_case_ignorable first.  */
 133                                     if (!uc_is_case_ignorable (uc2))
 134                                       {
 135                                         applies = ! uc_is_cased (uc2);
 136                                         break;
 137                                       }
 138                                     s2 += count2;
 139                                   }
 140                                 else
 141                                   {
 142                                     applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
 143                                     break;
 144                                   }
 145                               }
 146                           }
 147                         break;
 148 
 149                       case SCC_AFTER_SOFT_DOTTED:
 150                         /* "Before" condition: There is a Soft_Dotted character
 151                            before it, with no intervening character of
 152                            combining class 0 or 230 (Above).  */
 153                         /* Test the "before" condition.  */
 154                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
 155                         break;
 156 
 157                       case SCC_MORE_ABOVE:
 158                         /* "After" condition: followed by a character of
 159                            combining class 230 (Above) with no intervening
 160                            character of combining class 0 or 230 (Above).  */
 161                         /* Test the "after" condition.  */
 162                         {
 163                           const UNIT *s2 = s + count;
 164                           applies = false;
 165                           for (;;)
 166                             {
 167                               if (s2 < s_end)
 168                                 {
 169                                   ucs4_t uc2;
 170                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 171                                   int ccc = uc_combining_class (uc2);
 172                                   if (ccc == UC_CCC_A)
 173                                     {
 174                                       applies = true;
 175                                       break;
 176                                     }
 177                                   if (ccc == UC_CCC_NR)
 178                                     break;
 179                                   s2 += count2;
 180                                 }
 181                               else
 182                                 {
 183                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
 184                                   break;
 185                                 }
 186                             }
 187                         }
 188                         break;
 189 
 190                       case SCC_BEFORE_DOT:
 191                         /* "After" condition: followed by COMBINING DOT ABOVE
 192                            (U+0307). Any sequence of characters with a
 193                            combining class that is neither 0 nor 230 may
 194                            intervene between the current character and the
 195                            combining dot above.  */
 196                         /* Test the "after" condition.  */
 197                         {
 198                           const UNIT *s2 = s + count;
 199                           applies = false;
 200                           for (;;)
 201                             {
 202                               if (s2 < s_end)
 203                                 {
 204                                   ucs4_t uc2;
 205                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 206                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
 207                                     {
 208                                       applies = true;
 209                                       break;
 210                                     }
 211                                   {
 212                                     int ccc = uc_combining_class (uc2);
 213                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
 214                                       break;
 215                                   }
 216                                   s2 += count2;
 217                                 }
 218                               else
 219                                 {
 220                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
 221                                   break;
 222                                 }
 223                             }
 224                         }
 225                         break;
 226 
 227                       case SCC_AFTER_I:
 228                         /* "Before" condition: There is an uppercase I before
 229                            it, and there is no intervening character of
 230                            combining class 0 or 230 (Above).  */
 231                         /* Test the "before" condition.  */
 232                         applies = (last_char_normal_or_above == 'I');
 233                         break;
 234 
 235                       default:
 236                         abort ();
 237                       }
 238                     if (rule->context < 0)
 239                       applies = !applies;
 240 
 241                     if (applies)
 242                       {
 243                         /* The rule applies.
 244                            Look up the mapping (0 to 3 characters).  */
 245                         const unsigned short *mapped_in_rule =
 246                           (const unsigned short *)((const char *)rule + offset_in_rule);
 247 
 248                         if (mapped_in_rule[0] == 0)
 249                           mapped_count = 0;
 250                         else
 251                           {
 252                             mapped_uc[0] = mapped_in_rule[0];
 253                             if (mapped_in_rule[1] == 0)
 254                               mapped_count = 1;
 255                             else
 256                               {
 257                                 mapped_uc[1] = mapped_in_rule[1];
 258                                 if (mapped_in_rule[2] == 0)
 259                                   mapped_count = 2;
 260                                 else
 261                                   {
 262                                     mapped_uc[2] = mapped_in_rule[2];
 263                                     mapped_count = 3;
 264                                   }
 265                               }
 266                           }
 267                         goto found_mapping;
 268                       }
 269                   }
 270 
 271                 /* Optimization: Save a hash table lookup in the next round.  */
 272                 if (!rule->has_next)
 273                   break;
 274               }
 275           }
 276 
 277         /* No special-cased mapping.  So use the locale and context independent
 278            mapping.  */
 279         mapped_uc[0] = single_character_map (uc);
 280         mapped_count = 1;
 281 
 282        found_mapping:
 283         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
 284         {
 285           unsigned int i;
 286 
 287           for (i = 0; i < mapped_count; i++)
 288             {
 289               ucs4_t muc = mapped_uc[i];
 290 
 291               /* Append muc to the result accumulator.  */
 292               if (length < allocated)
 293                 {
 294                   int ret = U_UCTOMB (result + length, muc, allocated - length);
 295                   if (ret == -1)
 296                     {
 297                       errno = EINVAL;
 298                       goto fail;
 299                     }
 300                   if (ret >= 0)
 301                     {
 302                       length += ret;
 303                       goto done_appending;
 304                     }
 305                 }
 306               {
 307                 size_t old_allocated = allocated;
 308                 size_t new_allocated = 2 * old_allocated;
 309                 if (new_allocated < 64)
 310                   new_allocated = 64;
 311                 if (new_allocated < old_allocated) /* integer overflow? */
 312                   abort ();
 313                 {
 314                   UNIT *larger_result;
 315                   if (result == NULL)
 316                     {
 317                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
 318                       if (larger_result == NULL)
 319                         {
 320                           errno = ENOMEM;
 321                           goto fail;
 322                         }
 323                     }
 324                   else if (result == resultbuf)
 325                     {
 326                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
 327                       if (larger_result == NULL)
 328                         {
 329                           errno = ENOMEM;
 330                           goto fail;
 331                         }
 332                       U_CPY (larger_result, resultbuf, length);
 333                     }
 334                   else
 335                     {
 336                       larger_result =
 337                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
 338                       if (larger_result == NULL)
 339                         {
 340                           errno = ENOMEM;
 341                           goto fail;
 342                         }
 343                     }
 344                   result = larger_result;
 345                   allocated = new_allocated;
 346                   {
 347                     int ret = U_UCTOMB (result + length, muc, allocated - length);
 348                     if (ret == -1)
 349                       {
 350                         errno = EINVAL;
 351                         goto fail;
 352                       }
 353                     if (ret < 0)
 354                       abort ();
 355                     length += ret;
 356                     goto done_appending;
 357                   }
 358                 }
 359               }
 360              done_appending: ;
 361             }
 362         }
 363 
 364         if (!uc_is_case_ignorable (uc))
 365           last_char_except_ignorable = uc;
 366 
 367         {
 368           int ccc = uc_combining_class (uc);
 369           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
 370             last_char_normal_or_above = uc;
 371         }
 372 
 373         s += count;
 374       }
 375   }
 376 
 377   if (nf != NULL)
 378     {
 379       /* Finally, normalize the result.  */
 380       UNIT *normalized_result;
 381 
 382       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
 383       if (normalized_result == NULL)
 384         goto fail;
 385 
 386       free (result);
 387       return normalized_result;
 388     }
 389 
 390   if (length == 0)
 391     {
 392       if (result == NULL)
 393         {
 394           /* Return a non-NULL value.  NULL means error.  */
 395           result = (UNIT *) malloc (1);
 396           if (result == NULL)
 397             {
 398               errno = ENOMEM;
 399               goto fail;
 400             }
 401         }
 402     }
 403   else if (result != resultbuf && length < allocated)
 404     {
 405       /* Shrink the allocated memory if possible.  */
 406       UNIT *memory;
 407 
 408       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
 409       if (memory != NULL)
 410         result = memory;
 411     }
 412 
 413   *lengthp = length;
 414   return result;
 415 
 416  fail:
 417   if (result != resultbuf)
 418     {
 419       int saved_errno = errno;
 420       free (result);
 421       errno = saved_errno;
 422     }
 423   return NULL;
 424 }

/* [previous][next][first][last][top][bottom][index][help] */