root/maint/gnulib/lib/unicase/u-ct-totitle.h

/* [previous][next][first][last][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes the following definitions.
  1. FUNC

   1 /* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
   2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4 
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation; either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15 
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21 
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25 
  26 /* Quoting the Unicode standard, section "Default Case Algorithms":
  27      Find the word boundaries in X according to Unicode Standard Annex #29,
  28      “Text Boundaries.” For each word boundary, find the first cased character
  29      F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
  30      then map all characters C between F and the following word boundary to
  31      Lowercase_Mapping(C).  */
  32 
  /* FUNC: produce a case-mapped copy of the N units at S.

     The input is segmented with U_WORDBREAKS.  Within each segment, the
     characters up to the first cased one are copied unchanged, the first
     cased character gets its titlecase mapping, and the characters after it
     get their lowercase mapping.  A mapping is taken from the special-casing
     table when one of its rules matches the language and the context;
     otherwise uc_totitle / uc_tolower is used.

     UNIT, FUNC and the U_* names are not defined in this file; presumably the
     including file defines them as macros -- confirm there.

     Parameters:
       S, N             Input units.  N counts UNITs, not characters.
       PREFIX_CONTEXT   Seeds the two look-behind helpers
                        (last_char_except_ignorable and
                        last_char_normal_or_above).
       SUFFIX_CONTEXT   Consulted (first_char_except_ignorable, bits) when a
                        look-ahead reaches the end of S.
       ISO639_LANGUAGE  May be NULL.  Only its first two chars are compared
                        with a rule's language; when it is NULL, rules that
                        name a language never match.
       NF               If non-NULL, the mapped text is passed through
                        U_NORMALIZE before being returned.
       RESULTBUF, LENGTHP
                        When NF is NULL and RESULTBUF is non-NULL, *LENGTHP is
                        read as the capacity of RESULTBUF and the result is
                        built there for as long as it fits.

     Returns the result buffer and, when NF is NULL, stores its length in
     *LENGTHP; on the NF path the return value is whatever U_NORMALIZE
     returned.  Returns NULL on failure: errno is set to ENOMEM or EINVAL
     here, or is left as U_NORMALIZE set it when that call fails.  */
  33 UNIT *
  34 FUNC (const UNIT *s, size_t n,
     /* [previous][next][first][last][top][bottom][index][help] */
  35       casing_prefix_context_t prefix_context,
  36       casing_suffix_context_t suffix_context,
  37       const char *iso639_language,
  38       uninorm_t nf,
  39       UNIT *resultbuf, size_t *lengthp)
  40 {
  41   /* The result being accumulated.  */
  42   UNIT *result;
  43   size_t length;
  44   size_t allocated;
  45   /* An array containing the word break positions.  */
  46   char *wordbreaks;
  47 
  48   /* Initialize the accumulator.  */
       /* When normalizing, RESULTBUF is handed to U_NORMALIZE further down, so
          the intermediate text is accumulated in heap memory instead.  */
  49   if (nf != NULL || resultbuf == NULL)
  50     {
  51       result = NULL;
  52       allocated = 0;
  53     }
  54   else
  55     {
  56       result = resultbuf;
  57       allocated = *lengthp;
  58     }
  59   length = 0;
  60 
  61   /* Initialize the word breaks array.  */
       /* One entry per input unit.  For empty input nothing is allocated and
          the main loop below never dereferences the NULL pointer.  */
  62   if (n > 0)
  63     {
  64       wordbreaks = (char *) malloc (n);
  65       if (wordbreaks == NULL)
  66         {
  67           errno = ENOMEM;
  68           goto fail2;
  69         }
  70       U_WORDBREAKS (s, n, wordbreaks);
  71     }
  72   else
  73     wordbreaks = NULL;
  74 
  75   {
  76     const UNIT *s_end = s + n;
         /* wp is advanced in lockstep with s, so *wp is always the word break
            entry for the first unit of the current character.  */
  77     const char *wp = wordbreaks;
  78 
  79     /* When considering the string as segmented by word boundaries: For each
  80        such segment:
  81         - In the first part, we are searching for the first cased character.
  82           In this state, in_word_first_part = true, and no conversion takes
  83           place.
  84         - In the second part, we are converting every character: the first
  85           among these characters to title case, the other ones to lower case.
  86           In this state, in_word_first_part = false.  */
  87     bool in_word_first_part = true;
  88 
  89     /* Helper for evaluating the FINAL_SIGMA condition:
  90        Last character that was not case-ignorable.  */
  91     ucs4_t last_char_except_ignorable =
  92       prefix_context.last_char_except_ignorable;
  93 
  94     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
  95        Last character that was of combining class 230 ("Above") or 0.  */
  96     ucs4_t last_char_normal_or_above =
  97       prefix_context.last_char_normal_or_above;
  98 
  99     while (s < s_end)
 100       {
 101         /* Fetch the next character.  */
 102         ucs4_t uc;
 103         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
 104 
             /* NULL means "copy uc unchanged"; otherwise the fallback mapping
                used when no special-casing rule applies.  */
 105         ucs4_t (*single_character_map) (ucs4_t);
 106         size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
 107 
 108         ucs4_t mapped_uc[3];
 109         unsigned int mapped_count;
 110 
 111         if (*wp)
 112           /* Crossing a word boundary.  */
 113           in_word_first_part = true;
 114 
 115         /* Determine single_character_map, offset_in_rule.
 116            There are three possibilities:
 117              - uc should not be converted.
 118              - uc should be titlecased.
 119              - uc should be lowercased.  */
 120         if (in_word_first_part)
 121           {
 122             if (uc_is_cased (uc))
 123               {
 124                 /* uc is to be titlecased.  */
 125                 single_character_map = uc_totitle;
 126                 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
 127                 in_word_first_part = false;
 128               }
 129             else
 130               {
 131                 /* uc is not converted.  */
 132                 single_character_map = NULL;
                     /* Never read: the NULL map jumps to found_mapping before
                        any rule is looked up.  */
 133                 offset_in_rule = 0;
 134               }
 135           }
 136         else
 137           {
 138             /* uc is to be lowercased.  */
 139             single_character_map = uc_tolower;
 140             offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
 141           }
 142 
 143         /* Actually map uc.  */
 144         if (single_character_map == NULL)
 145           {
 146             mapped_uc[0] = uc;
 147             mapped_count = 1;
 148             goto found_mapping;
 149           }
 150 
             /* The lookup key holds only two bytes of the code point, so the
                table is consulted only for code points below 0x10000.  */
 151         if (uc < 0x10000)
 152           {
 153             /* Look first in the special-casing table.  */
 154             char code[3];
 155 
 156             code[0] = (uc >> 8) & 0xff;
 157             code[1] = uc & 0xff;
 158 
                 /* code[2] numbers the successive rules for the same character.
                    The loop ends when the lookup finds nothing, when a rule has
                    no successor, or by jumping to found_mapping.  */
 159             for (code[2] = 0; ; code[2]++)
 160               {
 161                 const struct special_casing_rule *rule =
 162                   gl_unicase_special_lookup (code, 3);
 163 
 164                 if (rule == NULL)
 165                   break;
 166 
 167                 /* Test if the condition applies.  */
 168                 /* Does the language apply?  */
                     /* A rule with an empty language applies to every language;
                        otherwise the first two chars must match.  */
 169                 if (rule->language[0] == '\0'
 170                     || (iso639_language != NULL
 171                         && iso639_language[0] == rule->language[0]
 172                         && iso639_language[1] == rule->language[1]))
 173                   {
 174                     /* Does the context apply?  */
 175                     int context = rule->context;
 176                     bool applies;
 177 
                         /* A negative context denotes the negated condition:
                            evaluate its absolute value here and invert the
                            outcome after the switch.  */
 178                     if (context < 0)
 179                       context = - context;
 180                     switch (context)
 181                       {
 182                       case SCC_ALWAYS:
 183                         applies = true;
 184                         break;
 185 
 186                       case SCC_FINAL_SIGMA:
 187                         /* "Before" condition: preceded by a sequence
 188                            consisting of a cased letter and a case-ignorable
 189                            sequence.
 190                            "After" condition: not followed by a sequence
 191                            consisting of a case-ignorable sequence and then a
 192                            cased letter.  */
 193                         /* Test the "before" condition.  */
 194                         applies = uc_is_cased (last_char_except_ignorable);
 195                         /* Test the "after" condition.  */
 196                         if (applies)
 197                           {
 198                             const UNIT *s2 = s + count;
 199                             for (;;)
 200                               {
 201                                 if (s2 < s_end)
 202                                   {
 203                                     ucs4_t uc2;
 204                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 205                                     /* Our uc_is_case_ignorable function is
 206                                        known to return false for all cased
 207                                        characters.  So we can call
 208                                        uc_is_case_ignorable first.  */
 209                                     if (!uc_is_case_ignorable (uc2))
 210                                       {
 211                                         applies = ! uc_is_cased (uc2);
 212                                         break;
 213                                       }
 214                                     s2 += count2;
 215                                   }
 216                                 else
 217                                   {
                                         /* Ran off the end of S: let the suffix
                                            context decide.  */
 218                                     applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
 219                                     break;
 220                                   }
 221                               }
 222                           }
 223                         break;
 224 
 225                       case SCC_AFTER_SOFT_DOTTED:
 226                         /* "Before" condition: There is a Soft_Dotted character
 227                            before it, with no intervening character of
 228                            combining class 0 or 230 (Above).  */
 229                         /* Test the "before" condition.  */
 230                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
 231                         break;
 232 
 233                       case SCC_MORE_ABOVE:
 234                         /* "After" condition: followed by a character of
 235                            combining class 230 (Above) with no intervening
 236                            character of combining class 0 or 230 (Above).  */
 237                         /* Test the "after" condition.  */
 238                         {
 239                           const UNIT *s2 = s + count;
 240                           applies = false;
 241                           for (;;)
 242                             {
 243                               if (s2 < s_end)
 244                                 {
 245                                   ucs4_t uc2;
 246                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 247                                   int ccc = uc_combining_class (uc2);
 248                                   if (ccc == UC_CCC_A)
 249                                     {
 250                                       applies = true;
 251                                       break;
 252                                     }
 253                                   if (ccc == UC_CCC_NR)
 254                                     break;
 255                                   s2 += count2;
 256                                 }
 257                               else
 258                                 {
                                       /* Ran off the end of S: let the suffix
                                          context decide.  */
 259                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
 260                                   break;
 261                                 }
 262                             }
 263                         }
 264                         break;
 265 
 266                       case SCC_BEFORE_DOT:
 267                         /* "After" condition: followed by COMBINING DOT ABOVE
 268                            (U+0307). Any sequence of characters with a
 269                            combining class that is neither 0 nor 230 may
 270                            intervene between the current character and the
 271                            combining dot above.  */
 272                         /* Test the "after" condition.  */
 273                         {
 274                           const UNIT *s2 = s + count;
 275                           applies = false;
 276                           for (;;)
 277                             {
 278                               if (s2 < s_end)
 279                                 {
 280                                   ucs4_t uc2;
 281                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
 282                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
 283                                     {
 284                                       applies = true;
 285                                       break;
 286                                     }
 287                                   {
 288                                     int ccc = uc_combining_class (uc2);
 289                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
 290                                       break;
 291                                   }
 292                                   s2 += count2;
 293                                 }
 294                               else
 295                                 {
                                       /* Ran off the end of S: let the suffix
                                          context decide.  */
 296                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
 297                                   break;
 298                                 }
 299                             }
 300                         }
 301                         break;
 302 
 303                       case SCC_AFTER_I:
 304                         /* "Before" condition: There is an uppercase I before
 305                            it, and there is no intervening character of
 306                            combining class 0 or 230 (Above).  */
 307                         /* Test the "before" condition.  */
 308                         applies = (last_char_normal_or_above == 'I');
 309                         break;
 310 
 311                       default:
 312                         abort ();
 313                       }
 314                     if (rule->context < 0)
 315                       applies = !applies;
 316 
 317                     if (applies)
 318                       {
 319                         /* The rule applies.
 320                            Look up the mapping (0 to 3 characters).  */
                             /* offset_in_rule selects the rule's title[] or
                                lower[] array; a zero entry ends the mapping, so
                                a leading zero maps uc to nothing.  */
 321                         const unsigned short *mapped_in_rule =
 322                           (const unsigned short *)((const char *)rule + offset_in_rule);
 323 
 324                         if (mapped_in_rule[0] == 0)
 325                           mapped_count = 0;
 326                         else
 327                           {
 328                             mapped_uc[0] = mapped_in_rule[0];
 329                             if (mapped_in_rule[1] == 0)
 330                               mapped_count = 1;
 331                             else
 332                               {
 333                                 mapped_uc[1] = mapped_in_rule[1];
 334                                 if (mapped_in_rule[2] == 0)
 335                                   mapped_count = 2;
 336                                 else
 337                                   {
 338                                     mapped_uc[2] = mapped_in_rule[2];
 339                                     mapped_count = 3;
 340                                   }
 341                               }
 342                           }
 343                         goto found_mapping;
 344                       }
 345                   }
 346 
 347                 /* Optimization: Save a hash table lookup in the next round.  */
 348                 if (!rule->has_next)
 349                   break;
 350               }
 351           }
 352 
 353         /* No special-cased mapping.  So use the locale and context independent
 354            mapping.  */
 355         mapped_uc[0] = single_character_map (uc);
 356         mapped_count = 1;
 357 
 358        found_mapping:
 359         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
 360         {
 361           unsigned int i;
 362 
 363           for (i = 0; i < mapped_count; i++)
 364             {
 365               ucs4_t muc = mapped_uc[i];
 366 
 367               /* Append muc to the result accumulator.  */
                   /* First try the space already available.  A return of -1 is
                      reported as EINVAL; any other negative return (presumably
                      "does not fit" -- confirm against U_UCTOMB) falls through
                      to the growth code below.  */
 368               if (length < allocated)
 369                 {
 370                   int ret = U_UCTOMB (result + length, muc, allocated - length);
 371                   if (ret == -1)
 372                     {
 373                       errno = EINVAL;
 374                       goto fail1;
 375                     }
 376                   if (ret >= 0)
 377                     {
 378                       length += ret;
 379                       goto done_appending;
 380                     }
 381                 }
 382               {
                     /* Grow by doubling, to at least 64 units.  */
 383                 size_t old_allocated = allocated;
 384                 size_t new_allocated = 2 * old_allocated;
 385                 if (new_allocated < 64)
 386                   new_allocated = 64;
 387                 if (new_allocated < old_allocated) /* integer overflow? */
 388                   abort ();
 389                 {
 390                   UNIT *larger_result;
 391                   if (result == NULL)
 392                     {
 393                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
 394                       if (larger_result == NULL)
 395                         {
 396                           errno = ENOMEM;
 397                           goto fail1;
 398                         }
 399                     }
 400                   else if (result == resultbuf)
 401                     {
                           /* The caller's buffer must not be passed to realloc:
                              move the text into a fresh heap block.  */
 402                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
 403                       if (larger_result == NULL)
 404                         {
 405                           errno = ENOMEM;
 406                           goto fail1;
 407                         }
 408                       U_CPY (larger_result, resultbuf, length);
 409                     }
 410                   else
 411                     {
                           /* On failure 'result' is left untouched, so the
                              cleanup at fail2 still frees the old block.  */
 412                       larger_result =
 413                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
 414                       if (larger_result == NULL)
 415                         {
 416                           errno = ENOMEM;
 417                           goto fail1;
 418                         }
 419                     }
 420                   result = larger_result;
 421                   allocated = new_allocated;
 422                   {
 423                     int ret = U_UCTOMB (result + length, muc, allocated - length);
 424                     if (ret == -1)
 425                       {
 426                         errno = EINVAL;
 427                         goto fail1;
 428                       }
                         /* After growing, any other negative return is treated
                            as a programming error.  */
 429                     if (ret < 0)
 430                       abort ();
 431                     length += ret;
 432                     goto done_appending;
 433                   }
 434                 }
 435               }
 436              done_appending: ;
 437             }
 438         }
 439 
             /* Update the look-behind helpers with the original character uc,
                not with what it was mapped to.  */
 440         if (!uc_is_case_ignorable (uc))
 441           last_char_except_ignorable = uc;
 442 
 443         {
 444           int ccc = uc_combining_class (uc);
 445           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
 446             last_char_normal_or_above = uc;
 447         }
 448 
 449         s += count;
 450         wp += count;
 451       }
 452   }
 453 
 454   free (wordbreaks);
 455 
 456   if (nf != NULL)
 457     {
 458       /* Finally, normalize the result.  */
           /* RESULTBUF and LENGTHP are passed straight through; this function
              does not store into *LENGTHP on this path.  wordbreaks is already
              freed, hence fail2 rather than fail1 on failure.  */
 459       UNIT *normalized_result;
 460 
 461       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
 462       if (normalized_result == NULL)
 463         goto fail2;
 464 
 465       free (result);
 466       return normalized_result;
 467     }
 468 
 469   if (length == 0)
 470     {
 471       if (result == NULL)
 472         {
 473           /* Return a non-NULL value.  NULL means error.  */
 474           result = (UNIT *) malloc (1);
 475           if (result == NULL)
 476             {
 477               errno = ENOMEM;
 478               goto fail2;
 479             }
 480         }
 481     }
 482   else if (result != resultbuf && length < allocated)
 483     {
 484       /* Shrink the allocated memory if possible.  */
           /* Best effort: if realloc fails, the larger block is returned.  */
 485       UNIT *memory;
 486 
 487       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
 488       if (memory != NULL)
 489         result = memory;
 490     }
 491 
 492   *lengthp = length;
 493   return result;
 494 
       /* fail1: wordbreaks is still allocated.  Free it, keeping errno intact,
          then fall through to fail2.  */
 495  fail1:
 496   {
 497     int saved_errno = errno;
 498     free (wordbreaks);
 499     errno = saved_errno;
 500   }
       /* fail2: release the accumulator unless it is the caller's RESULTBUF,
          again keeping errno intact.  */
 501  fail2:
 502   if (result != resultbuf)
 503     {
 504       int saved_errno = errno;
 505       free (result);
 506       errno = saved_errno;
 507     }
 508   return NULL;
 509 }
 510 
 511 /*
 512  * Local Variables:
 513  * coding: utf-8
 514  * End:
 515  */

/* [previous][next][first][last][top][bottom][index][help] */