root/maint/gnulib/lib/uniwbrk/u-wordbreaks.h

/* [previous][next][first][last][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes the following definitions.
  1. FUNC

   1 /* Word breaks in UTF-8/UTF-16/UTF-32 strings.  -*- coding: utf-8 -*-
   2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4 
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation; either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15 
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21 
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25 
  26 void
  27 FUNC (const UNIT *s, size_t n, char *p)
     /* [previous][next][first][last][top][bottom][index][help] */
     /* Marks the word break positions of the string S, consisting of N units,
        in the array P, of which N bytes are written.
        P is first cleared; then P[i] is set to 1 when a word break is
        recognized before the character that starts at S[i].  Entries that
        correspond to non-initial units of a multi-unit character stay 0, and
        P[0] is never set.  When N is 0, nothing is written.
        NOTE(review): FUNC, UNIT and U_MBTOUC_UNSAFE are not defined in this
        file; presumably the including file defines them once per encoding
        (UTF-8/UTF-16/UTF-32) -- confirm against the includers.  */
  28 {
  29   if (n > 0)
  30     {
  31       const UNIT *s_end = s + n;
  32 
  33       /* Word break property of the last character.
  34          -1 at the very beginning of the string.  */
  35       int last_char_prop = -1;
  36 
  37       /* Format and Extend characters are ignored; this means, the most
  38          commonly used unit is the complex character (= character with
  39          subsequent ignored characters).
  40          Word break property of the last complex character.
  41          -1 at the very beginning of the string.  */
  42       int last_compchar_prop = -1;
           /* Entry of P that belongs to the last complex character; NULL until
              the first complex character has been seen.  */
  43       char *last_compchar_ptr = NULL;
  44 
  45       /* For recognizing rules involving 3 complex characters:
  46          Word break property of the second-to-last complex character.
  47          -1 at the very beginning of the string.  */
  48       int secondlast_compchar_prop = -1;
  49 
           /* Number of consecutive RI complex characters that immediately
              precede the current character; reset to 0 by any other complex
              character.  */
  50       size_t ri_count = 0;
  51 
  52       /* Don't break inside multibyte characters.  */
  53       memset (p, 0, n);
  54 
  55       while (s < s_end)
  56         {
  57           ucs4_t uc;
               /* Decode the next character into UC.  COUNT is used below to
                  advance both S and P, so it is presumably the number of units
                  the character occupies -- TODO confirm.  */
  58           int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
  59           int prop = uc_wordbreak_property (uc);
  60 
  61           /* No break at the start of the string.  */
  62           if (last_char_prop >= 0)
  63             {
                   /* The branches below are tried in order; the first one that
                      matches decides whether *p is set.  */
  64               /* No break between CR and LF (WB3).  */
  65               if (last_char_prop == WBP_CR && prop == WBP_LF)
  66                 /* *p = 0 */;
  67               /* Break before and after newlines (WB3a, WB3b).  */
  68               else if ((last_char_prop == WBP_CR
  69                         || last_char_prop == WBP_LF
  70                         || last_char_prop == WBP_NEWLINE)
  71                        || (prop == WBP_CR
  72                            || prop == WBP_LF
  73                            || prop == WBP_NEWLINE))
  74                 *p = 1;
  75               /* No break within emoji zwj sequence (WB3c).  */
  76               else if (last_char_prop == WBP_ZWJ &&
  77                        (prop == WBP_GAZ || prop == WBP_EBG))
  78                 /* *p = 0 */;
  79               /* Ignore Format, Extend and ZWJ characters: no break before them.  */
  80               else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
  81                 {
  82                   /* No break in these situations (see UAX #29):
  83 
  84                       secondlast          last             current
  85 
  86     (ALetter | HL)   (MidLetter | MidNumLet | SQ) × (ALetter | HL)      (WB7)
  87     (ALetter | HL) × (MidLetter | MidNumLet | SQ)   (ALetter | HL)      (WB6)
  88                   Numeric   (MidNum | MidNumLet | SQ)    × Numeric      (WB11)
  89                   Numeric × (MidNum | MidNumLet | SQ)      Numeric      (WB12)
  90                                                         HL × DQ HL      (WB7b)
  91                                                         HL DQ × HL      (WB7c)
  92                                                 ^ (RI RI)* RI × RI      (WB15)
  93                                             [^RI] (RI RI)* RI × RI      (WB16)
  94                    */
  95                   /* No break across certain punctuation.  Also, disable word
  96                      breaks that were recognized earlier (due to lookahead of
  97                      only one complex character).  */
  98                   if (((prop == WBP_ALETTER
  99                         || prop == WBP_HL)
 100                        && (last_compchar_prop == WBP_MIDLETTER
 101                            || last_compchar_prop == WBP_MIDNUMLET
 102                            || last_compchar_prop == WBP_SQ)
 103                        && (secondlast_compchar_prop == WBP_ALETTER
 104                            || secondlast_compchar_prop == WBP_HL))
 105                       || (prop == WBP_NUMERIC
 106                           && (last_compchar_prop == WBP_MIDNUM
 107                               || last_compchar_prop == WBP_MIDNUMLET
 108                               || last_compchar_prop == WBP_SQ)
 109                           && secondlast_compchar_prop == WBP_NUMERIC)
 110                       || (prop == WBP_HL
 111                           && last_compchar_prop == WBP_DQ
 112                           && secondlast_compchar_prop == WBP_HL))
 113                     {
                           /* Retract the break in front of the middle complex
                              character.  last_compchar_ptr is assigned together
                              with last_compchar_prop, so it is non-NULL here
                              (assuming no WBP_* constant equals -1).  */
 114                       *last_compchar_ptr = 0;
 115                       /* *p = 0; */
 116                     }
 117                   /* Between two RIs (WB15, WB16): break only if an even number
 118                      of consecutive RIs precedes the current one.  */
 119                   else if (last_compchar_prop == WBP_RI && prop == WBP_RI)
 120                     {
 121                       if (ri_count % 2 == 0)
 122                         *p = 1;
 123                       /* else *p = 0 */
 124                     }
 125                   /* Break after Format and Extend character.  */
 126                   else if (last_compchar_prop == WBP_EXTEND
 127                            || last_compchar_prop == WBP_FORMAT)
 128                     *p = 1;
 129                   else
 130                     {
                           /* uniwbrk_prop_index and uniwbrk_table are defined
                              outside this file.  As used here, a negative index
                              means the property has no row/column in the table,
                              and a nonzero table entry means "break".  */
 131                       int last_compchar_index =
 132                         uniwbrk_prop_index[last_compchar_prop];
 133                       int index = uniwbrk_prop_index[prop];
 134 
 135                       /* Break between unknown pair (WB999).  */
 136                       if (last_compchar_index < 0 || index < 0)
 137                         *p = 1;
 138                       /* Perform a single table lookup.  */
 139                       else if (uniwbrk_table[last_compchar_index][index])
 140                         *p = 1;
 141                       /* else *p = 0; */
 142                     }
 143                 }
 144             }
 145 
 146           last_char_prop = prop;
 147 
 148           /* Ignore Format, Extend and ZWJ characters, except at the start of
 149              the string or directly after a CR, LF or Newline complex character:
                  there they become complex characters themselves.  */
 150           if (last_compchar_prop < 0
 151               || last_compchar_prop == WBP_CR
 152               || last_compchar_prop == WBP_LF
 153               || last_compchar_prop == WBP_NEWLINE
 154               || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
 155             {
 156               secondlast_compchar_prop = last_compchar_prop;
 157               last_compchar_prop = prop;
 158               last_compchar_ptr = p;
 159 
                   /* Maintain the length of the current run of RI complex
                      characters.  */
 160               if (prop == WBP_RI)
 161                 ri_count++;
 162               else
 163                 ri_count = 0;
 164             }
 165 
               /* Advance the input and the output in lockstep, so that P always
                  points at the entry for the unit S points at.  */
 166           s += count;
 167           p += count;
 168         }
 169     }
 170 }

/* [previous][next][first][last][top][bottom][index][help] */