1 /* Word breaks in UTF-8/UTF-16/UTF-32 strings. -*- coding: utf-8 -*- 2 Copyright (C) 2009-2021 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This file is free software. 6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 7 You can redistribute it and/or modify it under either 8 - the terms of the GNU Lesser General Public License as published 9 by the Free Software Foundation; either version 3, or (at your 10 option) any later version, or 11 - the terms of the GNU General Public License as published by the 12 Free Software Foundation; either version 2, or (at your option) 13 any later version, or 14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". 15 16 This file is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 Lesser General Public License and the GNU General Public License 20 for more details. 21 22 You should have received a copy of the GNU Lesser General Public 23 License and of the GNU General Public License along with this 24 program. If not, see <https://www.gnu.org/licenses/>. */ 25 26 void 27 FUNC (const UNIT *s, size_t n, char *p) /* */ 28 { 29 if (n > 0) 30 { 31 const UNIT *s_end = s + n; 32 33 /* Word break property of the last character. 34 -1 at the very beginning of the string. */ 35 int last_char_prop = -1; 36 37 /* Format and Extend characters are ignored; this means, the mostly used 38 unit is the complex character (= character with subsequent ignored 39 characters). 40 Word break property of the last complex character. 41 -1 at the very beginning of the string. */ 42 int last_compchar_prop = -1; 43 char *last_compchar_ptr = NULL; 44 45 /* For recognizing rules involving 3 complex characters: 46 Word break property of the second-to-last complex character. 47 -1 at the very beginning of the string. */ 48 int secondlast_compchar_prop = -1; 49 50 size_t ri_count = 0; 51 52 /* Don't break inside multibyte characters. */ 53 memset (p, 0, n); 54 55 while (s < s_end) 56 { 57 ucs4_t uc; 58 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); 59 int prop = uc_wordbreak_property (uc); 60 61 /* No break at the start of the string. */ 62 if (last_char_prop >= 0) 63 { 64 /* No break between CR and LF (WB3). */ 65 if (last_char_prop == WBP_CR && prop == WBP_LF) 66 /* *p = 0 */; 67 /* Break before and after newlines (WB3a, WB3b). */ 68 else if ((last_char_prop == WBP_CR 69 || last_char_prop == WBP_LF 70 || last_char_prop == WBP_NEWLINE) 71 || (prop == WBP_CR 72 || prop == WBP_LF 73 || prop == WBP_NEWLINE)) 74 *p = 1; 75 /* No break within emoji zwj sequence (WB3c). */ 76 else if (last_char_prop == WBP_ZWJ && 77 (prop == WBP_GAZ || prop == WBP_EBG)) 78 /* *p = 0 */; 79 /* Ignore Format and Extend characters. */ 80 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) 81 { 82 /* No break in these situations (see UAX #29): 83 84 secondlast last current 85 86 (ALetter | HL) (MidLetter | MidNumLet | SQ) × (ALetter | HL) (WB7) 87 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6) 88 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11) 89 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) 90 HL × DQ HL (WB7b) 91 HL DQ × HL (WB7c) 92 ^ (RI RI)* RI × RI (WB15) 93 [^RI] (RI RI)* RI × RI (WB16) 94 */ 95 /* No break across certain punctuation. Also, disable word 96 breaks that were recognized earlier (due to lookahead of 97 only one complex character). */ 98 if (((prop == WBP_ALETTER 99 || prop == WBP_HL) 100 && (last_compchar_prop == WBP_MIDLETTER 101 || last_compchar_prop == WBP_MIDNUMLET 102 || last_compchar_prop == WBP_SQ) 103 && (secondlast_compchar_prop == WBP_ALETTER 104 || secondlast_compchar_prop == WBP_HL)) 105 || (prop == WBP_NUMERIC 106 && (last_compchar_prop == WBP_MIDNUM 107 || last_compchar_prop == WBP_MIDNUMLET 108 || last_compchar_prop == WBP_SQ) 109 && secondlast_compchar_prop == WBP_NUMERIC) 110 || (prop == WBP_HL 111 && last_compchar_prop == WBP_DQ 112 && secondlast_compchar_prop == WBP_HL)) 113 { 114 *last_compchar_ptr = 0; 115 /* *p = 0; */ 116 } 117 /* Break before RI, if odd number of RI's are 118 preceding (WB15, WB16). */ 119 else if (last_compchar_prop == WBP_RI && prop == WBP_RI) 120 { 121 if (ri_count % 2 == 0) 122 *p = 1; 123 /* else *p = 0 */ 124 } 125 /* Break after Format and Extend character. */ 126 else if (last_compchar_prop == WBP_EXTEND 127 || last_compchar_prop == WBP_FORMAT) 128 *p = 1; 129 else 130 { 131 int last_compchar_index = 132 uniwbrk_prop_index[last_compchar_prop]; 133 int index = uniwbrk_prop_index[prop]; 134 135 /* Break between unknown pair (WB999). */ 136 if (last_compchar_index < 0 || index < 0) 137 *p = 1; 138 /* Perform a single table lookup. */ 139 else if (uniwbrk_table[last_compchar_index][index]) 140 *p = 1; 141 /* else *p = 0; */ 142 } 143 } 144 } 145 146 last_char_prop = prop; 147 148 /* Ignore Format and Extend characters, except at the 149 start of the line. */ 150 if (last_compchar_prop < 0 151 || last_compchar_prop == WBP_CR 152 || last_compchar_prop == WBP_LF 153 || last_compchar_prop == WBP_NEWLINE 154 || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) 155 { 156 secondlast_compchar_prop = last_compchar_prop; 157 last_compchar_prop = prop; 158 last_compchar_ptr = p; 159 160 if (prop == WBP_RI) 161 ri_count++; 162 else 163 ri_count = 0; 164 } 165 166 s += count; 167 p += count; 168 } 169 } 170 }