/* Word breaks in UTF-8/UTF-16/UTF-32 strings.  -*- coding: utf-8 -*-
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation; either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */

26 void
27 FUNC (const UNIT *s, size_t n, char *p)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
28 {
29 if (n > 0)
30 {
31 const UNIT *s_end = s + n;
32
33 /* Word break property of the last character.
34 -1 at the very beginning of the string. */
35 int last_char_prop = -1;
36
37 /* Format and Extend characters are ignored; this means, the mostly used
38 unit is the complex character (= character with subsequent ignored
39 characters).
40 Word break property of the last complex character.
41 -1 at the very beginning of the string. */
42 int last_compchar_prop = -1;
43 char *last_compchar_ptr = NULL;
44
45 /* For recognizing rules involving 3 complex characters:
46 Word break property of the second-to-last complex character.
47 -1 at the very beginning of the string. */
48 int secondlast_compchar_prop = -1;
49
50 size_t ri_count = 0;
51
52 /* Don't break inside multibyte characters. */
53 memset (p, 0, n);
54
55 while (s < s_end)
56 {
57 ucs4_t uc;
58 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
59 int prop = uc_wordbreak_property (uc);
60
61 /* No break at the start of the string. */
62 if (last_char_prop >= 0)
63 {
64 /* No break between CR and LF (WB3). */
65 if (last_char_prop == WBP_CR && prop == WBP_LF)
66 /* *p = 0 */;
67 /* Break before and after newlines (WB3a, WB3b). */
68 else if ((last_char_prop == WBP_CR
69 || last_char_prop == WBP_LF
70 || last_char_prop == WBP_NEWLINE)
71 || (prop == WBP_CR
72 || prop == WBP_LF
73 || prop == WBP_NEWLINE))
74 *p = 1;
75 /* No break within emoji zwj sequence (WB3c). */
76 else if (last_char_prop == WBP_ZWJ &&
77 (prop == WBP_GAZ || prop == WBP_EBG))
78 /* *p = 0 */;
79 /* Ignore Format and Extend characters. */
80 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
81 {
82 /* No break in these situations (see UAX #29):
83
84 secondlast last current
85
86 (ALetter | HL) (MidLetter | MidNumLet | SQ) × (ALetter | HL) (WB7)
87 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6)
88 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11)
89 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12)
90 HL × DQ HL (WB7b)
91 HL DQ × HL (WB7c)
92 ^ (RI RI)* RI × RI (WB15)
93 [^RI] (RI RI)* RI × RI (WB16)
94 */
95 /* No break across certain punctuation. Also, disable word
96 breaks that were recognized earlier (due to lookahead of
97 only one complex character). */
98 if (((prop == WBP_ALETTER
99 || prop == WBP_HL)
100 && (last_compchar_prop == WBP_MIDLETTER
101 || last_compchar_prop == WBP_MIDNUMLET
102 || last_compchar_prop == WBP_SQ)
103 && (secondlast_compchar_prop == WBP_ALETTER
104 || secondlast_compchar_prop == WBP_HL))
105 || (prop == WBP_NUMERIC
106 && (last_compchar_prop == WBP_MIDNUM
107 || last_compchar_prop == WBP_MIDNUMLET
108 || last_compchar_prop == WBP_SQ)
109 && secondlast_compchar_prop == WBP_NUMERIC)
110 || (prop == WBP_HL
111 && last_compchar_prop == WBP_DQ
112 && secondlast_compchar_prop == WBP_HL))
113 {
114 *last_compchar_ptr = 0;
115 /* *p = 0; */
116 }
117 /* Break before RI, if odd number of RI's are
118 preceding (WB15, WB16). */
119 else if (last_compchar_prop == WBP_RI && prop == WBP_RI)
120 {
121 if (ri_count % 2 == 0)
122 *p = 1;
123 /* else *p = 0 */
124 }
125 /* Break after Format and Extend character. */
126 else if (last_compchar_prop == WBP_EXTEND
127 || last_compchar_prop == WBP_FORMAT)
128 *p = 1;
129 else
130 {
131 int last_compchar_index =
132 uniwbrk_prop_index[last_compchar_prop];
133 int index = uniwbrk_prop_index[prop];
134
135 /* Break between unknown pair (WB999). */
136 if (last_compchar_index < 0 || index < 0)
137 *p = 1;
138 /* Perform a single table lookup. */
139 else if (uniwbrk_table[last_compchar_index][index])
140 *p = 1;
141 /* else *p = 0; */
142 }
143 }
144 }
145
146 last_char_prop = prop;
147
148 /* Ignore Format and Extend characters, except at the
149 start of the line. */
150 if (last_compchar_prop < 0
151 || last_compchar_prop == WBP_CR
152 || last_compchar_prop == WBP_LF
153 || last_compchar_prop == WBP_NEWLINE
154 || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
155 {
156 secondlast_compchar_prop = last_compchar_prop;
157 last_compchar_prop = prop;
158 last_compchar_ptr = p;
159
160 if (prop == WBP_RI)
161 ri_count++;
162 else
163 ri_count = 0;
164 }
165
166 s += count;
167 p += count;
168 }
169 }
170 }