/* Line breaking of UTF-8 strings.
   Copyright (C) 2001-2003, 2006-2021 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation; either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */
25
26 #include <config.h>
27
28 /* Specification. */
29 #include "unilbrk.h"
30
31 #include <stdlib.h>
32 #include <string.h>
33
34 #include "unilbrk/lbrktables.h"
35 #include "uniwidth/cjk.h"
36 #include "unistr.h"
37
38 void
39 u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *p)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
40 {
41 if (n > 0)
42 {
43 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
44 const uint8_t *s_end = s + n;
45 int last_prop = LBP_BK; /* line break property of last non-space character */
46 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
47 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
48
49 /* Don't break inside multibyte characters. */
50 memset (p, UC_BREAK_PROHIBITED, n);
51
52 do
53 {
54 ucs4_t uc;
55 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
56 int prop = unilbrkprop_lookup (uc);
57
58 if (prop == LBP_BK)
59 {
60 /* Mandatory break. */
61 *p = UC_BREAK_MANDATORY;
62 last_prop = LBP_BK;
63 seen_space = NULL;
64 seen_space2 = NULL;
65 }
66 else
67 {
68 char *q;
69
70 /* Resolve property values whose behaviour is not fixed. */
71 switch (prop)
72 {
73 case LBP_AI:
74 /* Resolve ambiguous. */
75 prop = LBP_AI_REPLACEMENT;
76 break;
77 case LBP_CB:
78 /* This is arbitrary. */
79 prop = LBP_ID;
80 break;
81 case LBP_SA:
82 /* We don't handle complex scripts yet.
83 Treat LBP_SA like LBP_XX. */
84 case LBP_XX:
85 /* This is arbitrary. */
86 prop = LBP_AL;
87 break;
88 }
89
90 /* Deal with spaces and combining characters. */
91 q = p;
92 if (prop == LBP_SP)
93 {
94 /* Don't break just before a space. */
95 *p = UC_BREAK_PROHIBITED;
96 seen_space2 = seen_space;
97 seen_space = p;
98 }
99 else if (prop == LBP_ZW)
100 {
101 /* Don't break just before a zero-width space. */
102 *p = UC_BREAK_PROHIBITED;
103 last_prop = LBP_ZW;
104 seen_space = NULL;
105 seen_space2 = NULL;
106 }
107 else if (prop == LBP_CM)
108 {
109 /* Don't break just before a combining character, except immediately
110 after a zero-width space. */
111 if (last_prop == LBP_ZW)
112 {
113 /* Break after zero-width space. */
114 *p = UC_BREAK_POSSIBLE;
115 /* A combining character turns a preceding space into LBP_ID. */
116 last_prop = LBP_ID;
117 }
118 else
119 {
120 *p = UC_BREAK_PROHIBITED;
121 /* A combining character turns a preceding space into LBP_ID. */
122 if (seen_space != NULL)
123 {
124 q = seen_space;
125 seen_space = seen_space2;
126 prop = LBP_ID;
127 goto lookup_via_table;
128 }
129 }
130 }
131 else
132 {
133 lookup_via_table:
134 /* prop must be usable as an index for table 7.3 of UTR #14. */
135 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
136 abort ();
137
138 if (last_prop == LBP_BK)
139 {
140 /* Don't break at the beginning of a line. */
141 *q = UC_BREAK_PROHIBITED;
142 }
143 else if (last_prop == LBP_ZW)
144 {
145 /* Break after zero-width space. */
146 *q = UC_BREAK_POSSIBLE;
147 }
148 else
149 {
150 switch (unilbrk_table [last_prop] [prop])
151 {
152 case D:
153 *q = UC_BREAK_POSSIBLE;
154 break;
155 case I:
156 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
157 break;
158 case P:
159 *q = UC_BREAK_PROHIBITED;
160 break;
161 default:
162 abort ();
163 }
164 }
165 last_prop = prop;
166 seen_space = NULL;
167 seen_space2 = NULL;
168 }
169 }
170
171 s += count;
172 p += count;
173 }
174 while (s < s_end);
175 }
176 }
177
178
179 #ifdef TEST
180
181 #include <stdio.h>
182 #include <string.h>
183
/* Read the contents of the input stream STREAM up to end-of-file, and
   return it in a freshly allocated buffer, terminated with a NUL byte.
   The caller is responsible for passing the result to free ().

   The data is stored verbatim; if it contains NUL bytes, the returned
   pointer alone does not tell where the data really ends.

   On a read error or when memory is exhausted, a diagnostic is written to
   stderr and the program exits with status 1.  */
char *
read_file (FILE *stream)
{
  enum { CHUNK_SIZE = 4096 };   /* number of bytes requested per fread */
  char *buf = NULL;
  size_t alloc = 0;             /* allocated size of buf */
  size_t size = 0;              /* number of bytes of buf in use; <= alloc */
  char *result;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more chunk.  */
      if (alloc - size < CHUNK_SIZE)
        {
          size_t needed;
          size_t new_alloc;
          char *new_buf;

          /* Keep size + CHUNK_SIZE + 1 representable in a size_t.  */
          if (size > (size_t) -1 - CHUNK_SIZE - 1)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          needed = size + CHUNK_SIZE;

          /* Grow by 50%.  Should this addition wrap around, the result is
             smaller than NEEDED and gets corrected right below.  */
          new_alloc = alloc + alloc / 2;
          if (new_alloc < needed)
            new_alloc = needed;

          new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, CHUNK_SIZE, stream);
      size += count;

      /* A short read means end-of-file or an error; tell them apart now
         instead of waiting for the next iteration.  */
      if (count < CHUNK_SIZE && ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Resize to the exact length plus room for the terminating NUL.  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
}
231
232 int
233 main (int argc, char * argv[])
/* ![[previous]](../icons/left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
234 {
235 if (argc == 1)
236 {
237 /* Display all the break opportunities in the input string. */
238 char *input = read_file (stdin);
239 int length = strlen (input);
240 char *breaks = malloc (length);
241 int i;
242
243 u8_possible_linebreaks ((uint8_t *) input, length, "UTF-8", breaks);
244
245 for (i = 0; i < length; i++)
246 {
247 switch (breaks[i])
248 {
249 case UC_BREAK_POSSIBLE:
250 /* U+2027 in UTF-8 encoding */
251 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
252 break;
253 case UC_BREAK_MANDATORY:
254 /* U+21B2 (or U+21B5) in UTF-8 encoding */
255 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
256 break;
257 case UC_BREAK_PROHIBITED:
258 break;
259 default:
260 abort ();
261 }
262 putc (input[i], stdout);
263 }
264
265 free (breaks);
266
267 return 0;
268 }
269 else
270 return 1;
271 }
272
273 #endif /* TEST */