/* Line breaking of UTF-16 strings.
   Copyright (C) 2001-2003, 2006-2021 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation; either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */
25
26 #include <config.h>
27
28 /* Specification. */
29 #include "unilbrk.h"
30
31 #include <stdlib.h>
32 #include <string.h>
33
34 #include "unilbrk/lbrktables.h"
35 #include "uniwidth/cjk.h"
36 #include "unistr.h"
37
38 void
39 u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char *p)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
40 {
41 if (n > 0)
42 {
43 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
44 const uint16_t *s_end = s + n;
45 int last_prop = LBP_BK; /* line break property of last non-space character */
46 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
47 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
48
49 /* Don't break inside multibyte characters. */
50 memset (p, UC_BREAK_PROHIBITED, n);
51
52 do
53 {
54 ucs4_t uc;
55 int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
56 int prop = unilbrkprop_lookup (uc);
57
58 if (prop == LBP_BK)
59 {
60 /* Mandatory break. */
61 *p = UC_BREAK_MANDATORY;
62 last_prop = LBP_BK;
63 seen_space = NULL;
64 seen_space2 = NULL;
65 }
66 else
67 {
68 char *q;
69
70 /* Resolve property values whose behaviour is not fixed. */
71 switch (prop)
72 {
73 case LBP_AI:
74 /* Resolve ambiguous. */
75 prop = LBP_AI_REPLACEMENT;
76 break;
77 case LBP_CB:
78 /* This is arbitrary. */
79 prop = LBP_ID;
80 break;
81 case LBP_SA:
82 /* We don't handle complex scripts yet.
83 Treat LBP_SA like LBP_XX. */
84 case LBP_XX:
85 /* This is arbitrary. */
86 prop = LBP_AL;
87 break;
88 }
89
90 /* Deal with spaces and combining characters. */
91 q = p;
92 if (prop == LBP_SP)
93 {
94 /* Don't break just before a space. */
95 *p = UC_BREAK_PROHIBITED;
96 seen_space2 = seen_space;
97 seen_space = p;
98 }
99 else if (prop == LBP_ZW)
100 {
101 /* Don't break just before a zero-width space. */
102 *p = UC_BREAK_PROHIBITED;
103 last_prop = LBP_ZW;
104 seen_space = NULL;
105 seen_space2 = NULL;
106 }
107 else if (prop == LBP_CM)
108 {
109 /* Don't break just before a combining character, except immediately
110 after a zero-width space. */
111 if (last_prop == LBP_ZW)
112 {
113 /* Break after zero-width space. */
114 *p = UC_BREAK_POSSIBLE;
115 /* A combining character turns a preceding space into LBP_ID. */
116 last_prop = LBP_ID;
117 }
118 else
119 {
120 *p = UC_BREAK_PROHIBITED;
121 /* A combining character turns a preceding space into LBP_ID. */
122 if (seen_space != NULL)
123 {
124 q = seen_space;
125 seen_space = seen_space2;
126 prop = LBP_ID;
127 goto lookup_via_table;
128 }
129 }
130 }
131 else
132 {
133 lookup_via_table:
134 /* prop must be usable as an index for table 7.3 of UTR #14. */
135 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
136 abort ();
137
138 if (last_prop == LBP_BK)
139 {
140 /* Don't break at the beginning of a line. */
141 *q = UC_BREAK_PROHIBITED;
142 }
143 else if (last_prop == LBP_ZW)
144 {
145 /* Break after zero-width space. */
146 *q = UC_BREAK_POSSIBLE;
147 }
148 else
149 {
150 switch (unilbrk_table [last_prop] [prop])
151 {
152 case D:
153 *q = UC_BREAK_POSSIBLE;
154 break;
155 case I:
156 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
157 break;
158 case P:
159 *q = UC_BREAK_PROHIBITED;
160 break;
161 default:
162 abort ();
163 }
164 }
165 last_prop = prop;
166 seen_space = NULL;
167 seen_space2 = NULL;
168 }
169 }
170
171 s += count;
172 p += count;
173 }
174 while (s < s_end);
175 }
176 }