1 /* Line breaking of UTF-32 strings.
2 Copyright (C) 2001-2003, 2006-2021 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5 This file is free software.
6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7 You can redistribute it and/or modify it under either
8 - the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version, or
11 - the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option)
13 any later version, or
14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16 This file is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License and the GNU General Public License
20 for more details.
21
22 You should have received a copy of the GNU Lesser General Public
23 License and of the GNU General Public License along with this
24 program. If not, see <https://www.gnu.org/licenses/>. */
25
26 #include <config.h>
27
28 /* Specification. */
29 #include "unilbrk.h"
30
31 #include <stdlib.h>
32
33 #include "unilbrk/lbrktables.h"
34 #include "uniwidth/cjk.h"
35
36 void
37 u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char *p)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
38 {
39 if (n > 0)
40 {
41 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
42 const uint32_t *s_end = s + n;
43 int last_prop = LBP_BK; /* line break property of last non-space character */
44 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
45 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
46
47 do
48 {
49 ucs4_t uc = *s;
50 int prop = unilbrkprop_lookup (uc);
51
52 if (prop == LBP_BK)
53 {
54 /* Mandatory break. */
55 *p = UC_BREAK_MANDATORY;
56 last_prop = LBP_BK;
57 seen_space = NULL;
58 seen_space2 = NULL;
59 }
60 else
61 {
62 char *q;
63
64 /* Resolve property values whose behaviour is not fixed. */
65 switch (prop)
66 {
67 case LBP_AI:
68 /* Resolve ambiguous. */
69 prop = LBP_AI_REPLACEMENT;
70 break;
71 case LBP_CB:
72 /* This is arbitrary. */
73 prop = LBP_ID;
74 break;
75 case LBP_SA:
76 /* We don't handle complex scripts yet.
77 Treat LBP_SA like LBP_XX. */
78 case LBP_XX:
79 /* This is arbitrary. */
80 prop = LBP_AL;
81 break;
82 }
83
84 /* Deal with spaces and combining characters. */
85 q = p;
86 if (prop == LBP_SP)
87 {
88 /* Don't break just before a space. */
89 *p = UC_BREAK_PROHIBITED;
90 seen_space2 = seen_space;
91 seen_space = p;
92 }
93 else if (prop == LBP_ZW)
94 {
95 /* Don't break just before a zero-width space. */
96 *p = UC_BREAK_PROHIBITED;
97 last_prop = LBP_ZW;
98 seen_space = NULL;
99 seen_space2 = NULL;
100 }
101 else if (prop == LBP_CM)
102 {
103 /* Don't break just before a combining character, except immediately
104 after a zero-width space. */
105 if (last_prop == LBP_ZW)
106 {
107 /* Break after zero-width space. */
108 *p = UC_BREAK_POSSIBLE;
109 /* A combining character turns a preceding space into LBP_ID. */
110 last_prop = LBP_ID;
111 }
112 else
113 {
114 *p = UC_BREAK_PROHIBITED;
115 /* A combining character turns a preceding space into LBP_ID. */
116 if (seen_space != NULL)
117 {
118 q = seen_space;
119 seen_space = seen_space2;
120 prop = LBP_ID;
121 goto lookup_via_table;
122 }
123 }
124 }
125 else
126 {
127 lookup_via_table:
128 /* prop must be usable as an index for table 7.3 of UTR #14. */
129 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
130 abort ();
131
132 if (last_prop == LBP_BK)
133 {
134 /* Don't break at the beginning of a line. */
135 *q = UC_BREAK_PROHIBITED;
136 }
137 else if (last_prop == LBP_ZW)
138 {
139 /* Break after zero-width space. */
140 *q = UC_BREAK_POSSIBLE;
141 }
142 else
143 {
144 switch (unilbrk_table [last_prop] [prop])
145 {
146 case D:
147 *q = UC_BREAK_POSSIBLE;
148 break;
149 case I:
150 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
151 break;
152 case P:
153 *q = UC_BREAK_PROHIBITED;
154 break;
155 default:
156 abort ();
157 }
158 }
159 last_prop = prop;
160 seen_space = NULL;
161 seen_space2 = NULL;
162 }
163 }
164
165 s++;
166 p++;
167 }
168 while (s < s_end);
169 }
170 }