1 /* Stream-based normalization of Unicode strings.
2 Copyright (C) 2009-2021 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5 This file is free software.
6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7 You can redistribute it and/or modify it under either
8 - the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version, or
11 - the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option)
13 any later version, or
14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16 This file is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License and the GNU General Public License
20 for more details.
21
22 You should have received a copy of the GNU Lesser General Public
23 License and of the GNU General Public License along with this
24 program. If not, see <https://www.gnu.org/licenses/>. */
25
26 #include <config.h>
27
28 /* Specification. */
29 #include "uninorm.h"
30
31 #include <errno.h>
32 #include <stddef.h>
33 #include <stdlib.h>
34 #include <string.h>
35
36 #include "unictype.h"
37 #include "normalize-internal.h"
38 #include "uninorm/decompose-internal.h"
39
40
41 struct uninorm_filter
42 {
43 /* Characteristics of the normalization form. */
44 int (*decomposer) (ucs4_t uc, ucs4_t *decomposition);
45 ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2);
46
47 /* The encapsulated stream. */
48 int (*stream_func) (void *stream_data, ucs4_t uc);
49 void *stream_data;
50
51 /* The buffer for sorting. */
52 #define SORTBUF_PREALLOCATED 64
53 struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
54 struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
55 size_t sortbuf_allocated;
56 size_t sortbuf_count;
57 };
58
59 struct uninorm_filter *
60 uninorm_filter_create (uninorm_t nf,
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
61 int (*stream_func) (void *stream_data, ucs4_t uc),
62 void *stream_data)
63 {
64 struct uninorm_filter *filter =
65 (struct uninorm_filter *) malloc (sizeof (struct uninorm_filter));
66
67 if (filter == NULL)
68 /* errno is ENOMEM. */
69 return NULL;
70
71 filter->decomposer = nf->decomposer;
72 filter->composer = nf->composer;
73 filter->stream_func = stream_func;
74 filter->stream_data = stream_data;
75 filter->sortbuf = filter->sortbuf_preallocated;
76 filter->sortbuf_allocated = SORTBUF_PREALLOCATED;
77 filter->sortbuf_count = 0;
78
79 return filter;
80 }
81
82 int
83 uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc_arg)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
84 {
85 ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
86 int decomposed_count;
87
88 /* Accept the next character. */
89 decomposed[0] = uc_arg;
90 decomposed_count = 1;
91
92 /* Decompose it, recursively.
93 It would be possible to precompute the recursive decomposition
94 and store it in a table. But this would significantly increase
95 the size of the decomposition tables, because for example for
96 U+1FC1 the recursive canonical decomposition and the recursive
97 compatibility decomposition are different. */
98 {
99 int curr;
100
101 for (curr = 0; curr < decomposed_count; )
102 {
103 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
104 all elements are atomic. */
105 ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
106 int curr_decomposed_count;
107
108 curr_decomposed_count =
109 filter->decomposer (decomposed[curr], curr_decomposed);
110 if (curr_decomposed_count >= 0)
111 {
112 /* Move curr_decomposed[0..curr_decomposed_count-1] over
113 decomposed[curr], making room. It's not worth using
114 memcpy() here, since the counts are so small. */
115 int shift = curr_decomposed_count - 1;
116
117 if (shift < 0)
118 abort ();
119 if (shift > 0)
120 {
121 int j;
122
123 decomposed_count += shift;
124 if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
125 abort ();
126 for (j = decomposed_count - 1 - shift; j > curr; j--)
127 decomposed[j + shift] = decomposed[j];
128 }
129 for (; shift >= 0; shift--)
130 decomposed[curr + shift] = curr_decomposed[shift];
131 }
132 else
133 {
134 /* decomposed[curr] is atomic. */
135 curr++;
136 }
137 }
138 }
139
140 {
141 /* Cache sortbuf and sortbuf_count in local register variables. */
142 struct ucs4_with_ccc *sortbuf = filter->sortbuf;
143 size_t sortbuf_count = filter->sortbuf_count;
144 int i;
145
146 for (i = 0; i < decomposed_count; i++)
147 {
148 /* Fetch the next character from the decomposition. */
149 ucs4_t uc = decomposed[i];
150 int ccc = uc_combining_class (uc);
151
152 if (ccc == 0)
153 {
154 size_t j;
155
156 /* Apply the canonical ordering algorithm to the accumulated
157 sequence of characters. */
158 if (sortbuf_count > 1)
159 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
160 sortbuf + sortbuf_count);
161
162 if (filter->composer != NULL)
163 {
164 /* Attempt to combine decomposed characters, as specified
165 in the Unicode Standard Annex #15 "Unicode Normalization
166 Forms". We need to check
167 1. whether the first accumulated character is a
168 "starter" (i.e. has ccc = 0). This is usually the
169 case. But when the string starts with a
170 non-starter, the sortbuf also starts with a
171 non-starter. Btw, this check could also be
172 omitted, because the composition table has only
173 entries (code1, code2) for which code1 is a
174 starter; if the first accumulated character is not
175 a starter, no lookup will succeed.
176 2. If the sortbuf has more than one character, check
177 for each of these characters that are not "blocked"
178 from the starter (i.e. have a ccc that is higher
179 than the ccc of the previous character) whether it
180 can be combined with the first character.
181 3. If only one character is left in sortbuf, check
182 whether it can be combined with the next character
183 (also a starter). */
184 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
185 {
186 for (j = 1; j < sortbuf_count; )
187 {
188 if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
189 {
190 ucs4_t combined =
191 filter->composer (sortbuf[0].code, sortbuf[j].code);
192 if (combined)
193 {
194 size_t k;
195
196 sortbuf[0].code = combined;
197 /* sortbuf[0].ccc = 0, still valid. */
198 for (k = j + 1; k < sortbuf_count; k++)
199 sortbuf[k - 1] = sortbuf[k];
200 sortbuf_count--;
201 continue;
202 }
203 }
204 j++;
205 }
206 if (sortbuf_count == 1)
207 {
208 ucs4_t combined =
209 filter->composer (sortbuf[0].code, uc);
210 if (combined)
211 {
212 uc = combined;
213 ccc = 0;
214 /* uc could be further combined with subsequent
215 characters. So don't put it into sortbuf[0] in
216 this round, only in the next round. */
217 sortbuf_count = 0;
218 }
219 }
220 }
221 }
222
223 for (j = 0; j < sortbuf_count; j++)
224 {
225 ucs4_t muc = sortbuf[j].code;
226
227 /* Output muc to the encapsulated stream. */
228 int ret = filter->stream_func (filter->stream_data, muc);
229 if (ret < 0)
230 {
231 /* errno is set here. */
232 filter->sortbuf_count = 0;
233 return -1;
234 }
235 }
236
237 /* sortbuf is now empty. */
238 sortbuf_count = 0;
239 }
240
241 /* Append (uc, ccc) to sortbuf. */
242 if (sortbuf_count == filter->sortbuf_allocated)
243 {
244 struct ucs4_with_ccc *new_sortbuf;
245
246 filter->sortbuf_allocated = 2 * filter->sortbuf_allocated;
247 if (filter->sortbuf_allocated < sortbuf_count) /* integer overflow? */
248 abort ();
249 new_sortbuf =
250 (struct ucs4_with_ccc *)
251 malloc (2 * filter->sortbuf_allocated * sizeof (struct ucs4_with_ccc));
252 if (new_sortbuf == NULL)
253 {
254 /* errno is ENOMEM. */
255 filter->sortbuf_count = sortbuf_count;
256 return -1;
257 }
258 memcpy (new_sortbuf, filter->sortbuf,
259 sortbuf_count * sizeof (struct ucs4_with_ccc));
260 if (filter->sortbuf != filter->sortbuf_preallocated)
261 free (filter->sortbuf);
262 filter->sortbuf = new_sortbuf;
263 /* Update cache of filter->sortbuf. */
264 sortbuf = filter->sortbuf;
265 }
266 sortbuf[sortbuf_count].code = uc;
267 sortbuf[sortbuf_count].ccc = ccc;
268 sortbuf_count++;
269 }
270
271 filter->sortbuf_count = sortbuf_count;
272 }
273
274 return 0;
275 }
276
277 /* Bring data buffered in the filter to its destination, the encapsulated
278 stream.
279 Return 0 if successful, or -1 with errno set upon failure.
280 Note! If after calling this function, additional characters are written
281 into the filter, the resulting character sequence in the encapsulated stream
282 will not necessarily be normalized. */
283 int
284 uninorm_filter_flush (struct uninorm_filter *filter)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
285 {
286 /* Cache sortbuf and sortbuf_count in local register variables. */
287 struct ucs4_with_ccc * const sortbuf = filter->sortbuf;
288 size_t sortbuf_count = filter->sortbuf_count;
289 size_t j;
290
291 /* Apply the canonical ordering algorithm to the accumulated
292 sequence of characters. */
293 if (sortbuf_count > 1)
294 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
295 sortbuf + sortbuf_count);
296
297 if (filter->composer != NULL)
298 {
299 /* Attempt to combine decomposed characters, as specified
300 in the Unicode Standard Annex #15 "Unicode Normalization
301 Forms". We need to check
302 1. whether the first accumulated character is a
303 "starter" (i.e. has ccc = 0). This is usually the
304 case. But when the string starts with a
305 non-starter, the sortbuf also starts with a
306 non-starter. Btw, this check could also be
307 omitted, because the composition table has only
308 entries (code1, code2) for which code1 is a
309 starter; if the first accumulated character is not
310 a starter, no lookup will succeed.
311 2. If the sortbuf has more than one character, check
312 for each of these characters that are not "blocked"
313 from the starter (i.e. have a ccc that is higher
314 than the ccc of the previous character) whether it
315 can be combined with the first character.
316 3. If only one character is left in sortbuf, check
317 whether it can be combined with the next character
318 (also a starter). */
319 if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
320 {
321 for (j = 1; j < sortbuf_count; )
322 {
323 if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
324 {
325 ucs4_t combined =
326 filter->composer (sortbuf[0].code, sortbuf[j].code);
327 if (combined)
328 {
329 size_t k;
330
331 sortbuf[0].code = combined;
332 /* sortbuf[0].ccc = 0, still valid. */
333 for (k = j + 1; k < sortbuf_count; k++)
334 sortbuf[k - 1] = sortbuf[k];
335 sortbuf_count--;
336 continue;
337 }
338 }
339 j++;
340 }
341 }
342 }
343
344 for (j = 0; j < sortbuf_count; j++)
345 {
346 ucs4_t muc = sortbuf[j].code;
347
348 /* Output muc to the encapsulated stream. */
349 int ret = filter->stream_func (filter->stream_data, muc);
350 if (ret < 0)
351 {
352 /* errno is set here. */
353 filter->sortbuf_count = 0;
354 return -1;
355 }
356 }
357
358 /* sortbuf is now empty. */
359 filter->sortbuf_count = 0;
360
361 return 0;
362 }
363
364 /* Bring data buffered in the filter to its destination, the encapsulated
365 stream, then close and free the filter.
366 Return 0 if successful, or -1 with errno set upon failure. */
367 int
368 uninorm_filter_free (struct uninorm_filter *filter)
/* ![[previous]](../icons/left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
369 {
370 int ret = uninorm_filter_flush (filter);
371
372 if (ret < 0)
373 /* errno is set here. */
374 return -1;
375
376 if (filter->sortbuf_count > 0)
377 abort ();
378 if (filter->sortbuf != filter->sortbuf_preallocated)
379 free (filter->sortbuf);
380 free (filter);
381
382 return 0;
383 }