This source file includes the following definitions.
- fill_attribute
- getfield
- fill_attributes
- output_library_license
- output_tests_license
- is_category_L
- is_category_LC
- is_category_Lu
- is_category_Ll
- is_category_Lt
- is_category_Lm
- is_category_Lo
- is_category_M
- is_category_Mn
- is_category_Mc
- is_category_Me
- is_category_N
- is_category_Nd
- is_category_Nl
- is_category_No
- is_category_P
- is_category_Pc
- is_category_Pd
- is_category_Ps
- is_category_Pe
- is_category_Pi
- is_category_Pf
- is_category_Po
- is_category_S
- is_category_Sm
- is_category_Sc
- is_category_Sk
- is_category_So
- is_category_Z
- is_category_Zs
- is_category_Zl
- is_category_Zp
- is_category_C
- is_category_Cc
- is_category_Cf
- is_category_Cs
- is_category_Co
- is_category_Cn
- debug_output_predicate
- output_predicate_test
- output_predicate
- output_categories
- general_category_byname
- output_category
- output_combclass
- bidi_category_byname
- get_bidi_category
- output_bidi_category
- get_decdigit_value
- output_decimal_digit_test
- output_decimal_digit
- get_digit_value
- output_digit_test
- output_digit
- get_numeric_value
- output_numeric_test
- output_numeric
- get_mirror_value
- output_mirror
- is_WBP_MIDNUMLET
- is_WBP_MIDLETTER
- clear_properties
- fill_properties
- fill_property30
- fill_properties30
- is_property_white_space
- is_property_alphabetic
- is_property_other_alphabetic
- is_property_not_a_character
- is_property_default_ignorable_code_point
- is_property_other_default_ignorable_code_point
- is_property_deprecated
- is_property_logical_order_exception
- is_property_variation_selector
- is_property_private_use
- is_property_unassigned_code_value
- is_property_uppercase
- is_property_other_uppercase
- is_property_lowercase
- is_property_other_lowercase
- is_property_titlecase
- is_property_cased
- is_property_case_ignorable
- is_property_changes_when_lowercased
- is_property_changes_when_uppercased
- is_property_changes_when_titlecased
- is_property_changes_when_casefolded
- is_property_changes_when_casemapped
- is_property_soft_dotted
- is_property_id_start
- is_property_other_id_start
- is_property_id_continue
- is_property_other_id_continue
- is_property_xid_start
- is_property_xid_continue
- is_property_pattern_white_space
- is_property_pattern_syntax
- is_property_join_control
- is_property_grapheme_base
- is_property_grapheme_extend
- is_property_other_grapheme_extend
- is_property_grapheme_link
- is_property_bidi_control
- is_property_bidi_left_to_right
- is_property_bidi_hebrew_right_to_left
- is_property_bidi_arabic_right_to_left
- is_property_bidi_european_digit
- is_property_bidi_eur_num_separator
- is_property_bidi_eur_num_terminator
- is_property_bidi_arabic_digit
- is_property_bidi_common_separator
- is_property_bidi_block_separator
- is_property_bidi_segment_separator
- is_property_bidi_whitespace
- is_property_bidi_non_spacing_mark
- is_property_bidi_boundary_neutral
- is_property_bidi_pdf
- is_property_bidi_embedding_or_override
- is_property_bidi_other_neutral
- is_property_hex_digit
- is_property_ascii_hex_digit
- is_property_ideographic
- is_property_unified_ideograph
- is_property_radical
- is_property_ids_binary_operator
- is_property_ids_trinary_operator
- is_property_zero_width
- is_property_space
- is_property_non_break
- is_property_iso_control
- is_property_format_control
- is_property_dash
- is_property_hyphen
- is_property_punctuation
- is_property_line_separator
- is_property_paragraph_separator
- is_property_quotation_mark
- is_property_sentence_terminal
- is_property_terminal_punctuation
- is_property_currency_symbol
- is_property_math
- is_property_other_math
- is_property_paired_punctuation
- is_property_left_of_pair
- is_property_combining
- is_property_non_spacing
- is_property_composite
- is_property_decimal_digit
- is_property_numeric
- is_property_diacritic
- is_property_extender
- is_property_ignorable_control
- output_properties
- fill_arabicshaping
- joining_type_as_c_identifier
- output_joining_type_test
- output_joining_type
- joining_group_as_c_identifier
- output_joining_group_test
- output_joining_group
- fill_scripts
- output_scripts
- output_scripts_byname
- fill_blocks
- block_first_index
- block_last_index
- output_blocks
- is_c_whitespace
- c_ident_category
- is_java_whitespace
- java_ident_category
- output_ident_category
- output_ident_properties
- to_upper
- to_lower
- to_title
- is_upper
- is_lower
- is_alpha
- is_digit
- is_alnum
- is_blank
- is_space
- is_cntrl
- is_xdigit
- is_graph
- is_print
- is_punct
- output_old_ctype
- is_combining
- is_combining_level3
- ucs_symbol
- ucs_symbol_range
- output_charclass
- output_charmap
- output_widthmap
- output_tables
- fill_width
- is_nonspacing
- output_nonspacing_property
- symbolic_width
- output_width_property_test
- get_lbp
- debug_output_lbp
- debug_output_lbrk_tables
- fill_org_lbp
- debug_output_org_lbp
- debug_output_org_lbrk_tables
- output_lbp
- output_lbrk_tables
- get_wbp
- debug_output_wbp
- debug_output_wbrk_tables
- fill_org_wbp
- debug_output_org_wbp
- debug_output_org_wbrk_tables
- output_wbp
- output_wbrk_tables
- output_gbp_test
- output_gbp_table
- fill_org_gbp
- get_decomposition
- output_decomposition
- output_decomposition_tables
- fill_composition_exclusions
- debug_output_composition_tables
- output_composition_tables
- output_simple_mapping_test
- output_simple_mapping
- add_casing_rule
- fill_casing_rules
- fill_casefolding_rules
- to_casefold
- redistribute_casefolding_rules
- compare_casing_rules
- sort_casing_rules
- output_casing_rules
- is_cased
- is_case_ignorable
- output_casing_properties
- main
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 #include <assert.h>
39 #include <stdbool.h>
40 #include <stdint.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <time.h>
45
/* Number of elements of the array A.  A must be an actual array, not a
   pointer.  The argument is parenthesized before it is subscripted so that
   an expression argument cannot change how the macro parses.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
47
48
49
50
51
52
53
/* One record of per-character data, filled in by fill_attribute from the
   fields of one line of the Unicode data file.  String fields that are empty
   on the line are stored as "", empty case mappings as NONE.  */
struct unicode_attribute
{
  /* Character name; NULL as long as no entry has been stored for the code
     point.  */
  const char *name;
  /* Textual fields, in the order in which they occur on the line.  */
  const char *category, *combining, *bidi, *decomposition;
  const char *decdigit, *digit, *numeric;
  /* True when the corresponding field starts with 'Y'.  */
  bool mirrored;
  const char *oldname, *comment;
  /* Case mappings, parsed as hexadecimal numbers.  */
  unsigned int upper, lower, title;
};

/* Value of the upper/lower/title members when the field was empty.  */
#define NONE (~0U)

/* The attribute records, indexed by code point.  */
struct unicode_attribute unicode_attributes[0x110000];
78
79
80 static void
81 fill_attribute (unsigned int i,
82 const char *field1, const char *field2,
83 const char *field3, const char *field4,
84 const char *field5, const char *field6,
85 const char *field7, const char *field8,
86 const char *field9, const char *field10,
87 const char *field11, const char *field12,
88 const char *field13, const char *field14)
89 {
90 struct unicode_attribute * uni;
91
92 if (i >= 0x110000)
93 {
94 fprintf (stderr, "index too large\n");
95 exit (1);
96 }
97 if (strcmp (field2, "Cs") == 0)
98
99 return;
100 uni = &unicode_attributes[i];
101
102 uni->name = strdup (field1);
103 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
104 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
105 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
106 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
107 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
108 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
109 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
110 uni->mirrored = (field9[0] == 'Y');
111 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
112 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
113 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
114 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
115 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
116 }
117
118
/* Capacity of the buffers that receive one field, terminating NUL
   included.  */
#define FIELDLEN 160

/* Read one field from STREAM into BUFFER, stopping at the character DELIM.
   Carriage returns are dropped.  Returns 1 when DELIM was found, in which
   case BUFFER is NUL-terminated, and 0 when getc returned EOF first, in
   which case BUFFER is left unterminated.  Exits with an error message when
   the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* Tolerate data files with CR/LF line endings.  */
      if (c == '\r')
        continue;

      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *out++ = c;
    }

  if (c == EOF)
    return 0;

  *out = '\0';
  return 1;
}
152
153
154
155 static void
156 fill_attributes (const char *unicodedata_filename)
157 {
158 unsigned int i, j;
159 FILE *stream;
160 char field0[FIELDLEN];
161 char field1[FIELDLEN];
162 char field2[FIELDLEN];
163 char field3[FIELDLEN];
164 char field4[FIELDLEN];
165 char field5[FIELDLEN];
166 char field6[FIELDLEN];
167 char field7[FIELDLEN];
168 char field8[FIELDLEN];
169 char field9[FIELDLEN];
170 char field10[FIELDLEN];
171 char field11[FIELDLEN];
172 char field12[FIELDLEN];
173 char field13[FIELDLEN];
174 char field14[FIELDLEN];
175 int lineno = 0;
176
177 for (i = 0; i < 0x110000; i++)
178 unicode_attributes[i].name = NULL;
179
180 stream = fopen (unicodedata_filename, "r");
181 if (stream == NULL)
182 {
183 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
184 exit (1);
185 }
186
187 for (;;)
188 {
189 int n;
190
191 lineno++;
192 n = getfield (stream, field0, ';');
193 n += getfield (stream, field1, ';');
194 n += getfield (stream, field2, ';');
195 n += getfield (stream, field3, ';');
196 n += getfield (stream, field4, ';');
197 n += getfield (stream, field5, ';');
198 n += getfield (stream, field6, ';');
199 n += getfield (stream, field7, ';');
200 n += getfield (stream, field8, ';');
201 n += getfield (stream, field9, ';');
202 n += getfield (stream, field10, ';');
203 n += getfield (stream, field11, ';');
204 n += getfield (stream, field12, ';');
205 n += getfield (stream, field13, ';');
206 n += getfield (stream, field14, '\n');
207 if (n == 0)
208 break;
209 if (n != 15)
210 {
211 fprintf (stderr, "short line in '%s':%d\n",
212 unicodedata_filename, lineno);
213 exit (1);
214 }
215 i = strtoul (field0, NULL, 16);
216 if (field1[0] == '<'
217 && strlen (field1) >= 9
218 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
219 {
220
221 lineno++;
222 n = getfield (stream, field0, ';');
223 n += getfield (stream, field1, ';');
224 n += getfield (stream, field2, ';');
225 n += getfield (stream, field3, ';');
226 n += getfield (stream, field4, ';');
227 n += getfield (stream, field5, ';');
228 n += getfield (stream, field6, ';');
229 n += getfield (stream, field7, ';');
230 n += getfield (stream, field8, ';');
231 n += getfield (stream, field9, ';');
232 n += getfield (stream, field10, ';');
233 n += getfield (stream, field11, ';');
234 n += getfield (stream, field12, ';');
235 n += getfield (stream, field13, ';');
236 n += getfield (stream, field14, '\n');
237 if (n != 15)
238 {
239 fprintf (stderr, "missing end range in '%s':%d\n",
240 unicodedata_filename, lineno);
241 exit (1);
242 }
243 if (!(field1[0] == '<'
244 && strlen (field1) >= 8
245 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
246 {
247 fprintf (stderr, "missing end range in '%s':%d\n",
248 unicodedata_filename, lineno);
249 exit (1);
250 }
251 field1[strlen (field1) - 7] = '\0';
252 j = strtoul (field0, NULL, 16);
253 for (; i <= j; i++)
254 fill_attribute (i, field1+1, field2, field3, field4, field5,
255 field6, field7, field8, field9, field10,
256 field11, field12, field13, field14);
257 }
258 else
259 {
260
261 fill_attribute (i, field1, field2, field3, field4, field5,
262 field6, field7, field8, field9, field10,
263 field11, field12, field13, field14);
264 }
265 }
266
267 if (ferror (stream) || fclose (stream))
268 {
269 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
270 exit (1);
271 }
272 }
273
274
275
276
277
278 static void
279 output_library_license (FILE *stream, bool lgplv2plus)
280 {
281 if (lgplv2plus)
282 {
283
284 fprintf (stream, " This file is free software: you can redistribute it and/or modify\n");
285 fprintf (stream, " it under the terms of the GNU Lesser General Public License as\n");
286 fprintf (stream, " published by the Free Software Foundation; either version 2.1 of the\n");
287 fprintf (stream, " License, or (at your option) any later version.\n");
288 fprintf (stream, "\n");
289 fprintf (stream, " This file is distributed in the hope that it will be useful,\n");
290 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
291 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
292 fprintf (stream, " GNU Lesser General Public License for more details.\n");
293 fprintf (stream, "\n");
294 fprintf (stream, " You should have received a copy of the GNU Lesser General Public License\n");
295 fprintf (stream, " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n");
296 }
297 else
298 {
299
300 fprintf (stream, " This program is free software.\n");
301 fprintf (stream, " It is dual-licensed under \"the GNU LGPLv3+ or the GNU GPLv2+\".\n");
302 fprintf (stream, " You can redistribute it and/or modify it under either\n");
303 fprintf (stream, " - the terms of the GNU Lesser General Public License as published\n");
304 fprintf (stream, " by the Free Software Foundation; either version 3, or (at your\n");
305 fprintf (stream, " option) any later version, or\n");
306 fprintf (stream, " - the terms of the GNU General Public License as published by the\n");
307 fprintf (stream, " Free Software Foundation; either version 2, or (at your option)\n");
308 fprintf (stream, " any later version, or\n");
309 fprintf (stream, " - the same dual license \"the GNU LGPLv3+ or the GNU GPLv2+\".\n");
310 fprintf (stream, "\n");
311 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
312 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
313 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n");
314 fprintf (stream, " Lesser General Public License and the GNU General Public License\n");
315 fprintf (stream, " for more details.\n");
316 fprintf (stream, "\n");
317 fprintf (stream, " You should have received a copy of the GNU Lesser General Public\n");
318 fprintf (stream, " License and of the GNU General Public License along with this\n");
319 fprintf (stream, " program; if not, see <https://www.gnu.org/licenses/>. */\n");
320 }
321 }
322
323
324
/* Write to STREAM the "GPL 3 or later" license notice used for generated
   test files, including the end of the comment that the caller has
   opened.  */
static void
output_tests_license (FILE *stream)
{
  static const char notice[] =
    " This program is free software: you can redistribute it and/or modify\n"
    " it under the terms of the GNU General Public License as published by\n"
    " the Free Software Foundation; either version 3 of the License, or\n"
    " (at your option) any later version.\n"
    "\n"
    " This program is distributed in the hope that it will be useful,\n"
    " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    " GNU General Public License for more details.\n"
    "\n"
    " You should have received a copy of the GNU General Public License\n"
    " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n";

  fputs (notice, stream);
}
342
343
344
345
346
347
348
349 static bool
350 is_category_L (unsigned int ch)
351 {
352 return (unicode_attributes[ch].name != NULL
353 && unicode_attributes[ch].category[0] == 'L');
354 }
355
356 static bool
357 is_category_LC (unsigned int ch)
358 {
359
360 return (unicode_attributes[ch].name != NULL
361 && unicode_attributes[ch].category[0] == 'L'
362 && (unicode_attributes[ch].category[1] == 'u'
363 || unicode_attributes[ch].category[1] == 'l'
364 || unicode_attributes[ch].category[1] == 't'));
365 }
366
367 static bool
368 is_category_Lu (unsigned int ch)
369 {
370 return (unicode_attributes[ch].name != NULL
371 && unicode_attributes[ch].category[0] == 'L'
372 && unicode_attributes[ch].category[1] == 'u');
373 }
374
375 static bool
376 is_category_Ll (unsigned int ch)
377 {
378 return (unicode_attributes[ch].name != NULL
379 && unicode_attributes[ch].category[0] == 'L'
380 && unicode_attributes[ch].category[1] == 'l');
381 }
382
383 static bool
384 is_category_Lt (unsigned int ch)
385 {
386 return (unicode_attributes[ch].name != NULL
387 && unicode_attributes[ch].category[0] == 'L'
388 && unicode_attributes[ch].category[1] == 't');
389 }
390
391 static bool
392 is_category_Lm (unsigned int ch)
393 {
394 return (unicode_attributes[ch].name != NULL
395 && unicode_attributes[ch].category[0] == 'L'
396 && unicode_attributes[ch].category[1] == 'm');
397 }
398
399 static bool
400 is_category_Lo (unsigned int ch)
401 {
402 return (unicode_attributes[ch].name != NULL
403 && unicode_attributes[ch].category[0] == 'L'
404 && unicode_attributes[ch].category[1] == 'o');
405 }
406
407 static bool
408 is_category_M (unsigned int ch)
409 {
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'M');
412 }
413
414 static bool
415 is_category_Mn (unsigned int ch)
416 {
417 return (unicode_attributes[ch].name != NULL
418 && unicode_attributes[ch].category[0] == 'M'
419 && unicode_attributes[ch].category[1] == 'n');
420 }
421
422 static bool
423 is_category_Mc (unsigned int ch)
424 {
425 return (unicode_attributes[ch].name != NULL
426 && unicode_attributes[ch].category[0] == 'M'
427 && unicode_attributes[ch].category[1] == 'c');
428 }
429
430 static bool
431 is_category_Me (unsigned int ch)
432 {
433 return (unicode_attributes[ch].name != NULL
434 && unicode_attributes[ch].category[0] == 'M'
435 && unicode_attributes[ch].category[1] == 'e');
436 }
437
438 static bool
439 is_category_N (unsigned int ch)
440 {
441 return (unicode_attributes[ch].name != NULL
442 && unicode_attributes[ch].category[0] == 'N');
443 }
444
445 static bool
446 is_category_Nd (unsigned int ch)
447 {
448 return (unicode_attributes[ch].name != NULL
449 && unicode_attributes[ch].category[0] == 'N'
450 && unicode_attributes[ch].category[1] == 'd');
451 }
452
453 static bool
454 is_category_Nl (unsigned int ch)
455 {
456 return (unicode_attributes[ch].name != NULL
457 && unicode_attributes[ch].category[0] == 'N'
458 && unicode_attributes[ch].category[1] == 'l');
459 }
460
461 static bool
462 is_category_No (unsigned int ch)
463 {
464 return (unicode_attributes[ch].name != NULL
465 && unicode_attributes[ch].category[0] == 'N'
466 && unicode_attributes[ch].category[1] == 'o');
467 }
468
469 static bool
470 is_category_P (unsigned int ch)
471 {
472 return (unicode_attributes[ch].name != NULL
473 && unicode_attributes[ch].category[0] == 'P');
474 }
475
476 static bool
477 is_category_Pc (unsigned int ch)
478 {
479 return (unicode_attributes[ch].name != NULL
480 && unicode_attributes[ch].category[0] == 'P'
481 && unicode_attributes[ch].category[1] == 'c');
482 }
483
484 static bool
485 is_category_Pd (unsigned int ch)
486 {
487 return (unicode_attributes[ch].name != NULL
488 && unicode_attributes[ch].category[0] == 'P'
489 && unicode_attributes[ch].category[1] == 'd');
490 }
491
492 static bool
493 is_category_Ps (unsigned int ch)
494 {
495 return (unicode_attributes[ch].name != NULL
496 && unicode_attributes[ch].category[0] == 'P'
497 && unicode_attributes[ch].category[1] == 's');
498 }
499
500 static bool
501 is_category_Pe (unsigned int ch)
502 {
503 return (unicode_attributes[ch].name != NULL
504 && unicode_attributes[ch].category[0] == 'P'
505 && unicode_attributes[ch].category[1] == 'e');
506 }
507
508 static bool
509 is_category_Pi (unsigned int ch)
510 {
511 return (unicode_attributes[ch].name != NULL
512 && unicode_attributes[ch].category[0] == 'P'
513 && unicode_attributes[ch].category[1] == 'i');
514 }
515
516 static bool
517 is_category_Pf (unsigned int ch)
518 {
519 return (unicode_attributes[ch].name != NULL
520 && unicode_attributes[ch].category[0] == 'P'
521 && unicode_attributes[ch].category[1] == 'f');
522 }
523
524 static bool
525 is_category_Po (unsigned int ch)
526 {
527 return (unicode_attributes[ch].name != NULL
528 && unicode_attributes[ch].category[0] == 'P'
529 && unicode_attributes[ch].category[1] == 'o');
530 }
531
532 static bool
533 is_category_S (unsigned int ch)
534 {
535 return (unicode_attributes[ch].name != NULL
536 && unicode_attributes[ch].category[0] == 'S');
537 }
538
539 static bool
540 is_category_Sm (unsigned int ch)
541 {
542 return (unicode_attributes[ch].name != NULL
543 && unicode_attributes[ch].category[0] == 'S'
544 && unicode_attributes[ch].category[1] == 'm');
545 }
546
547 static bool
548 is_category_Sc (unsigned int ch)
549 {
550 return (unicode_attributes[ch].name != NULL
551 && unicode_attributes[ch].category[0] == 'S'
552 && unicode_attributes[ch].category[1] == 'c');
553 }
554
555 static bool
556 is_category_Sk (unsigned int ch)
557 {
558 return (unicode_attributes[ch].name != NULL
559 && unicode_attributes[ch].category[0] == 'S'
560 && unicode_attributes[ch].category[1] == 'k');
561 }
562
563 static bool
564 is_category_So (unsigned int ch)
565 {
566 return (unicode_attributes[ch].name != NULL
567 && unicode_attributes[ch].category[0] == 'S'
568 && unicode_attributes[ch].category[1] == 'o');
569 }
570
571 static bool
572 is_category_Z (unsigned int ch)
573 {
574 return (unicode_attributes[ch].name != NULL
575 && unicode_attributes[ch].category[0] == 'Z');
576 }
577
578 static bool
579 is_category_Zs (unsigned int ch)
580 {
581 return (unicode_attributes[ch].name != NULL
582 && unicode_attributes[ch].category[0] == 'Z'
583 && unicode_attributes[ch].category[1] == 's');
584 }
585
586 static bool
587 is_category_Zl (unsigned int ch)
588 {
589 return (unicode_attributes[ch].name != NULL
590 && unicode_attributes[ch].category[0] == 'Z'
591 && unicode_attributes[ch].category[1] == 'l');
592 }
593
594 static bool
595 is_category_Zp (unsigned int ch)
596 {
597 return (unicode_attributes[ch].name != NULL
598 && unicode_attributes[ch].category[0] == 'Z'
599 && unicode_attributes[ch].category[1] == 'p');
600 }
601
602 static bool
603 is_category_C (unsigned int ch)
604 {
605 return (unicode_attributes[ch].name == NULL
606 || unicode_attributes[ch].category[0] == 'C');
607 }
608
609 static bool
610 is_category_Cc (unsigned int ch)
611 {
612 return (unicode_attributes[ch].name != NULL
613 && unicode_attributes[ch].category[0] == 'C'
614 && unicode_attributes[ch].category[1] == 'c');
615 }
616
617 static bool
618 is_category_Cf (unsigned int ch)
619 {
620 return (unicode_attributes[ch].name != NULL
621 && unicode_attributes[ch].category[0] == 'C'
622 && unicode_attributes[ch].category[1] == 'f');
623 }
624
/* Tell whether CH lies in the range 0xD800..0xDFFF.  This is decided from
   the numeric value alone; unicode_attributes is not consulted.  */
static bool
is_category_Cs (unsigned int ch)
{
  if (ch < 0xd800)
    return false;
  return ch <= 0xdfff;
}
630
631 static bool
632 is_category_Co (unsigned int ch)
633 {
634 return (unicode_attributes[ch].name != NULL
635 && unicode_attributes[ch].category[0] == 'C'
636 && unicode_attributes[ch].category[1] == 'o');
637 }
638
639 static bool
640 is_category_Cn (unsigned int ch)
641 {
642 return (unicode_attributes[ch].name == NULL
643 && !(ch >= 0xd800 && ch < 0xe000));
644 }
645
646
/* Write to the file FILENAME the code points below 0x110000 for which
   PREDICATE holds, one maximal run per line: "0xFIRST..0xLAST" for a run of
   several code points, "0xCODE" for an isolated one.  Exits with a message
   when the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  unsigned int ch = 0;
  FILE *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  while (ch < 0x110000)
    {
      unsigned int first, last;

      if (!predicate (ch))
        {
          ch++;
          continue;
        }

      /* Extend the run as far as the predicate keeps holding.  */
      first = last = ch;
      while (last + 1 < 0x110000 && predicate (last + 1))
        last++;

      if (first < last)
        fprintf (stream, "0x%04X..0x%04X\n", first, last);
      else
        fprintf (stream, "0x%04X\n", last);

      ch = last + 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
689
690
/* Write to the file FILENAME a generated C test source.  It consists of a
   header comment with the tests license, an include of
   "test-predicate-part1.h", one "{ first, last }" pair per maximal run of
   code points below 0x110000 for which PREDICATE holds, a definition of the
   macro PREDICATE(c) as EXPRESSION, and an include of
   "test-predicate-part2.h".  Exits with a message when the file cannot be
   opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  unsigned int ranges_written = 0;
  unsigned int ch = 0;
  FILE *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_tests_license (stream);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  while (ch < 0x110000)
    {
      unsigned int first, last;

      if (!predicate (ch))
        {
          ch++;
          continue;
        }

      first = last = ch;
      while (last + 1 < 0x110000 && predicate (last + 1))
        last++;

      /* Separate the entries with commas; none after the last one.  */
      if (ranges_written > 0)
        fprintf (stream, ",\n");
      fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
      ranges_written++;

      ch = last + 1;
    }
  if (ranges_written > 0)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
742
743
744 #define TABLE predicate_table
745 #define xmalloc malloc
746 #define xrealloc realloc
747 #include "3levelbit.h"
748
749
750 static void
751 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
752 {
753 FILE *stream;
754 unsigned int ch, i;
755 struct predicate_table t;
756 unsigned int level1_offset, level2_offset, level3_offset;
757
758 stream = fopen (filename, "w");
759 if (stream == NULL)
760 {
761 fprintf (stderr, "cannot open '%s' for writing\n", filename);
762 exit (1);
763 }
764
765 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
766 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
767 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
768 version);
769 fprintf (stream, "\n");
770
771 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
772 fprintf (stream, "\n");
773 output_library_license (stream, strcmp (filename, "unictype/categ_M.h") == 0);
774 fprintf (stream, "\n");
775
776 t.p = 4;
777 t.q = 7;
778 predicate_table_init (&t);
779
780 for (ch = 0; ch < 0x110000; ch++)
781 if (predicate (ch))
782 predicate_table_add (&t, ch);
783
784 predicate_table_finalize (&t);
785
786
787 level1_offset =
788 5 * sizeof (uint32_t);
789 level2_offset =
790 5 * sizeof (uint32_t)
791 + t.level1_size * sizeof (uint32_t);
792 level3_offset =
793 5 * sizeof (uint32_t)
794 + t.level1_size * sizeof (uint32_t)
795 + (t.level2_size << t.q) * sizeof (uint32_t);
796
797 for (i = 0; i < 5; i++)
798 if (i != 1)
799 fprintf (stream, "#define header_%d %d\n", i,
800 ((uint32_t *) t.result)[i]);
801
802 fprintf (stream, "static const\n");
803 fprintf (stream, "struct\n");
804 fprintf (stream, " {\n");
805 fprintf (stream, " int header[1];\n");
806 fprintf (stream, " int level1[%zu];\n", t.level1_size);
807 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
808 fprintf (stream, " unsigned int level3[%zu << %d];\n", t.level3_size, t.p);
809 fprintf (stream, " }\n");
810 fprintf (stream, "%s =\n", name);
811 fprintf (stream, "{\n");
812 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
813 fprintf (stream, " {");
814 if (t.level1_size > 1)
815 fprintf (stream, "\n ");
816 for (i = 0; i < t.level1_size; i++)
817 {
818 uint32_t offset;
819 if (i > 0 && (i % 1) == 0)
820 fprintf (stream, "\n ");
821 offset = ((uint32_t *) (t.result + level1_offset))[i];
822 if (offset == 0)
823 fprintf (stream, " %5d", -1);
824 else
825 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
826 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
827 if (i+1 < t.level1_size)
828 fprintf (stream, ",");
829 }
830 if (t.level1_size > 1)
831 fprintf (stream, "\n ");
832 fprintf (stream, " },\n");
833 fprintf (stream, " {");
834 if (t.level2_size << t.q > 1)
835 fprintf (stream, "\n ");
836 for (i = 0; i < t.level2_size << t.q; i++)
837 {
838 uint32_t offset;
839 if (i > 0 && (i % 1) == 0)
840 fprintf (stream, "\n ");
841 offset = ((uint32_t *) (t.result + level2_offset))[i];
842 if (offset == 0)
843 fprintf (stream, " %5d", -1);
844 else
845 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
846 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
847 if (i+1 < t.level2_size << t.q)
848 fprintf (stream, ",");
849 }
850 if (t.level2_size << t.q > 1)
851 fprintf (stream, "\n ");
852 fprintf (stream, " },\n");
853 fprintf (stream, " {");
854 if (t.level3_size << t.p > 4)
855 fprintf (stream, "\n ");
856 for (i = 0; i < t.level3_size << t.p; i++)
857 {
858 if (i > 0 && (i % 4) == 0)
859 fprintf (stream, "\n ");
860 fprintf (stream, " 0x%08XU",
861 ((uint32_t *) (t.result + level3_offset))[i]);
862 if (i+1 < t.level3_size << t.p)
863 fprintf (stream, ",");
864 }
865 if (t.level3_size << t.p > 4)
866 fprintf (stream, "\n ");
867 fprintf (stream, " }\n");
868 fprintf (stream, "};\n");
869
870 if (ferror (stream) || fclose (stream))
871 {
872 fprintf (stderr, "error writing to '%s'\n", filename);
873 exit (1);
874 }
875 }
876
877
878 static void
879 output_categories (const char *version)
880 {
881 #define CATEGORY(C) \
882 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
883 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
884 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
885 CATEGORY (L)
886 CATEGORY (LC)
887 CATEGORY (Lu)
888 CATEGORY (Ll)
889 CATEGORY (Lt)
890 CATEGORY (Lm)
891 CATEGORY (Lo)
892 CATEGORY (M)
893 CATEGORY (Mn)
894 CATEGORY (Mc)
895 CATEGORY (Me)
896 CATEGORY (N)
897 CATEGORY (Nd)
898 CATEGORY (Nl)
899 CATEGORY (No)
900 CATEGORY (P)
901 CATEGORY (Pc)
902 CATEGORY (Pd)
903 CATEGORY (Ps)
904 CATEGORY (Pe)
905 CATEGORY (Pi)
906 CATEGORY (Pf)
907 CATEGORY (Po)
908 CATEGORY (S)
909 CATEGORY (Sm)
910 CATEGORY (Sc)
911 CATEGORY (Sk)
912 CATEGORY (So)
913 CATEGORY (Z)
914 CATEGORY (Zs)
915 CATEGORY (Zl)
916 CATEGORY (Zp)
917 CATEGORY (C)
918 CATEGORY (Cc)
919 CATEGORY (Cf)
920 CATEGORY (Cs)
921 CATEGORY (Co)
922 CATEGORY (Cn)
923 #undef CATEGORY
924 }
925
/* Bit masks for the categories.  Every two-letter category owns one bit;
   the mask of a one-letter category, and of LC, is the union of the masks
   of the two-letter categories it comprises.  */
enum
{
  UC_CATEGORY_MASK_Lu = 1 << 0,
  UC_CATEGORY_MASK_Ll = 1 << 1,
  UC_CATEGORY_MASK_Lt = 1 << 2,
  UC_CATEGORY_MASK_Lm = 1 << 3,
  UC_CATEGORY_MASK_Lo = 1 << 4,
  UC_CATEGORY_MASK_LC = (UC_CATEGORY_MASK_Lu | UC_CATEGORY_MASK_Ll
                         | UC_CATEGORY_MASK_Lt),
  UC_CATEGORY_MASK_L = (UC_CATEGORY_MASK_LC | UC_CATEGORY_MASK_Lm
                        | UC_CATEGORY_MASK_Lo),

  UC_CATEGORY_MASK_Mn = 1 << 5,
  UC_CATEGORY_MASK_Mc = 1 << 6,
  UC_CATEGORY_MASK_Me = 1 << 7,
  UC_CATEGORY_MASK_M = (UC_CATEGORY_MASK_Mn | UC_CATEGORY_MASK_Mc
                        | UC_CATEGORY_MASK_Me),

  UC_CATEGORY_MASK_Nd = 1 << 8,
  UC_CATEGORY_MASK_Nl = 1 << 9,
  UC_CATEGORY_MASK_No = 1 << 10,
  UC_CATEGORY_MASK_N = (UC_CATEGORY_MASK_Nd | UC_CATEGORY_MASK_Nl
                        | UC_CATEGORY_MASK_No),

  UC_CATEGORY_MASK_Pc = 1 << 11,
  UC_CATEGORY_MASK_Pd = 1 << 12,
  UC_CATEGORY_MASK_Ps = 1 << 13,
  UC_CATEGORY_MASK_Pe = 1 << 14,
  UC_CATEGORY_MASK_Pi = 1 << 15,
  UC_CATEGORY_MASK_Pf = 1 << 16,
  UC_CATEGORY_MASK_Po = 1 << 17,
  UC_CATEGORY_MASK_P = (UC_CATEGORY_MASK_Pc | UC_CATEGORY_MASK_Pd
                        | UC_CATEGORY_MASK_Ps | UC_CATEGORY_MASK_Pe
                        | UC_CATEGORY_MASK_Pi | UC_CATEGORY_MASK_Pf
                        | UC_CATEGORY_MASK_Po),

  UC_CATEGORY_MASK_Sm = 1 << 18,
  UC_CATEGORY_MASK_Sc = 1 << 19,
  UC_CATEGORY_MASK_Sk = 1 << 20,
  UC_CATEGORY_MASK_So = 1 << 21,
  UC_CATEGORY_MASK_S = (UC_CATEGORY_MASK_Sm | UC_CATEGORY_MASK_Sc
                        | UC_CATEGORY_MASK_Sk | UC_CATEGORY_MASK_So),

  UC_CATEGORY_MASK_Zs = 1 << 22,
  UC_CATEGORY_MASK_Zl = 1 << 23,
  UC_CATEGORY_MASK_Zp = 1 << 24,
  UC_CATEGORY_MASK_Z = (UC_CATEGORY_MASK_Zs | UC_CATEGORY_MASK_Zl
                        | UC_CATEGORY_MASK_Zp),

  UC_CATEGORY_MASK_Cc = 1 << 25,
  UC_CATEGORY_MASK_Cf = 1 << 26,
  UC_CATEGORY_MASK_Cs = 1 << 27,
  UC_CATEGORY_MASK_Co = 1 << 28,
  UC_CATEGORY_MASK_Cn = 1 << 29,
  UC_CATEGORY_MASK_C = (UC_CATEGORY_MASK_Cc | UC_CATEGORY_MASK_Cf
                        | UC_CATEGORY_MASK_Cs | UC_CATEGORY_MASK_Co
                        | UC_CATEGORY_MASK_Cn)
};
967
968 static int
969 general_category_byname (const char *category_name)
970 {
971 if (category_name[0] != '\0'
972 && (category_name[1] == '\0' || category_name[2] == '\0'))
973 switch (category_name[0])
974 {
975 case 'L':
976 switch (category_name[1])
977 {
978 case '\0': return UC_CATEGORY_MASK_L;
979 case 'C': return UC_CATEGORY_MASK_LC;
980 case 'u': return UC_CATEGORY_MASK_Lu;
981 case 'l': return UC_CATEGORY_MASK_Ll;
982 case 't': return UC_CATEGORY_MASK_Lt;
983 case 'm': return UC_CATEGORY_MASK_Lm;
984 case 'o': return UC_CATEGORY_MASK_Lo;
985 }
986 break;
987 case 'M':
988 switch (category_name[1])
989 {
990 case '\0': return UC_CATEGORY_MASK_M;
991 case 'n': return UC_CATEGORY_MASK_Mn;
992 case 'c': return UC_CATEGORY_MASK_Mc;
993 case 'e': return UC_CATEGORY_MASK_Me;
994 }
995 break;
996 case 'N':
997 switch (category_name[1])
998 {
999 case '\0': return UC_CATEGORY_MASK_N;
1000 case 'd': return UC_CATEGORY_MASK_Nd;
1001 case 'l': return UC_CATEGORY_MASK_Nl;
1002 case 'o': return UC_CATEGORY_MASK_No;
1003 }
1004 break;
1005 case 'P':
1006 switch (category_name[1])
1007 {
1008 case '\0': return UC_CATEGORY_MASK_P;
1009 case 'c': return UC_CATEGORY_MASK_Pc;
1010 case 'd': return UC_CATEGORY_MASK_Pd;
1011 case 's': return UC_CATEGORY_MASK_Ps;
1012 case 'e': return UC_CATEGORY_MASK_Pe;
1013 case 'i': return UC_CATEGORY_MASK_Pi;
1014 case 'f': return UC_CATEGORY_MASK_Pf;
1015 case 'o': return UC_CATEGORY_MASK_Po;
1016 }
1017 break;
1018 case 'S':
1019 switch (category_name[1])
1020 {
1021 case '\0': return UC_CATEGORY_MASK_S;
1022 case 'm': return UC_CATEGORY_MASK_Sm;
1023 case 'c': return UC_CATEGORY_MASK_Sc;
1024 case 'k': return UC_CATEGORY_MASK_Sk;
1025 case 'o': return UC_CATEGORY_MASK_So;
1026 }
1027 break;
1028 case 'Z':
1029 switch (category_name[1])
1030 {
1031 case '\0': return UC_CATEGORY_MASK_Z;
1032 case 's': return UC_CATEGORY_MASK_Zs;
1033 case 'l': return UC_CATEGORY_MASK_Zl;
1034 case 'p': return UC_CATEGORY_MASK_Zp;
1035 }
1036 break;
1037 case 'C':
1038 switch (category_name[1])
1039 {
1040 case '\0': return UC_CATEGORY_MASK_C;
1041 case 'c': return UC_CATEGORY_MASK_Cc;
1042 case 'f': return UC_CATEGORY_MASK_Cf;
1043 case 's': return UC_CATEGORY_MASK_Cs;
1044 case 'o': return UC_CATEGORY_MASK_Co;
1045 case 'n': return UC_CATEGORY_MASK_Cn;
1046 }
1047 break;
1048 }
1049
1050 abort ();
1051 }
1052
1053
/* Parameters for "3level.h", included below.  The code that follows uses
   struct category_table and category_table_init/_add/_finalize, so the
   include presumably instantiates them from TABLE -- confirm in 3level.h.
   ELEMENT is the per-character value type.  DEFAULT 29 equals the bit index
   of UC_CATEGORY_MASK_Cn (0x20000000 == 1 << 29); presumably it is the value
   of characters never added to the table.  xmalloc/xrealloc are mapped to
   plain malloc/realloc, i.e. without any out-of-memory wrapper.  */
1054 #define TABLE category_table
1055 #define ELEMENT uint8_t
1056 #define DEFAULT 29
1057 #define xmalloc malloc
1058 #define xrealloc realloc
1059 #include "3level.h"
1060
1061
1062 static void
1063 output_category (const char *filename, const char *version)
1064 {
1065 FILE *stream;
1066 unsigned int ch, i;
1067 struct category_table t;
1068 unsigned int level1_offset, level2_offset, level3_offset;
1069 uint16_t *level3_packed;
1070
1071 stream = fopen (filename, "w");
1072 if (stream == NULL)
1073 {
1074 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1075 exit (1);
1076 }
1077
1078 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1079 fprintf (stream, "/* Categories of Unicode characters. */\n");
1080 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1081 version);
1082 fprintf (stream, "\n");
1083
1084 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
1085 fprintf (stream, "\n");
1086 output_library_license (stream, true);
1087 fprintf (stream, "\n");
1088
1089 t.p = 7;
1090 t.q = 9;
1091 category_table_init (&t);
1092
1093 for (ch = 0; ch < 0x110000; ch++)
1094 {
1095 int value;
1096 unsigned int log2_value;
1097
1098 if (is_category_Cs (ch))
1099 value = UC_CATEGORY_MASK_Cs;
1100 else if (unicode_attributes[ch].name != NULL)
1101 value = general_category_byname (unicode_attributes[ch].category);
1102 else
1103 continue;
1104
1105
1106 assert (value != 0 && (value & (value - 1)) == 0);
1107
1108 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1109
1110 assert (log2_value <= 0x1f);
1111
1112 category_table_add (&t, ch, log2_value);
1113 }
1114
1115 category_table_finalize (&t);
1116
1117
1118 level1_offset =
1119 5 * sizeof (uint32_t);
1120 level2_offset =
1121 5 * sizeof (uint32_t)
1122 + t.level1_size * sizeof (uint32_t);
1123 level3_offset =
1124 5 * sizeof (uint32_t)
1125 + t.level1_size * sizeof (uint32_t)
1126 + (t.level2_size << t.q) * sizeof (uint32_t);
1127
1128 for (i = 0; i < 5; i++)
1129 fprintf (stream, "#define category_header_%d %d\n", i,
1130 ((uint32_t *) t.result)[i]);
1131 fprintf (stream, "static const\n");
1132 fprintf (stream, "struct\n");
1133 fprintf (stream, " {\n");
1134 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1135 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1136 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1137 (1 << t.p) * 5 / 16);
1138 fprintf (stream, " }\n");
1139 fprintf (stream, "u_category =\n");
1140 fprintf (stream, "{\n");
1141 fprintf (stream, " {");
1142 if (t.level1_size > 8)
1143 fprintf (stream, "\n ");
1144 for (i = 0; i < t.level1_size; i++)
1145 {
1146 uint32_t offset;
1147 if (i > 0 && (i % 8) == 0)
1148 fprintf (stream, "\n ");
1149 offset = ((uint32_t *) (t.result + level1_offset))[i];
1150 if (offset == 0)
1151 fprintf (stream, " %5d", -1);
1152 else
1153 fprintf (stream, " %5zu",
1154 (offset - level2_offset) / sizeof (uint32_t));
1155 if (i+1 < t.level1_size)
1156 fprintf (stream, ",");
1157 }
1158 if (t.level1_size > 8)
1159 fprintf (stream, "\n ");
1160 fprintf (stream, " },\n");
1161 fprintf (stream, " {");
1162 if (t.level2_size << t.q > 8)
1163 fprintf (stream, "\n ");
1164 for (i = 0; i < t.level2_size << t.q; i++)
1165 {
1166 uint32_t offset;
1167 if (i > 0 && (i % 8) == 0)
1168 fprintf (stream, "\n ");
1169 offset = ((uint32_t *) (t.result + level2_offset))[i];
1170 if (offset == 0)
1171 fprintf (stream, " %5d", -1);
1172 else
1173 fprintf (stream, " %5zu",
1174 (offset - level3_offset) / sizeof (uint8_t));
1175 if (i+1 < t.level2_size << t.q)
1176 fprintf (stream, ",");
1177 }
1178 if (t.level2_size << t.q > 8)
1179 fprintf (stream, "\n ");
1180 fprintf (stream, " },\n");
1181
1182
1183 level3_packed =
1184 (uint16_t *)
1185 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1186 for (i = 0; i < t.level3_size << t.p; i++)
1187 {
1188 unsigned int j = (i * 5) / 16;
1189 unsigned int k = (i * 5) % 16;
1190 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1191 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1192 level3_packed[j] = value & 0xffff;
1193 level3_packed[j+1] = value >> 16;
1194 }
1195 fprintf (stream, " {");
1196 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1197 fprintf (stream, "\n ");
1198 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1199 {
1200 if (i > 0 && (i % 8) == 0)
1201 fprintf (stream, "\n ");
1202 fprintf (stream, " 0x%04x", level3_packed[i]);
1203 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1204 fprintf (stream, ",");
1205 }
1206 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1207 fprintf (stream, "\n ");
1208 fprintf (stream, " }\n");
1209 free (level3_packed);
1210 fprintf (stream, "};\n");
1211
1212 if (ferror (stream) || fclose (stream))
1213 {
1214 fprintf (stderr, "error writing to '%s'\n", filename);
1215 exit (1);
1216 }
1217 }
1218
1219
1220
1221
1222
1223
1224
1225
/* Parameters for "3level.h", included below.  output_combclass uses
   struct combclass_table and combclass_table_init/_add/_finalize, so the
   include presumably instantiates them from TABLE -- confirm in 3level.h.
   ELEMENT is the per-character value type; DEFAULT 0 is presumably the
   value of characters never added.  xmalloc/xrealloc are mapped to plain
   malloc/realloc.  */
1226 #define TABLE combclass_table
1227 #define ELEMENT uint8_t
1228 #define DEFAULT 0
1229 #define xmalloc malloc
1230 #define xrealloc realloc
1231 #include "3level.h"
1232
1233
1234 static void
1235 output_combclass (const char *filename, const char *version)
1236 {
1237 FILE *stream;
1238 unsigned int ch, i;
1239 struct combclass_table t;
1240 unsigned int level1_offset, level2_offset, level3_offset;
1241
1242 stream = fopen (filename, "w");
1243 if (stream == NULL)
1244 {
1245 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1246 exit (1);
1247 }
1248
1249 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1250 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1251 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1252 version);
1253 fprintf (stream, "\n");
1254
1255 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
1256 fprintf (stream, "\n");
1257 output_library_license (stream, true);
1258 fprintf (stream, "\n");
1259
1260 t.p = 7;
1261 t.q = 9;
1262 combclass_table_init (&t);
1263
1264 for (ch = 0; ch < 0x110000; ch++)
1265 if (unicode_attributes[ch].name != NULL)
1266 {
1267 int value = atoi (unicode_attributes[ch].combining);
1268 assert (value >= 0 && value <= 255);
1269 combclass_table_add (&t, ch, value);
1270 }
1271
1272 combclass_table_finalize (&t);
1273
1274
1275 level1_offset =
1276 5 * sizeof (uint32_t);
1277 level2_offset =
1278 5 * sizeof (uint32_t)
1279 + t.level1_size * sizeof (uint32_t);
1280 level3_offset =
1281 5 * sizeof (uint32_t)
1282 + t.level1_size * sizeof (uint32_t)
1283 + (t.level2_size << t.q) * sizeof (uint32_t);
1284
1285 for (i = 0; i < 5; i++)
1286 fprintf (stream, "#define combclass_header_%d %d\n", i,
1287 ((uint32_t *) t.result)[i]);
1288 fprintf (stream, "static const\n");
1289 fprintf (stream, "struct\n");
1290 fprintf (stream, " {\n");
1291 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1292 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1293 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1294 fprintf (stream, " }\n");
1295 fprintf (stream, "u_combclass =\n");
1296 fprintf (stream, "{\n");
1297 fprintf (stream, " {");
1298 if (t.level1_size > 8)
1299 fprintf (stream, "\n ");
1300 for (i = 0; i < t.level1_size; i++)
1301 {
1302 uint32_t offset;
1303 if (i > 0 && (i % 8) == 0)
1304 fprintf (stream, "\n ");
1305 offset = ((uint32_t *) (t.result + level1_offset))[i];
1306 if (offset == 0)
1307 fprintf (stream, " %5d", -1);
1308 else
1309 fprintf (stream, " %5zu",
1310 (offset - level2_offset) / sizeof (uint32_t));
1311 if (i+1 < t.level1_size)
1312 fprintf (stream, ",");
1313 }
1314 if (t.level1_size > 8)
1315 fprintf (stream, "\n ");
1316 fprintf (stream, " },\n");
1317 fprintf (stream, " {");
1318 if (t.level2_size << t.q > 8)
1319 fprintf (stream, "\n ");
1320 for (i = 0; i < t.level2_size << t.q; i++)
1321 {
1322 uint32_t offset;
1323 if (i > 0 && (i % 8) == 0)
1324 fprintf (stream, "\n ");
1325 offset = ((uint32_t *) (t.result + level2_offset))[i];
1326 if (offset == 0)
1327 fprintf (stream, " %5d", -1);
1328 else
1329 fprintf (stream, " %5zu",
1330 (offset - level3_offset) / sizeof (uint8_t));
1331 if (i+1 < t.level2_size << t.q)
1332 fprintf (stream, ",");
1333 }
1334 if (t.level2_size << t.q > 8)
1335 fprintf (stream, "\n ");
1336 fprintf (stream, " },\n");
1337 fprintf (stream, " {");
1338 if (t.level3_size << t.p > 8)
1339 fprintf (stream, "\n ");
1340 for (i = 0; i < t.level3_size << t.p; i++)
1341 {
1342 if (i > 0 && (i % 8) == 0)
1343 fprintf (stream, "\n ");
1344 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1345 if (i+1 < t.level3_size << t.p)
1346 fprintf (stream, ",");
1347 }
1348 if (t.level3_size << t.p > 8)
1349 fprintf (stream, "\n ");
1350 fprintf (stream, " }\n");
1351 fprintf (stream, "};\n");
1352
1353 if (ferror (stream) || fclose (stream))
1354 {
1355 fprintf (stderr, "error writing to '%s'\n", filename);
1356 exit (1);
1357 }
1358 }
1359
1360
1361
1362
1363
1364
1365
/* Bidi categories, numbered consecutively from 0.  output_bidi_category
   stores these values in 5 bits, so they must stay <= 0x1f.  */
enum
{
  UC_BIDI_L   = 0,
  UC_BIDI_LRE = 1,
  UC_BIDI_LRO = 2,
  UC_BIDI_R   = 3,
  UC_BIDI_AL  = 4,
  UC_BIDI_RLE = 5,
  UC_BIDI_RLO = 6,
  UC_BIDI_PDF = 7,
  UC_BIDI_EN  = 8,
  UC_BIDI_ES  = 9,
  UC_BIDI_ET  = 10,
  UC_BIDI_AN  = 11,
  UC_BIDI_CS  = 12,
  UC_BIDI_NSM = 13,
  UC_BIDI_BN  = 14,
  UC_BIDI_B   = 15,
  UC_BIDI_S   = 16,
  UC_BIDI_WS  = 17,
  UC_BIDI_ON  = 18,
  UC_BIDI_LRI = 19,
  UC_BIDI_RLI = 20,
  UC_BIDI_FSI = 21,
  UC_BIDI_PDI = 22
};
1392
1393 static int
1394 bidi_category_byname (const char *category_name)
1395 {
1396 switch (category_name[0])
1397 {
1398 case 'A':
1399 switch (category_name[1])
1400 {
1401 case 'L':
1402 if (category_name[2] == '\0')
1403 return UC_BIDI_AL;
1404 break;
1405 case 'N':
1406 if (category_name[2] == '\0')
1407 return UC_BIDI_AN;
1408 break;
1409 }
1410 break;
1411 case 'B':
1412 switch (category_name[1])
1413 {
1414 case '\0':
1415 return UC_BIDI_B;
1416 case 'N':
1417 if (category_name[2] == '\0')
1418 return UC_BIDI_BN;
1419 break;
1420 }
1421 break;
1422 case 'C':
1423 switch (category_name[1])
1424 {
1425 case 'S':
1426 if (category_name[2] == '\0')
1427 return UC_BIDI_CS;
1428 break;
1429 }
1430 break;
1431 case 'E':
1432 switch (category_name[1])
1433 {
1434 case 'N':
1435 if (category_name[2] == '\0')
1436 return UC_BIDI_EN;
1437 break;
1438 case 'S':
1439 if (category_name[2] == '\0')
1440 return UC_BIDI_ES;
1441 break;
1442 case 'T':
1443 if (category_name[2] == '\0')
1444 return UC_BIDI_ET;
1445 break;
1446 }
1447 break;
1448 case 'F':
1449 switch (category_name[1])
1450 {
1451 case 'S':
1452 switch (category_name[2])
1453 {
1454 case 'I':
1455 if (category_name[3] == '\0')
1456 return UC_BIDI_FSI;
1457 break;
1458 }
1459 }
1460 break;
1461 case 'L':
1462 switch (category_name[1])
1463 {
1464 case '\0':
1465 return UC_BIDI_L;
1466 case 'R':
1467 switch (category_name[2])
1468 {
1469 case 'E':
1470 if (category_name[3] == '\0')
1471 return UC_BIDI_LRE;
1472 break;
1473 case 'O':
1474 if (category_name[3] == '\0')
1475 return UC_BIDI_LRO;
1476 break;
1477 case 'I':
1478 if (category_name[3] == '\0')
1479 return UC_BIDI_LRI;
1480 break;
1481 }
1482 break;
1483 }
1484 break;
1485 case 'N':
1486 switch (category_name[1])
1487 {
1488 case 'S':
1489 switch (category_name[2])
1490 {
1491 case 'M':
1492 if (category_name[3] == '\0')
1493 return UC_BIDI_NSM;
1494 break;
1495 }
1496 break;
1497 }
1498 break;
1499 case 'O':
1500 switch (category_name[1])
1501 {
1502 case 'N':
1503 if (category_name[2] == '\0')
1504 return UC_BIDI_ON;
1505 break;
1506 }
1507 break;
1508 case 'P':
1509 switch (category_name[1])
1510 {
1511 case 'D':
1512 switch (category_name[2])
1513 {
1514 case 'F':
1515 if (category_name[3] == '\0')
1516 return UC_BIDI_PDF;
1517 break;
1518 case 'I':
1519 if (category_name[3] == '\0')
1520 return UC_BIDI_PDI;
1521 break;
1522 }
1523 break;
1524 }
1525 break;
1526 case 'R':
1527 switch (category_name[1])
1528 {
1529 case '\0':
1530 return UC_BIDI_R;
1531 case 'L':
1532 switch (category_name[2])
1533 {
1534 case 'E':
1535 if (category_name[3] == '\0')
1536 return UC_BIDI_RLE;
1537 break;
1538 case 'O':
1539 if (category_name[3] == '\0')
1540 return UC_BIDI_RLO;
1541 break;
1542 case 'I':
1543 if (category_name[3] == '\0')
1544 return UC_BIDI_RLI;
1545 break;
1546 }
1547 break;
1548 }
1549 break;
1550 case 'S':
1551 if (category_name[1] == '\0')
1552 return UC_BIDI_S;
1553 break;
1554 case 'W':
1555 switch (category_name[1])
1556 {
1557 case 'S':
1558 if (category_name[2] == '\0')
1559 return UC_BIDI_WS;
1560 break;
1561 }
1562 break;
1563 }
1564
1565 abort ();
1566 }
1567
1568 static int
1569 get_bidi_category (unsigned int ch)
1570 {
1571 if (unicode_attributes[ch].name != NULL)
1572 return bidi_category_byname (unicode_attributes[ch].bidi);
1573 else
1574 {
1575
1576
1577 if ((ch >= 0x0590 && ch <= 0x05FF)
1578 || (ch >= 0x07FB && ch <= 0x08FF)
1579 || (ch >= 0xFB37 && ch <= 0xFB45)
1580 || (ch >= 0x10800 && ch <= 0x10FFF))
1581 return UC_BIDI_R;
1582 else if ((ch >= 0x0600 && ch <= 0x07BF)
1583 || (ch >= 0x2064 && ch <= 0x2069)
1584 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1585 || (ch >= 0xFDFE && ch <= 0xFEFE))
1586 return UC_BIDI_AL;
1587 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1588 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1589 || (ch & 0xFFFF) == 0xFFFE
1590 || (ch & 0xFFFF) == 0xFFFF
1591 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1592 return UC_BIDI_BN;
1593 else
1594 return UC_BIDI_L;
1595 }
1596 }
1597
1598
/* Parameters for "3level.h", included below.  output_bidi_category uses
   struct bidi_category_table and bidi_category_table_init/_add/_finalize,
   so the include presumably instantiates them from TABLE -- confirm in
   3level.h.  ELEMENT is the per-character value type; DEFAULT is
   UC_BIDI_L.  xmalloc/xrealloc are mapped to plain malloc/realloc.  */
1599 #define TABLE bidi_category_table
1600 #define ELEMENT uint8_t
1601 #define DEFAULT UC_BIDI_L
1602 #define xmalloc malloc
1603 #define xrealloc realloc
1604 #include "3level.h"
1605
1606
1607 static void
1608 output_bidi_category (const char *filename, const char *version)
1609 {
1610 FILE *stream;
1611 unsigned int ch, i;
1612 struct bidi_category_table t;
1613 unsigned int level1_offset, level2_offset, level3_offset;
1614 uint16_t *level3_packed;
1615
1616 stream = fopen (filename, "w");
1617 if (stream == NULL)
1618 {
1619 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1620 exit (1);
1621 }
1622
1623 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1624 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1625 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1626 version);
1627 fprintf (stream, "\n");
1628
1629 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
1630 fprintf (stream, "\n");
1631 output_library_license (stream, true);
1632 fprintf (stream, "\n");
1633
1634 t.p = 7;
1635 t.q = 9;
1636 bidi_category_table_init (&t);
1637
1638 for (ch = 0; ch < 0x110000; ch++)
1639 {
1640 int value = get_bidi_category (ch);
1641
1642 assert (value <= 0x1f);
1643
1644 bidi_category_table_add (&t, ch, value);
1645 }
1646
1647 bidi_category_table_finalize (&t);
1648
1649
1650 level1_offset =
1651 5 * sizeof (uint32_t);
1652 level2_offset =
1653 5 * sizeof (uint32_t)
1654 + t.level1_size * sizeof (uint32_t);
1655 level3_offset =
1656 5 * sizeof (uint32_t)
1657 + t.level1_size * sizeof (uint32_t)
1658 + (t.level2_size << t.q) * sizeof (uint32_t);
1659
1660 for (i = 0; i < 5; i++)
1661 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1662 ((uint32_t *) t.result)[i]);
1663 fprintf (stream, "static const\n");
1664 fprintf (stream, "struct\n");
1665 fprintf (stream, " {\n");
1666 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1667 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1668 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1669 (1 << t.p) * 5 / 16);
1670 fprintf (stream, " }\n");
1671 fprintf (stream, "u_bidi_category =\n");
1672 fprintf (stream, "{\n");
1673 fprintf (stream, " {");
1674 if (t.level1_size > 8)
1675 fprintf (stream, "\n ");
1676 for (i = 0; i < t.level1_size; i++)
1677 {
1678 uint32_t offset;
1679 if (i > 0 && (i % 8) == 0)
1680 fprintf (stream, "\n ");
1681 offset = ((uint32_t *) (t.result + level1_offset))[i];
1682 if (offset == 0)
1683 fprintf (stream, " %5d", -1);
1684 else
1685 fprintf (stream, " %5zu",
1686 (offset - level2_offset) / sizeof (uint32_t));
1687 if (i+1 < t.level1_size)
1688 fprintf (stream, ",");
1689 }
1690 if (t.level1_size > 8)
1691 fprintf (stream, "\n ");
1692 fprintf (stream, " },\n");
1693 fprintf (stream, " {");
1694 if (t.level2_size << t.q > 8)
1695 fprintf (stream, "\n ");
1696 for (i = 0; i < t.level2_size << t.q; i++)
1697 {
1698 uint32_t offset;
1699 if (i > 0 && (i % 8) == 0)
1700 fprintf (stream, "\n ");
1701 offset = ((uint32_t *) (t.result + level2_offset))[i];
1702 if (offset == 0)
1703 fprintf (stream, " %5d", -1);
1704 else
1705 fprintf (stream, " %5zu",
1706 (offset - level3_offset) / sizeof (uint8_t));
1707 if (i+1 < t.level2_size << t.q)
1708 fprintf (stream, ",");
1709 }
1710 if (t.level2_size << t.q > 8)
1711 fprintf (stream, "\n ");
1712 fprintf (stream, " },\n");
1713
1714
1715 level3_packed =
1716 (uint16_t *)
1717 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1718 for (i = 0; i < t.level3_size << t.p; i++)
1719 {
1720 unsigned int j = (i * 5) / 16;
1721 unsigned int k = (i * 5) % 16;
1722 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1723 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1724 level3_packed[j] = value & 0xffff;
1725 level3_packed[j+1] = value >> 16;
1726 }
1727 fprintf (stream, " {");
1728 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1729 fprintf (stream, "\n ");
1730 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1731 {
1732 if (i > 0 && (i % 8) == 0)
1733 fprintf (stream, "\n ");
1734 fprintf (stream, " 0x%04x", level3_packed[i]);
1735 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1736 fprintf (stream, ",");
1737 }
1738 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1739 fprintf (stream, "\n ");
1740 fprintf (stream, " }\n");
1741 free (level3_packed);
1742 fprintf (stream, "};\n");
1743
1744 if (ferror (stream) || fclose (stream))
1745 {
1746 fprintf (stderr, "error writing to '%s'\n", filename);
1747 exit (1);
1748 }
1749 }
1750
1751
1752
1753
1754
1755
1756 static int
1757 get_decdigit_value (unsigned int ch)
1758 {
1759 if (unicode_attributes[ch].name != NULL
1760 && unicode_attributes[ch].decdigit[0] != '\0')
1761 return atoi (unicode_attributes[ch].decdigit);
1762 return -1;
1763 }
1764
1765
/* Parameters for "3level.h", included below.  output_decimal_digit and
   output_digit both use struct decdigit_table and
   decdigit_table_init/_add/_finalize, so the include presumably
   instantiates them from TABLE -- confirm in 3level.h.  ELEMENT is the
   per-character value type; DEFAULT 0 matches the "no digit value"
   encoding (1 + -1) used by those functions.  xmalloc/xrealloc are mapped
   to plain malloc/realloc.  */
1766 #define TABLE decdigit_table
1767 #define ELEMENT uint8_t
1768 #define DEFAULT 0
1769 #define xmalloc malloc
1770 #define xrealloc realloc
1771 #include "3level.h"
1772
1773
/* Generate FILENAME: a comma-separated list of "{ code point, value }"
   initializers, one per line, for every character that has a decimal
   digit value.  VERSION only appears in the leading comment.
   Exits with status 1 when the file cannot be opened or written.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  unsigned int ch;
  unsigned int emitted = 0;
  FILE *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", stream);
  fputs ("/* Decimal digit values of Unicode characters. */\n", stream);
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", stream);

  fputs ("/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n", stream);
  fputs ("\n", stream);
  output_tests_license (stream);
  fputs ("\n", stream);

  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_decdigit_value (ch);

      assert (value >= -1 && value < 10);

      if (value < 0)
        continue;

      /* The separator goes before every entry except the first.  */
      if (emitted > 0)
        fputs (",\n", stream);
      fprintf (stream, " { 0x%04X, %d }", ch, value);
      emitted++;
    }

  /* Terminate the last entry's line, if there was any entry.  */
  if (emitted > 0)
    fputs ("\n", stream);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1823
1824
1825 static void
1826 output_decimal_digit (const char *filename, const char *version)
1827 {
1828 FILE *stream;
1829 unsigned int ch, i;
1830 struct decdigit_table t;
1831 unsigned int level1_offset, level2_offset, level3_offset;
1832
1833 stream = fopen (filename, "w");
1834 if (stream == NULL)
1835 {
1836 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1837 exit (1);
1838 }
1839
1840 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1841 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1842 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1843 version);
1844 fprintf (stream, "\n");
1845
1846 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
1847 fprintf (stream, "\n");
1848 output_library_license (stream, false);
1849 fprintf (stream, "\n");
1850
1851 t.p = 7;
1852 t.q = 9;
1853 decdigit_table_init (&t);
1854
1855 for (ch = 0; ch < 0x110000; ch++)
1856 {
1857 int value = 1 + get_decdigit_value (ch);
1858
1859 assert (value >= 0 && value <= 10);
1860
1861 decdigit_table_add (&t, ch, value);
1862 }
1863
1864 decdigit_table_finalize (&t);
1865
1866
1867 level1_offset =
1868 5 * sizeof (uint32_t);
1869 level2_offset =
1870 5 * sizeof (uint32_t)
1871 + t.level1_size * sizeof (uint32_t);
1872 level3_offset =
1873 5 * sizeof (uint32_t)
1874 + t.level1_size * sizeof (uint32_t)
1875 + (t.level2_size << t.q) * sizeof (uint32_t);
1876
1877 for (i = 0; i < 5; i++)
1878 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1879 ((uint32_t *) t.result)[i]);
1880 fprintf (stream, "static const\n");
1881 fprintf (stream, "struct\n");
1882 fprintf (stream, " {\n");
1883 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1884 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1885 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1886 t.p - 1);
1887 fprintf (stream, " }\n");
1888 fprintf (stream, "u_decdigit =\n");
1889 fprintf (stream, "{\n");
1890 fprintf (stream, " {");
1891 if (t.level1_size > 8)
1892 fprintf (stream, "\n ");
1893 for (i = 0; i < t.level1_size; i++)
1894 {
1895 uint32_t offset;
1896 if (i > 0 && (i % 8) == 0)
1897 fprintf (stream, "\n ");
1898 offset = ((uint32_t *) (t.result + level1_offset))[i];
1899 if (offset == 0)
1900 fprintf (stream, " %5d", -1);
1901 else
1902 fprintf (stream, " %5zu",
1903 (offset - level2_offset) / sizeof (uint32_t));
1904 if (i+1 < t.level1_size)
1905 fprintf (stream, ",");
1906 }
1907 if (t.level1_size > 8)
1908 fprintf (stream, "\n ");
1909 fprintf (stream, " },\n");
1910 fprintf (stream, " {");
1911 if (t.level2_size << t.q > 8)
1912 fprintf (stream, "\n ");
1913 for (i = 0; i < t.level2_size << t.q; i++)
1914 {
1915 uint32_t offset;
1916 if (i > 0 && (i % 8) == 0)
1917 fprintf (stream, "\n ");
1918 offset = ((uint32_t *) (t.result + level2_offset))[i];
1919 if (offset == 0)
1920 fprintf (stream, " %5d", -1);
1921 else
1922 fprintf (stream, " %5zu",
1923 (offset - level3_offset) / sizeof (uint8_t));
1924 if (i+1 < t.level2_size << t.q)
1925 fprintf (stream, ",");
1926 }
1927 if (t.level2_size << t.q > 8)
1928 fprintf (stream, "\n ");
1929 fprintf (stream, " },\n");
1930
1931 fprintf (stream, " {");
1932 if (t.level3_size << (t.p - 1) > 8)
1933 fprintf (stream, "\n ");
1934 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1935 {
1936 if (i > 0 && (i % 8) == 0)
1937 fprintf (stream, "\n ");
1938 fprintf (stream, " 0x%02x",
1939 ((uint8_t *) (t.result + level3_offset))[2*i]
1940 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1941 if (i+1 < t.level3_size << (t.p - 1))
1942 fprintf (stream, ",");
1943 }
1944 if (t.level3_size << (t.p - 1) > 8)
1945 fprintf (stream, "\n ");
1946 fprintf (stream, " }\n");
1947 fprintf (stream, "};\n");
1948
1949 if (ferror (stream) || fclose (stream))
1950 {
1951 fprintf (stderr, "error writing to '%s'\n", filename);
1952 exit (1);
1953 }
1954 }
1955
1956
1957
1958
1959
1960
1961 static int
1962 get_digit_value (unsigned int ch)
1963 {
1964 if (unicode_attributes[ch].name != NULL
1965 && unicode_attributes[ch].digit[0] != '\0')
1966 return atoi (unicode_attributes[ch].digit);
1967 return -1;
1968 }
1969
1970
/* Generate FILENAME: a comma-separated list of "{ code point, value }"
   initializers, one per line, for every character that has a digit value.
   VERSION only appears in the leading comment.
   Exits with status 1 when the file cannot be opened or written.  */
static void
output_digit_test (const char *filename, const char *version)
{
  unsigned int ch;
  /* Text written in front of the next entry: empty before the first
     entry, ",\n" afterwards.  */
  const char *separator = "";
  FILE *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", stream);
  fputs ("/* Digit values of Unicode characters. */\n", stream);
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", stream);

  fputs ("/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n", stream);
  fputs ("\n", stream);
  output_tests_license (stream);
  fputs ("\n", stream);

  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_digit_value (ch);

      assert (value >= -1 && value < 10);

      if (value < 0)
        continue;

      fputs (separator, stream);
      fprintf (stream, " { 0x%04X, %d }", ch, value);
      separator = ",\n";
    }

  /* Terminate the last entry's line, if there was any entry.  */
  if (separator[0] != '\0')
    fputs ("\n", stream);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
2020
2021
2022 static void
2023 output_digit (const char *filename, const char *version)
2024 {
2025 FILE *stream;
2026 unsigned int ch, i;
2027 struct decdigit_table t;
2028 unsigned int level1_offset, level2_offset, level3_offset;
2029
2030 stream = fopen (filename, "w");
2031 if (stream == NULL)
2032 {
2033 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2034 exit (1);
2035 }
2036
2037 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2038 fprintf (stream, "/* Digit values of Unicode characters. */\n");
2039 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2040 version);
2041 fprintf (stream, "\n");
2042
2043 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
2044 fprintf (stream, "\n");
2045 output_library_license (stream, false);
2046 fprintf (stream, "\n");
2047
2048 t.p = 7;
2049 t.q = 9;
2050 decdigit_table_init (&t);
2051
2052 for (ch = 0; ch < 0x110000; ch++)
2053 {
2054 int value = 1 + get_digit_value (ch);
2055
2056 assert (value >= 0 && value <= 10);
2057
2058 decdigit_table_add (&t, ch, value);
2059 }
2060
2061 decdigit_table_finalize (&t);
2062
2063
2064 level1_offset =
2065 5 * sizeof (uint32_t);
2066 level2_offset =
2067 5 * sizeof (uint32_t)
2068 + t.level1_size * sizeof (uint32_t);
2069 level3_offset =
2070 5 * sizeof (uint32_t)
2071 + t.level1_size * sizeof (uint32_t)
2072 + (t.level2_size << t.q) * sizeof (uint32_t);
2073
2074 for (i = 0; i < 5; i++)
2075 fprintf (stream, "#define digit_header_%d %d\n", i,
2076 ((uint32_t *) t.result)[i]);
2077 fprintf (stream, "static const\n");
2078 fprintf (stream, "struct\n");
2079 fprintf (stream, " {\n");
2080 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2081 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2082 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
2083 t.p - 1);
2084 fprintf (stream, " }\n");
2085 fprintf (stream, "u_digit =\n");
2086 fprintf (stream, "{\n");
2087 fprintf (stream, " {");
2088 if (t.level1_size > 8)
2089 fprintf (stream, "\n ");
2090 for (i = 0; i < t.level1_size; i++)
2091 {
2092 uint32_t offset;
2093 if (i > 0 && (i % 8) == 0)
2094 fprintf (stream, "\n ");
2095 offset = ((uint32_t *) (t.result + level1_offset))[i];
2096 if (offset == 0)
2097 fprintf (stream, " %5d", -1);
2098 else
2099 fprintf (stream, " %5zu",
2100 (offset - level2_offset) / sizeof (uint32_t));
2101 if (i+1 < t.level1_size)
2102 fprintf (stream, ",");
2103 }
2104 if (t.level1_size > 8)
2105 fprintf (stream, "\n ");
2106 fprintf (stream, " },\n");
2107 fprintf (stream, " {");
2108 if (t.level2_size << t.q > 8)
2109 fprintf (stream, "\n ");
2110 for (i = 0; i < t.level2_size << t.q; i++)
2111 {
2112 uint32_t offset;
2113 if (i > 0 && (i % 8) == 0)
2114 fprintf (stream, "\n ");
2115 offset = ((uint32_t *) (t.result + level2_offset))[i];
2116 if (offset == 0)
2117 fprintf (stream, " %5d", -1);
2118 else
2119 fprintf (stream, " %5zu",
2120 (offset - level3_offset) / sizeof (uint8_t));
2121 if (i+1 < t.level2_size << t.q)
2122 fprintf (stream, ",");
2123 }
2124 if (t.level2_size << t.q > 8)
2125 fprintf (stream, "\n ");
2126 fprintf (stream, " },\n");
2127
2128 fprintf (stream, " {");
2129 if (t.level3_size << (t.p - 1) > 8)
2130 fprintf (stream, "\n ");
2131 for (i = 0; i < t.level3_size << (t.p - 1); i++)
2132 {
2133 if (i > 0 && (i % 8) == 0)
2134 fprintf (stream, "\n ");
2135 fprintf (stream, " 0x%02x",
2136 ((uint8_t *) (t.result + level3_offset))[2*i]
2137 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
2138 if (i+1 < t.level3_size << (t.p - 1))
2139 fprintf (stream, ",");
2140 }
2141 if (t.level3_size << (t.p - 1) > 8)
2142 fprintf (stream, "\n ");
2143 fprintf (stream, " }\n");
2144 fprintf (stream, "};\n");
2145
2146 if (ferror (stream) || fclose (stream))
2147 {
2148 fprintf (stderr, "error writing to '%s'\n", filename);
2149 exit (1);
2150 }
2151 }
2152
2153
2154
2155
2156
2157
/* A numeric value expressed as the fraction NUMERATOR / DENOMINATOR.
   get_numeric_value returns { 0, 0 } for characters without a numeric
   value.  */
typedef struct
{
  int numerator;
  int denominator;
}
uc_fraction_t;
2159
2160 static uc_fraction_t
2161 get_numeric_value (unsigned int ch)
2162 {
2163 uc_fraction_t value;
2164
2165 if (unicode_attributes[ch].name != NULL
2166 && unicode_attributes[ch].numeric[0] != '\0')
2167 {
2168 const char *str = unicode_attributes[ch].numeric;
2169
2170 value.numerator = atoi (str);
2171 if (strchr (str, '/') != NULL)
2172 value.denominator = atoi (strchr (str, '/') + 1);
2173 else
2174 value.denominator = 1;
2175 }
2176 else
2177 {
2178 value.numerator = 0;
2179 value.denominator = 0;
2180 }
2181 return value;
2182 }
2183
2184
2185 static void
2186 output_numeric_test (const char *filename, const char *version)
2187 {
2188 FILE *stream;
2189 bool need_comma;
2190 unsigned int ch;
2191
2192 stream = fopen (filename, "w");
2193 if (stream == NULL)
2194 {
2195 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2196 exit (1);
2197 }
2198
2199 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2200 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2201 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2202 version);
2203 fprintf (stream, "\n");
2204
2205 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
2206 fprintf (stream, "\n");
2207 output_tests_license (stream);
2208 fprintf (stream, "\n");
2209
2210 need_comma = false;
2211 for (ch = 0; ch < 0x110000; ch++)
2212 {
2213 uc_fraction_t value = get_numeric_value (ch);
2214
2215 if (value.numerator != 0 || value.denominator != 0)
2216 {
2217 if (need_comma)
2218 fprintf (stream, ",\n");
2219 fprintf (stream, " { 0x%04X, %d, %d }",
2220 ch, value.numerator, value.denominator);
2221 need_comma = true;
2222 }
2223 }
2224 if (need_comma)
2225 fprintf (stream, "\n");
2226
2227 if (ferror (stream) || fclose (stream))
2228 {
2229 fprintf (stderr, "error writing to '%s'\n", filename);
2230 exit (1);
2231 }
2232 }
2233
2234
2235 #define TABLE numeric_table
2236 #define ELEMENT uint8_t
2237 #define DEFAULT 0
2238 #define xmalloc malloc
2239 #define xrealloc realloc
2240 #include "3level.h"
2241
2242
2243 static void
2244 output_numeric (const char *filename, const char *version)
2245 {
2246 FILE *stream;
2247 uc_fraction_t fractions[160];
2248 unsigned int nfractions;
2249 unsigned int ch, i, j;
2250 struct numeric_table t;
2251 unsigned int level1_offset, level2_offset, level3_offset;
2252 uint16_t *level3_packed;
2253
2254 stream = fopen (filename, "w");
2255 if (stream == NULL)
2256 {
2257 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2258 exit (1);
2259 }
2260
2261 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2262 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2263 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2264 version);
2265 fprintf (stream, "\n");
2266
2267 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
2268 fprintf (stream, "\n");
2269 output_library_license (stream, false);
2270 fprintf (stream, "\n");
2271
2272
2273 nfractions = 0;
2274 for (ch = 0; ch < 0x110000; ch++)
2275 {
2276 uc_fraction_t value = get_numeric_value (ch);
2277
2278 for (i = 0; i < nfractions; i++)
2279 if (value.numerator == fractions[i].numerator
2280 && value.denominator == fractions[i].denominator)
2281 break;
2282 if (i == nfractions)
2283 {
2284 assert (nfractions != SIZEOF (fractions));
2285 for (i = 0; i < nfractions; i++)
2286 if (value.denominator < fractions[i].denominator
2287 || (value.denominator == fractions[i].denominator
2288 && value.numerator < fractions[i].numerator))
2289 break;
2290 for (j = nfractions; j > i; j--)
2291 fractions[j] = fractions[j - 1];
2292 fractions[i] = value;
2293 nfractions++;
2294 }
2295 }
2296
2297 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2298 nfractions);
2299 fprintf (stream, "{\n");
2300 for (i = 0; i < nfractions; i++)
2301 {
2302 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2303 fractions[i].denominator);
2304 if (i+1 < nfractions)
2305 fprintf (stream, ",");
2306 fprintf (stream, "\n");
2307 }
2308 fprintf (stream, "};\n");
2309
2310 t.p = 7;
2311 t.q = 9;
2312 numeric_table_init (&t);
2313
2314 for (ch = 0; ch < 0x110000; ch++)
2315 {
2316 uc_fraction_t value = get_numeric_value (ch);
2317
2318 for (i = 0; i < nfractions; i++)
2319 if (value.numerator == fractions[i].numerator
2320 && value.denominator == fractions[i].denominator)
2321 break;
2322 assert (i != nfractions);
2323
2324 numeric_table_add (&t, ch, i);
2325 }
2326
2327 numeric_table_finalize (&t);
2328
2329
2330 level1_offset =
2331 5 * sizeof (uint32_t);
2332 level2_offset =
2333 5 * sizeof (uint32_t)
2334 + t.level1_size * sizeof (uint32_t);
2335 level3_offset =
2336 5 * sizeof (uint32_t)
2337 + t.level1_size * sizeof (uint32_t)
2338 + (t.level2_size << t.q) * sizeof (uint32_t);
2339
2340 for (i = 0; i < 5; i++)
2341 fprintf (stream, "#define numeric_header_%d %d\n", i,
2342 ((uint32_t *) t.result)[i]);
2343 fprintf (stream, "static const\n");
2344 fprintf (stream, "struct\n");
2345 fprintf (stream, " {\n");
2346 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2347 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2348 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2349 (1 << t.p) * 8 / 16);
2350 fprintf (stream, " }\n");
2351 fprintf (stream, "u_numeric =\n");
2352 fprintf (stream, "{\n");
2353 fprintf (stream, " {");
2354 if (t.level1_size > 8)
2355 fprintf (stream, "\n ");
2356 for (i = 0; i < t.level1_size; i++)
2357 {
2358 uint32_t offset;
2359 if (i > 0 && (i % 8) == 0)
2360 fprintf (stream, "\n ");
2361 offset = ((uint32_t *) (t.result + level1_offset))[i];
2362 if (offset == 0)
2363 fprintf (stream, " %5d", -1);
2364 else
2365 fprintf (stream, " %5zu",
2366 (offset - level2_offset) / sizeof (uint32_t));
2367 if (i+1 < t.level1_size)
2368 fprintf (stream, ",");
2369 }
2370 if (t.level1_size > 8)
2371 fprintf (stream, "\n ");
2372 fprintf (stream, " },\n");
2373 fprintf (stream, " {");
2374 if (t.level2_size << t.q > 8)
2375 fprintf (stream, "\n ");
2376 for (i = 0; i < t.level2_size << t.q; i++)
2377 {
2378 uint32_t offset;
2379 if (i > 0 && (i % 8) == 0)
2380 fprintf (stream, "\n ");
2381 offset = ((uint32_t *) (t.result + level2_offset))[i];
2382 if (offset == 0)
2383 fprintf (stream, " %5d", -1);
2384 else
2385 fprintf (stream, " %5zu",
2386 (offset - level3_offset) / sizeof (uint8_t));
2387 if (i+1 < t.level2_size << t.q)
2388 fprintf (stream, ",");
2389 }
2390 if (t.level2_size << t.q > 8)
2391 fprintf (stream, "\n ");
2392 fprintf (stream, " },\n");
2393
2394
2395 level3_packed =
2396 (uint16_t *)
2397 calloc ((t.level3_size << t.p) * 8 / 16 + 1, sizeof (uint16_t));
2398 for (i = 0; i < t.level3_size << t.p; i++)
2399 {
2400 unsigned int j = (i * 8) / 16;
2401 unsigned int k = (i * 8) % 16;
2402 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2403 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2404 level3_packed[j] = value & 0xffff;
2405 level3_packed[j+1] = value >> 16;
2406 }
2407 fprintf (stream, " {");
2408 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2409 fprintf (stream, "\n ");
2410 for (i = 0; i < (t.level3_size << t.p) * 8 / 16 + 1; i++)
2411 {
2412 if (i > 0 && (i % 8) == 0)
2413 fprintf (stream, "\n ");
2414 fprintf (stream, " 0x%04x", level3_packed[i]);
2415 if (i+1 < (t.level3_size << t.p) * 8 / 16 + 1)
2416 fprintf (stream, ",");
2417 }
2418 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2419 fprintf (stream, "\n ");
2420 fprintf (stream, " }\n");
2421 free (level3_packed);
2422 fprintf (stream, "};\n");
2423
2424 if (ferror (stream) || fclose (stream))
2425 {
2426 fprintf (stderr, "error writing to '%s'\n", filename);
2427 exit (1);
2428 }
2429 }
2430
2431
2432
2433
2434
2435
2436
2437
2438
/* Pairs of characters that get_mirror_value treats as mirror images of
   each other: looking up either member of a pair yields the other one.
   Entries are grouped below by code point range.  */
static unsigned int mirror_pairs[][2] =
{
  /* U+0000..U+00FF */
  { 0x0028, 0x0029 }, { 0x003C, 0x003E },
  { 0x005B, 0x005D }, { 0x007B, 0x007D },
  { 0x00AB, 0x00BB },
  /* U+2000..U+20FF */
  { 0x2039, 0x203A }, { 0x2045, 0x2046 },
  { 0x207D, 0x207E }, { 0x208D, 0x208E },
  /* U+2200..U+22FF */
  { 0x2208, 0x220B }, { 0x220A, 0x220D },
  { 0x223C, 0x223D }, { 0x2243, 0x22CD },
  { 0x2252, 0x2253 }, { 0x2254, 0x2255 },
  { 0x2264, 0x2265 }, { 0x2266, 0x2267 },
  { 0x226A, 0x226B }, { 0x2276, 0x2277 },
  { 0x2278, 0x2279 }, { 0x227A, 0x227B },
  { 0x227C, 0x227D }, { 0x2282, 0x2283 },
  { 0x2286, 0x2287 }, { 0x228F, 0x2290 },
  { 0x2291, 0x2292 }, { 0x22A2, 0x22A3 },
  { 0x22B0, 0x22B1 }, { 0x22B2, 0x22B3 },
  { 0x22B4, 0x22B5 }, { 0x22B6, 0x22B7 },
  { 0x22C9, 0x22CA }, { 0x22CB, 0x22CC },
  { 0x22D0, 0x22D1 }, { 0x22D6, 0x22D7 },
  { 0x22D8, 0x22D9 }, { 0x22DA, 0x22DB },
  { 0x22DC, 0x22DD }, { 0x22DE, 0x22DF },
  { 0x22F0, 0x22F1 },
  /* U+2300..U+23FF */
  { 0x2308, 0x2309 }, { 0x230A, 0x230B },
  { 0x2329, 0x232A },
  /* U+3000..U+30FF */
  { 0x3008, 0x3009 }, { 0x300A, 0x300B },
  { 0x300C, 0x300D }, { 0x300E, 0x300F },
  { 0x3010, 0x3011 }, { 0x3014, 0x3015 },
  { 0x3016, 0x3017 }, { 0x3018, 0x3019 },
  { 0x301A, 0x301B }
};
2494
2495 static int
2496 get_mirror_value (unsigned int ch)
2497 {
2498 bool mirrored;
2499 unsigned int mirror_char;
2500 unsigned int i;
2501
2502 mirrored = (unicode_attributes[ch].name != NULL
2503 && unicode_attributes[ch].mirrored);
2504 mirror_char = 0xfffd;
2505 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2506 if (ch == mirror_pairs[i][0])
2507 {
2508 mirror_char = mirror_pairs[i][1];
2509 break;
2510 }
2511 else if (ch == mirror_pairs[i][1])
2512 {
2513 mirror_char = mirror_pairs[i][0];
2514 break;
2515 }
2516 if (mirrored)
2517 return (int) mirror_char - (int) ch;
2518 else
2519 {
2520 assert (mirror_char == 0xfffd);
2521 return 0;
2522 }
2523 }
2524
2525
2526 #define TABLE mirror_table
2527 #define ELEMENT int32_t
2528 #define DEFAULT 0
2529 #define xmalloc malloc
2530 #define xrealloc realloc
2531 #include "3level.h"
2532
2533
2534 static void
2535 output_mirror (const char *filename, const char *version)
2536 {
2537 FILE *stream;
2538 unsigned int ch, i;
2539 struct mirror_table t;
2540 unsigned int level1_offset, level2_offset, level3_offset;
2541
2542 stream = fopen (filename, "w");
2543 if (stream == NULL)
2544 {
2545 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2546 exit (1);
2547 }
2548
2549 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2550 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2551 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2552 version);
2553 fprintf (stream, "\n");
2554
2555 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
2556 fprintf (stream, "\n");
2557 output_library_license (stream, false);
2558 fprintf (stream, "\n");
2559
2560 t.p = 7;
2561 t.q = 9;
2562 mirror_table_init (&t);
2563
2564 for (ch = 0; ch < 0x110000; ch++)
2565 {
2566 int value = get_mirror_value (ch);
2567
2568 mirror_table_add (&t, ch, value);
2569 }
2570
2571 mirror_table_finalize (&t);
2572
2573
2574 level1_offset =
2575 5 * sizeof (uint32_t);
2576 level2_offset =
2577 5 * sizeof (uint32_t)
2578 + t.level1_size * sizeof (uint32_t);
2579 level3_offset =
2580 5 * sizeof (uint32_t)
2581 + t.level1_size * sizeof (uint32_t)
2582 + (t.level2_size << t.q) * sizeof (uint32_t);
2583
2584 for (i = 0; i < 5; i++)
2585 fprintf (stream, "#define mirror_header_%d %d\n", i,
2586 ((uint32_t *) t.result)[i]);
2587 fprintf (stream, "static const\n");
2588 fprintf (stream, "struct\n");
2589 fprintf (stream, " {\n");
2590 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2591 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2592 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2593 fprintf (stream, " }\n");
2594 fprintf (stream, "u_mirror =\n");
2595 fprintf (stream, "{\n");
2596 fprintf (stream, " {");
2597 if (t.level1_size > 8)
2598 fprintf (stream, "\n ");
2599 for (i = 0; i < t.level1_size; i++)
2600 {
2601 uint32_t offset;
2602 if (i > 0 && (i % 8) == 0)
2603 fprintf (stream, "\n ");
2604 offset = ((uint32_t *) (t.result + level1_offset))[i];
2605 if (offset == 0)
2606 fprintf (stream, " %5d", -1);
2607 else
2608 fprintf (stream, " %5zu",
2609 (offset - level2_offset) / sizeof (uint32_t));
2610 if (i+1 < t.level1_size)
2611 fprintf (stream, ",");
2612 }
2613 if (t.level1_size > 8)
2614 fprintf (stream, "\n ");
2615 fprintf (stream, " },\n");
2616 fprintf (stream, " {");
2617 if (t.level2_size << t.q > 8)
2618 fprintf (stream, "\n ");
2619 for (i = 0; i < t.level2_size << t.q; i++)
2620 {
2621 uint32_t offset;
2622 if (i > 0 && (i % 8) == 0)
2623 fprintf (stream, "\n ");
2624 offset = ((uint32_t *) (t.result + level2_offset))[i];
2625 if (offset == 0)
2626 fprintf (stream, " %5d", -1);
2627 else
2628 fprintf (stream, " %5zu",
2629 (offset - level3_offset) / sizeof (int32_t));
2630 if (i+1 < t.level2_size << t.q)
2631 fprintf (stream, ",");
2632 }
2633 if (t.level2_size << t.q > 8)
2634 fprintf (stream, "\n ");
2635 fprintf (stream, " },\n");
2636 fprintf (stream, " {");
2637 if (t.level3_size << t.p > 8)
2638 fprintf (stream, "\n ");
2639 for (i = 0; i < t.level3_size << t.p; i++)
2640 {
2641 if (i > 0 && (i % 8) == 0)
2642 fprintf (stream, "\n ");
2643 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2644 if (i+1 < t.level3_size << t.p)
2645 fprintf (stream, ",");
2646 }
2647 if (t.level3_size << t.p > 8)
2648 fprintf (stream, "\n ");
2649 fprintf (stream, " }\n");
2650 fprintf (stream, "};\n");
2651
2652 if (ferror (stream) || fclose (stream))
2653 {
2654 fprintf (stderr, "error writing to '%s'\n", filename);
2655 exit (1);
2656 }
2657 }
2658
2659
2660
2661
2662
/* Returns true if CH is one of the seven code points hard-coded below
   (used by is_property_case_ignorable).  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  switch (ch)
    {
    case 0x002E:
    case 0x2018:
    case 0x2019:
    case 0x2024:
    case 0xFE52:
    case 0xFF07:
    case 0xFF0E:
      return true;
    default:
      return false;
    }
}
2669
/* Returns true if CH is one of the nine code points hard-coded below
   (used by is_property_case_ignorable).  */
static bool
is_WBP_MIDLETTER (unsigned int ch)
{
  switch (ch)
    {
    case 0x003A:
    case 0x00B7:
    case 0x02D7:
    case 0x0387:
    case 0x05F4:
    case 0x2027:
    case 0xFE13:
    case 0xFE55:
    case 0xFF1A:
      return true;
    default:
      return false;
    }
}
2677
2678
2679
2680
2681
2682
/* Bit positions of the binary properties stored in unicode_properties[].
   fill_properties maps each property name found in its input file to one
   of these values and sets the corresponding bit.  The values are
   contiguous from 0 and are used as shift counts on an unsigned long long
   (1ULL << PROP_xxx).  */
enum
{
  /* First group -- presumably the properties of PropList.txt; confirm.  */
  PROP_WHITE_SPACE = 0,
  PROP_BIDI_CONTROL = 1,
  PROP_JOIN_CONTROL = 2,
  PROP_DASH = 3,
  PROP_HYPHEN = 4,
  PROP_QUOTATION_MARK = 5,
  PROP_TERMINAL_PUNCTUATION = 6,
  PROP_OTHER_MATH = 7,
  PROP_HEX_DIGIT = 8,
  PROP_ASCII_HEX_DIGIT = 9,
  PROP_OTHER_ALPHABETIC = 10,
  PROP_IDEOGRAPHIC = 11,
  PROP_DIACRITIC = 12,
  PROP_EXTENDER = 13,
  PROP_OTHER_LOWERCASE = 14,
  PROP_OTHER_UPPERCASE = 15,
  PROP_NONCHARACTER_CODE_POINT = 16,
  PROP_OTHER_GRAPHEME_EXTEND = 17,
  PROP_IDS_BINARY_OPERATOR = 18,
  PROP_IDS_TRINARY_OPERATOR = 19,
  PROP_RADICAL = 20,
  PROP_UNIFIED_IDEOGRAPH = 21,
  PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT = 22,
  PROP_DEPRECATED = 23,
  PROP_SOFT_DOTTED = 24,
  PROP_LOGICAL_ORDER_EXCEPTION = 25,
  PROP_OTHER_ID_START = 26,
  PROP_OTHER_ID_CONTINUE = 27,
  PROP_STERM = 28,
  PROP_VARIATION_SELECTOR = 29,
  PROP_PATTERN_WHITE_SPACE = 30,
  PROP_PATTERN_SYNTAX = 31,
  PROP_PREPENDED_CONCATENATION_MARK = 32,

  /* Second group -- presumably the properties of
     DerivedCoreProperties.txt; confirm.  */
  PROP_MATH = 33,
  PROP_ALPHABETIC = 34,
  PROP_LOWERCASE = 35,
  PROP_UPPERCASE = 36,
  PROP_CASED = 37,
  PROP_CASE_IGNORABLE = 38,
  PROP_CHANGES_WHEN_LOWERCASED = 39,
  PROP_CHANGES_WHEN_UPPERCASED = 40,
  PROP_CHANGES_WHEN_TITLECASED = 41,
  PROP_CHANGES_WHEN_CASEFOLDED = 42,
  PROP_CHANGES_WHEN_CASEMAPPED = 43,
  PROP_ID_START = 44,
  PROP_ID_CONTINUE = 45,
  PROP_XID_START = 46,
  PROP_XID_CONTINUE = 47,
  PROP_DEFAULT_IGNORABLE_CODE_POINT = 48,
  PROP_GRAPHEME_EXTEND = 49,
  PROP_GRAPHEME_BASE = 50,
  PROP_GRAPHEME_LINK = 51
};
/* One bit set per code point, indexed by code point 0..0x10FFFF.
   Bit PROP_xxx of unicode_properties[ch] is set by fill_properties when
   the input file lists CH with that property.  */
unsigned long long unicode_properties[0x10FFFF + 1];
2741
2742 static void
2743 clear_properties (void)
2744 {
2745 unsigned int i;
2746
2747 for (i = 0; i < 0x110000; i++)
2748 unicode_properties[i] = 0;
2749 }
2750
2751
2752
2753 static void
2754 fill_properties (const char *proplist_filename)
2755 {
2756 unsigned int i;
2757 FILE *stream;
2758
2759 stream = fopen (proplist_filename, "r");
2760 if (stream == NULL)
2761 {
2762 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2763 exit (1);
2764 }
2765
2766 for (;;)
2767 {
2768 char buf[200+1];
2769 unsigned int i1, i2;
2770 char padding[200+1];
2771 char propname[200+1];
2772 unsigned int propvalue;
2773
2774 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2775 break;
2776
2777 if (buf[0] == '\0' || buf[0] == '#')
2778 continue;
2779
2780 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2781 {
2782 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2783 {
2784 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
2785 exit (1);
2786 }
2787 i2 = i1;
2788 }
2789 #define PROP(name,value) \
2790 if (strcmp (propname, name) == 0) propvalue = value; else
2791
2792 PROP ("White_Space", PROP_WHITE_SPACE)
2793 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2794 PROP ("Join_Control", PROP_JOIN_CONTROL)
2795 PROP ("Dash", PROP_DASH)
2796 PROP ("Hyphen", PROP_HYPHEN)
2797 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2798 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2799 PROP ("Other_Math", PROP_OTHER_MATH)
2800 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2801 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2802 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2803 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2804 PROP ("Diacritic", PROP_DIACRITIC)
2805 PROP ("Extender", PROP_EXTENDER)
2806 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2807 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2808 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2809 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2810 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2811 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2812 PROP ("Radical", PROP_RADICAL)
2813 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2814 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2815 PROP ("Deprecated", PROP_DEPRECATED)
2816 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2817 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2818 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2819 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2820 PROP ("Sentence_Terminal", PROP_STERM)
2821 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2822 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2823 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2824 PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK)
2825
2826 PROP ("Math", PROP_MATH)
2827 PROP ("Alphabetic", PROP_ALPHABETIC)
2828 PROP ("Lowercase", PROP_LOWERCASE)
2829 PROP ("Uppercase", PROP_UPPERCASE)
2830 PROP ("Cased", PROP_CASED)
2831 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2832 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2833 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2834 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2835 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2836 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2837 PROP ("ID_Start", PROP_ID_START)
2838 PROP ("ID_Continue", PROP_ID_CONTINUE)
2839 PROP ("XID_Start", PROP_XID_START)
2840 PROP ("XID_Continue", PROP_XID_CONTINUE)
2841 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2842 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2843 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2844 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2845 #undef PROP
2846 {
2847 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
2848 proplist_filename);
2849 exit (1);
2850 }
2851 assert (i1 <= i2 && i2 < 0x110000);
2852
2853 for (i = i1; i <= i2; i++)
2854 unicode_properties[i] |= 1ULL << propvalue;
2855 }
2856
2857 if (ferror (stream) || fclose (stream))
2858 {
2859 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2860 exit (1);
2861 }
2862 }
2863
2864
2865
/* Fills ARRAY (one char per code point) from one section of the file
   PROPLIST_FILENAME.

   ARRAY is first cleared.  Lines are then skipped up to and including the
   first one that contains PROPERTY_NAME.  Each following line must start
   with "XXXX..YYYY" or "XXXX" (up to four hexadecimal digits per code
   point); the listed code points get ARRAY[i] = 1.  Reading stops at end
   of input or at a line starting with '*'.

   Exits with status 1 on an open/read failure, if PROPERTY_NAME is not
   found, or on a malformed line.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  FILE *stream;
  char buf[100+1];
  unsigned int cp;

  memset (array, 0, 0x110000);

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the line that introduces the wanted section.  */
  for (;;)
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
      if (strstr (buf, property_name) != NULL)
        break;
    }

  /* Read the entries of the section.  */
  for (;;)
    {
      unsigned int first, last;
      size_t len;
      bool parsed;

      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        break;
      if (buf[0] == '*')
        break;

      len = strlen (buf);
      if (len >= 10 && buf[4] == '.' && buf[5] == '.')
        parsed = (sscanf (buf, "%4X..%4X", &first, &last) >= 2);
      else if (len >= 4)
        {
          parsed = (sscanf (buf, "%4X", &first) >= 1);
          if (parsed)
            last = first;
        }
      else
        parsed = false;

      if (!parsed)
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      assert (first <= last && last < 0x110000);
      for (cp = first; cp <= last; cp++)
        array[cp] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
2938
2939
2940
2941
/* Per code point flag (0 or 1), filled by fill_properties30 from the
   "(Paired Punctuation)" section of its input file.  */
char unicode_pairedpunctuation[0x10FFFF + 1];

/* Per code point flag (0 or 1), filled by fill_properties30 from the
   "(Left of Pair)" section of its input file.  */
char unicode_leftofpair[0x10FFFF + 1];
2946
2947 static void
2948 fill_properties30 (const char *proplist30_filename)
2949 {
2950 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2951 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2952 }
2953
2954
2955
2956
2957 static bool
2958 is_property_white_space (unsigned int ch)
2959 {
2960 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2961 }
2962
2963
2964
2965
2966 static bool
2967 is_property_alphabetic (unsigned int ch)
2968 {
2969 bool result1 =
2970 is_category_L (ch)
2971 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2972
2973
2974 || (ch >= 0x16EE && ch <= 0x16F0)
2975 || (ch >= 0x2160 && ch <= 0x2182)
2976 || (ch >= 0x2185 && ch <= 0x2188)
2977 || (ch >= 0x24D0 && ch <= 0x24E9)
2978 || (ch == 0x3007)
2979 || (ch >= 0x3021 && ch <= 0x3029)
2980 || (ch >= 0x3038 && ch <= 0x303A)
2981 || (ch >= 0xA6E6 && ch <= 0xA6EF)
2982 || (ch >= 0x10140 && ch <= 0x10174)
2983 || (ch == 0x10341)
2984 || (ch == 0x1034A)
2985 || (ch >= 0x103D1 && ch <= 0x103D5)
2986 || (ch >= 0x12400 && ch <= 0x1246E);
2987 bool result2 =
2988 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2989
2990 assert (result1 == result2);
2991 return result1;
2992 }
2993
2994
2995 static bool
2996 is_property_other_alphabetic (unsigned int ch)
2997 {
2998 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2999 }
3000
3001
3002 static bool
3003 is_property_not_a_character (unsigned int ch)
3004 {
3005 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
3006 }
3007
3008
3009
3010 static bool
3011 is_property_default_ignorable_code_point (unsigned int ch)
3012 {
3013 bool result1 =
3014 (is_category_Cf (ch)
3015 && !(ch >= 0xFFF9 && ch <= 0xFFFB)
3016 && !((ch >= 0x0600 && ch <= 0x0605) || ch == 0x06DD || ch == 0x070F)
3017
3018
3019 && !(ch == 0x110BD)
3020 && !(ch == 0x8E2))
3021 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
3022 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
3023 bool result2 =
3024 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
3025
3026 assert (result1 == result2);
3027 return result1;
3028 }
3029
3030
3031 static bool
3032 is_property_other_default_ignorable_code_point (unsigned int ch)
3033 {
3034 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
3035 }
3036
3037
3038 static bool
3039 is_property_deprecated (unsigned int ch)
3040 {
3041 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
3042 }
3043
3044
3045 static bool
3046 is_property_logical_order_exception (unsigned int ch)
3047 {
3048 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
3049 }
3050
3051
3052 static bool
3053 is_property_variation_selector (unsigned int ch)
3054 {
3055 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
3056 }
3057
3058
/* Returns true if CH lies in one of the three hard-coded private use
   ranges: U+E000..U+F8FF, U+F0000..U+FFFFD or U+100000..U+10FFFD.  */
static bool
is_property_private_use (unsigned int ch)
{
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return (ch >= 0x100000 && ch <= 0x10FFFD);
}
3067
3068
/* Returns true if CH has general category Cn and is not a noncharacter
   code point.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
3074
3075
3076
3077 static bool
3078 is_property_uppercase (unsigned int ch)
3079 {
3080 bool result1 =
3081 is_category_Lu (ch)
3082 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
3083 bool result2 =
3084 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
3085
3086 assert (result1 == result2);
3087 return result1;
3088 }
3089
3090
3091 static bool
3092 is_property_other_uppercase (unsigned int ch)
3093 {
3094 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
3095 }
3096
3097
3098
3099 static bool
3100 is_property_lowercase (unsigned int ch)
3101 {
3102 bool result1 =
3103 is_category_Ll (ch)
3104 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
3105 bool result2 =
3106 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
3107
3108 assert (result1 == result2);
3109 return result1;
3110 }
3111
3112
3113 static bool
3114 is_property_other_lowercase (unsigned int ch)
3115 {
3116 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
3117 }
3118
3119
/* Returns true if CH has general category Lt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  const bool titlecase_letter = is_category_Lt (ch);

  return titlecase_letter;
}
3125
3126
3127 static bool
3128 is_property_cased (unsigned int ch)
3129 {
3130 bool result1 = (is_property_lowercase (ch)
3131 || is_property_uppercase (ch)
3132 || is_category_Lt (ch));
3133 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
3134
3135 assert (result1 == result2);
3136 return result1;
3137 }
3138
3139
3140 static bool
3141 is_property_case_ignorable (unsigned int ch)
3142 {
3143 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
3144 || ch == 0x0027
3145 || is_category_Mn (ch)
3146 || is_category_Me (ch)
3147 || is_category_Cf (ch)
3148 || is_category_Lm (ch)
3149 || is_category_Sk (ch));
3150 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
3151
3152 assert (result1 == result2);
3153 return result1;
3154 }
3155
3156
3157 static bool
3158 is_property_changes_when_lowercased (unsigned int ch)
3159 {
3160 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
3161 bool result2 = (unicode_attributes[ch].name != NULL
3162 && unicode_attributes[ch].lower != NONE
3163 && unicode_attributes[ch].lower != ch);
3164
3165 assert (result1 == result2);
3166 return result1;
3167 }
3168
3169
3170 static bool
3171 is_property_changes_when_uppercased (unsigned int ch)
3172 {
3173 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3174 }
3175
3176
3177 static bool
3178 is_property_changes_when_titlecased (unsigned int ch)
3179 {
3180 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3181 }
3182
3183
3184 static bool
3185 is_property_changes_when_casefolded (unsigned int ch)
3186 {
3187 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3188 }
3189
3190
3191 static bool
3192 is_property_changes_when_casemapped (unsigned int ch)
3193 {
3194 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3195 }
3196
3197
3198 static bool
3199 is_property_soft_dotted (unsigned int ch)
3200 {
3201 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3202 }
3203
3204
3205 static bool
3206 is_property_id_start (unsigned int ch)
3207 {
3208 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3209 }
3210
3211
3212 static bool
3213 is_property_other_id_start (unsigned int ch)
3214 {
3215 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3216 }
3217
3218
3219 static bool
3220 is_property_id_continue (unsigned int ch)
3221 {
3222 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3223 }
3224
3225
3226 static bool
3227 is_property_other_id_continue (unsigned int ch)
3228 {
3229 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3230 }
3231
3232
3233 static bool
3234 is_property_xid_start (unsigned int ch)
3235 {
3236 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3237 }
3238
3239
3240 static bool
3241 is_property_xid_continue (unsigned int ch)
3242 {
3243 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3244 }
3245
3246
3247 static bool
3248 is_property_pattern_white_space (unsigned int ch)
3249 {
3250 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3251 }
3252
3253
3254 static bool
3255 is_property_pattern_syntax (unsigned int ch)
3256 {
3257 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3258 }
3259
3260
3261 static bool
3262 is_property_join_control (unsigned int ch)
3263 {
3264 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3265 }
3266
3267
3268 static bool
3269 is_property_grapheme_base (unsigned int ch)
3270 {
3271 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3272 }
3273
3274
3275 static bool
3276 is_property_grapheme_extend (unsigned int ch)
3277 {
3278 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3279 }
3280
3281
3282 static bool
3283 is_property_other_grapheme_extend (unsigned int ch)
3284 {
3285 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3286 }
3287
3288
3289 static bool
3290 is_property_grapheme_link (unsigned int ch)
3291 {
3292 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3293 }
3294
3295
3296 static bool
3297 is_property_bidi_control (unsigned int ch)
3298 {
3299 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3300 }
3301
3302
3303 static bool
3304 is_property_bidi_left_to_right (unsigned int ch)
3305 {
3306 return (get_bidi_category (ch) == UC_BIDI_L);
3307 }
3308
3309
3310 static bool
3311 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3312 {
3313 return (get_bidi_category (ch) == UC_BIDI_R);
3314 }
3315
3316
3317 static bool
3318 is_property_bidi_arabic_right_to_left (unsigned int ch)
3319 {
3320 return (get_bidi_category (ch) == UC_BIDI_AL);
3321 }
3322
3323
3324 static bool
3325 is_property_bidi_european_digit (unsigned int ch)
3326 {
3327 return (get_bidi_category (ch) == UC_BIDI_EN);
3328 }
3329
3330
3331 static bool
3332 is_property_bidi_eur_num_separator (unsigned int ch)
3333 {
3334 return (get_bidi_category (ch) == UC_BIDI_ES);
3335 }
3336
3337
3338 static bool
3339 is_property_bidi_eur_num_terminator (unsigned int ch)
3340 {
3341 return (get_bidi_category (ch) == UC_BIDI_ET);
3342 }
3343
3344
3345 static bool
3346 is_property_bidi_arabic_digit (unsigned int ch)
3347 {
3348 return (get_bidi_category (ch) == UC_BIDI_AN);
3349 }
3350
3351
3352 static bool
3353 is_property_bidi_common_separator (unsigned int ch)
3354 {
3355 return (get_bidi_category (ch) == UC_BIDI_CS);
3356 }
3357
3358
3359 static bool
3360 is_property_bidi_block_separator (unsigned int ch)
3361 {
3362 return (get_bidi_category (ch) == UC_BIDI_B);
3363 }
3364
3365
3366 static bool
3367 is_property_bidi_segment_separator (unsigned int ch)
3368 {
3369 return (get_bidi_category (ch) == UC_BIDI_S);
3370 }
3371
3372
3373 static bool
3374 is_property_bidi_whitespace (unsigned int ch)
3375 {
3376 return (get_bidi_category (ch) == UC_BIDI_WS);
3377 }
3378
3379
3380 static bool
3381 is_property_bidi_non_spacing_mark (unsigned int ch)
3382 {
3383 return (get_bidi_category (ch) == UC_BIDI_NSM);
3384 }
3385
3386
3387 static bool
3388 is_property_bidi_boundary_neutral (unsigned int ch)
3389 {
3390 return (get_bidi_category (ch) == UC_BIDI_BN);
3391 }
3392
3393
3394 static bool
3395 is_property_bidi_pdf (unsigned int ch)
3396 {
3397 return (get_bidi_category (ch) == UC_BIDI_PDF);
3398 }
3399
3400
3401 static bool
3402 is_property_bidi_embedding_or_override (unsigned int ch)
3403 {
3404 int category = get_bidi_category (ch);
3405 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3406 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3407 }
3408
3409
3410 static bool
3411 is_property_bidi_other_neutral (unsigned int ch)
3412 {
3413 return (get_bidi_category (ch) == UC_BIDI_ON);
3414 }
3415
3416
3417 static bool
3418 is_property_hex_digit (unsigned int ch)
3419 {
3420 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3421 }
3422
3423
3424 static bool
3425 is_property_ascii_hex_digit (unsigned int ch)
3426 {
3427 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3428 }
3429
3430
3431
3432 static bool
3433 is_property_ideographic (unsigned int ch)
3434 {
3435 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3436 }
3437
3438
3439 static bool
3440 is_property_unified_ideograph (unsigned int ch)
3441 {
3442 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3443 }
3444
3445
3446 static bool
3447 is_property_radical (unsigned int ch)
3448 {
3449 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3450 }
3451
3452
3453 static bool
3454 is_property_ids_binary_operator (unsigned int ch)
3455 {
3456 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3457 }
3458
3459
3460 static bool
3461 is_property_ids_trinary_operator (unsigned int ch)
3462 {
3463 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3464 }
3465
3466
3467 static bool
3468 is_property_zero_width (unsigned int ch)
3469 {
3470 return is_category_Cf (ch)
3471 || (unicode_attributes[ch].name != NULL
3472 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
3473 }
3474
3475
/* Return true if is_category_Zs accepts CH.  */
static bool
is_property_space (unsigned int ch)
{
  bool in_category = is_category_Zs (ch);

  return in_category;
}
3481
3482
/* Return true if CH is one of a fixed, hard-coded list of code points
   (0x00A0, 0x034F, 0x035C..0x0362, 0x0F08, 0x0F0C, 0x0F12, 0x180E, 0x2007,
   0x2011, 0x202F).  The result does not depend on any table.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0:
    case 0x034F:
    case 0x035C:
    case 0x035D:
    case 0x035E:
    case 0x035F:
    case 0x0360:
    case 0x0361:
    case 0x0362:
    case 0x0F08:
    case 0x0F0C:
    case 0x0F12:
    case 0x180E:
    case 0x2007:
    case 0x2011:
    case 0x202F:
      return true;
    default:
      return false;
    }
}
3505
3506
3507 static bool
3508 is_property_iso_control (unsigned int ch)
3509 {
3510 bool result1 =
3511 (unicode_attributes[ch].name != NULL
3512 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3513 bool result2 =
3514 is_category_Cc (ch);
3515
3516 assert (result1 == result2);
3517 return result1;
3518 }
3519
3520
3521 static bool
3522 is_property_format_control (unsigned int ch)
3523 {
3524 return (is_category_Cf (ch)
3525 && get_bidi_category (ch) == UC_BIDI_BN
3526 && !is_property_join_control (ch)
3527 && ch != 0xFEFF);
3528 }
3529
3530
3531 static bool
3532 is_property_dash (unsigned int ch)
3533 {
3534 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3535 }
3536
3537
3538 static bool
3539 is_property_hyphen (unsigned int ch)
3540 {
3541 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
3542 }
3543
3544
/* Return true if is_category_P accepts CH.  */
static bool
is_property_punctuation (unsigned int ch)
{
  bool in_category = is_category_P (ch);

  return in_category;
}
3550
3551
/* Return true if is_category_Zl accepts CH.  */
static bool
is_property_line_separator (unsigned int ch)
{
  bool in_category = is_category_Zl (ch);

  return in_category;
}
3557
3558
/* Return true if is_category_Zp accepts CH.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  bool in_category = is_category_Zp (ch);

  return in_category;
}
3564
3565
3566 static bool
3567 is_property_quotation_mark (unsigned int ch)
3568 {
3569 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3570 }
3571
3572
3573 static bool
3574 is_property_sentence_terminal (unsigned int ch)
3575 {
3576 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3577 }
3578
3579
3580 static bool
3581 is_property_terminal_punctuation (unsigned int ch)
3582 {
3583 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
3584 }
3585
3586
/* Return true if is_category_Sc accepts CH.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  bool in_category = is_category_Sc (ch);

  return in_category;
}
3592
3593
3594
3595
3596 static bool
3597 is_property_math (unsigned int ch)
3598 {
3599 bool result1 =
3600 is_category_Sm (ch)
3601 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3602 bool result2 =
3603 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3604
3605 assert (result1 == result2);
3606 return result1;
3607 }
3608
3609
3610 static bool
3611 is_property_other_math (unsigned int ch)
3612 {
3613 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3614 }
3615
3616
3617 static bool
3618 is_property_paired_punctuation (unsigned int ch)
3619 {
3620 return unicode_pairedpunctuation[ch];
3621 }
3622
3623
3624 static bool
3625 is_property_left_of_pair (unsigned int ch)
3626 {
3627 return unicode_leftofpair[ch];
3628 }
3629
3630
3631 static bool
3632 is_property_combining (unsigned int ch)
3633 {
3634 return (unicode_attributes[ch].name != NULL
3635 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3636 || is_category_Mc (ch)
3637 || is_category_Me (ch)
3638 || is_category_Mn (ch)));
3639 }
3640
#if 0

/* Disabled: this definition is compiled out by the surrounding #if 0.
   As written, it would return true if CH has a name and get_bidi_category
   reports UC_BIDI_NSM for it.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  return (unicode_attributes[ch].name != NULL
          && get_bidi_category (ch) == UC_BIDI_NSM);
}
#endif
3650
3651
3652 static bool
3653 is_property_composite (unsigned int ch)
3654 {
3655
3656
3657 if (ch >= 0xAC00 && ch <= 0xD7A4)
3658 return true;
3659 if (unicode_attributes[ch].name != NULL
3660 && unicode_attributes[ch].decomposition != NULL)
3661 {
3662
3663
3664 const char *decomp = unicode_attributes[ch].decomposition;
3665 if (decomp[0] == '<')
3666 {
3667 decomp = strchr (decomp, '>') + 1;
3668 if (decomp[0] == ' ')
3669 decomp++;
3670 }
3671 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3672 }
3673 return false;
3674 }
3675
3676
/* Return true if is_category_Nd accepts CH.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  bool in_category = is_category_Nd (ch);

  return in_category;
}
3682
3683
3684 static bool
3685 is_property_numeric (unsigned int ch)
3686 {
3687 return ((get_numeric_value (ch)).denominator > 0)
3688 || (ch == 0x09F8)
3689 || (ch == 0x2183);
3690 }
3691
3692
3693 static bool
3694 is_property_diacritic (unsigned int ch)
3695 {
3696 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3697 }
3698
3699
3700 static bool
3701 is_property_extender (unsigned int ch)
3702 {
3703 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3704 }
3705
3706
3707 static bool
3708 is_property_ignorable_control (unsigned int ch)
3709 {
3710 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3711 || is_category_Cf (ch))
3712 && ch != 0x0000;
3713 }
3714
3715
3716
3717
3718 static void
3719 output_properties (const char *version)
3720 {
3721 #define PROPERTY(P) \
3722 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3723 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3724 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3725 PROPERTY(white_space)
3726 PROPERTY(alphabetic)
3727 PROPERTY(other_alphabetic)
3728 PROPERTY(not_a_character)
3729 PROPERTY(default_ignorable_code_point)
3730 PROPERTY(other_default_ignorable_code_point)
3731 PROPERTY(deprecated)
3732 PROPERTY(logical_order_exception)
3733 PROPERTY(variation_selector)
3734 PROPERTY(private_use)
3735 PROPERTY(unassigned_code_value)
3736 PROPERTY(uppercase)
3737 PROPERTY(other_uppercase)
3738 PROPERTY(lowercase)
3739 PROPERTY(other_lowercase)
3740 PROPERTY(titlecase)
3741 PROPERTY(cased)
3742 PROPERTY(case_ignorable)
3743 PROPERTY(changes_when_lowercased)
3744 PROPERTY(changes_when_uppercased)
3745 PROPERTY(changes_when_titlecased)
3746 PROPERTY(changes_when_casefolded)
3747 PROPERTY(changes_when_casemapped)
3748 PROPERTY(soft_dotted)
3749 PROPERTY(id_start)
3750 PROPERTY(other_id_start)
3751 PROPERTY(id_continue)
3752 PROPERTY(other_id_continue)
3753 PROPERTY(xid_start)
3754 PROPERTY(xid_continue)
3755 PROPERTY(pattern_white_space)
3756 PROPERTY(pattern_syntax)
3757 PROPERTY(join_control)
3758 PROPERTY(grapheme_base)
3759 PROPERTY(grapheme_extend)
3760 PROPERTY(other_grapheme_extend)
3761 PROPERTY(grapheme_link)
3762 PROPERTY(bidi_control)
3763 PROPERTY(bidi_left_to_right)
3764 PROPERTY(bidi_hebrew_right_to_left)
3765 PROPERTY(bidi_arabic_right_to_left)
3766 PROPERTY(bidi_european_digit)
3767 PROPERTY(bidi_eur_num_separator)
3768 PROPERTY(bidi_eur_num_terminator)
3769 PROPERTY(bidi_arabic_digit)
3770 PROPERTY(bidi_common_separator)
3771 PROPERTY(bidi_block_separator)
3772 PROPERTY(bidi_segment_separator)
3773 PROPERTY(bidi_whitespace)
3774 PROPERTY(bidi_non_spacing_mark)
3775 PROPERTY(bidi_boundary_neutral)
3776 PROPERTY(bidi_pdf)
3777 PROPERTY(bidi_embedding_or_override)
3778 PROPERTY(bidi_other_neutral)
3779 PROPERTY(hex_digit)
3780 PROPERTY(ascii_hex_digit)
3781 PROPERTY(ideographic)
3782 PROPERTY(unified_ideograph)
3783 PROPERTY(radical)
3784 PROPERTY(ids_binary_operator)
3785 PROPERTY(ids_trinary_operator)
3786 PROPERTY(zero_width)
3787 PROPERTY(space)
3788 PROPERTY(non_break)
3789 PROPERTY(iso_control)
3790 PROPERTY(format_control)
3791 PROPERTY(dash)
3792 PROPERTY(hyphen)
3793 PROPERTY(punctuation)
3794 PROPERTY(line_separator)
3795 PROPERTY(paragraph_separator)
3796 PROPERTY(quotation_mark)
3797 PROPERTY(sentence_terminal)
3798 PROPERTY(terminal_punctuation)
3799 PROPERTY(currency_symbol)
3800 PROPERTY(math)
3801 PROPERTY(other_math)
3802 PROPERTY(paired_punctuation)
3803 PROPERTY(left_of_pair)
3804 PROPERTY(combining)
3805 PROPERTY(composite)
3806 PROPERTY(decimal_digit)
3807 PROPERTY(numeric)
3808 PROPERTY(diacritic)
3809 PROPERTY(extender)
3810 PROPERTY(ignorable_control)
3811 #undef PROPERTY
3812 }
3813
3814
3815
3816
3817
/* Joining type values, as stored in unicode_joining_type[].
   The part of each enumerator name after "UC_JOINING_TYPE_" is the
   one-letter value that fill_arabicshaping matches against the joining type
   field of its input file.
   NOTE(review): presumably these are the Unicode Joining_Type values
   (U = Non_Joining, T = Transparent, C = Join_Causing, L = Left_Joining,
   R = Right_Joining, D = Dual_Joining) -- confirm against the Unicode
   Character Database documentation.  */
enum
{
  UC_JOINING_TYPE_U,
  UC_JOINING_TYPE_T,
  UC_JOINING_TYPE_C,
  UC_JOINING_TYPE_L,
  UC_JOINING_TYPE_R,
  UC_JOINING_TYPE_D
};

/* Joining type of each code point, indexed by code point.
   fill_arabicshaping initializes every entry to (uint8_t)~(uint8_t)0, which
   the output functions treat as "no joining type recorded".  */
static uint8_t unicode_joining_type[0x110000];
3829
/* Joining group values, as stored in unicode_joining_group[].
   UC_JOINING_GROUP_NONE is the first enumerator (value 0) and is the value
   fill_arabicshaping uses for code points without a joining group.
   The mapping from the names used in the input file to these values is the
   TRY list in fill_arabicshaping; keep the two in sync when adding
   enumerators.  */
enum
{
  UC_JOINING_GROUP_NONE,
  UC_JOINING_GROUP_AIN,
  UC_JOINING_GROUP_ALAPH,
  UC_JOINING_GROUP_ALEF,
  UC_JOINING_GROUP_BEH,
  UC_JOINING_GROUP_BETH,
  UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE,
  UC_JOINING_GROUP_DAL,
  UC_JOINING_GROUP_DALATH_RISH,
  UC_JOINING_GROUP_E,
  UC_JOINING_GROUP_FARSI_YEH,
  UC_JOINING_GROUP_FE,
  UC_JOINING_GROUP_FEH,
  UC_JOINING_GROUP_FINAL_SEMKATH,
  UC_JOINING_GROUP_GAF,
  UC_JOINING_GROUP_GAMAL,
  UC_JOINING_GROUP_HAH,
  UC_JOINING_GROUP_HE,
  UC_JOINING_GROUP_HEH,
  UC_JOINING_GROUP_HEH_GOAL,
  UC_JOINING_GROUP_HETH,
  UC_JOINING_GROUP_KAF,
  UC_JOINING_GROUP_KAPH,
  UC_JOINING_GROUP_KHAPH,
  UC_JOINING_GROUP_KNOTTED_HEH,
  UC_JOINING_GROUP_LAM,
  UC_JOINING_GROUP_LAMADH,
  UC_JOINING_GROUP_MEEM,
  UC_JOINING_GROUP_MIM,
  UC_JOINING_GROUP_NOON,
  UC_JOINING_GROUP_NUN,
  UC_JOINING_GROUP_NYA,
  UC_JOINING_GROUP_PE,
  UC_JOINING_GROUP_QAF,
  UC_JOINING_GROUP_QAPH,
  UC_JOINING_GROUP_REH,
  UC_JOINING_GROUP_REVERSED_PE,
  UC_JOINING_GROUP_SAD,
  UC_JOINING_GROUP_SADHE,
  UC_JOINING_GROUP_SEEN,
  UC_JOINING_GROUP_SEMKATH,
  UC_JOINING_GROUP_SHIN,
  UC_JOINING_GROUP_SWASH_KAF,
  UC_JOINING_GROUP_SYRIAC_WAW,
  UC_JOINING_GROUP_TAH,
  UC_JOINING_GROUP_TAW,
  UC_JOINING_GROUP_TEH_MARBUTA,
  UC_JOINING_GROUP_TEH_MARBUTA_GOAL,
  UC_JOINING_GROUP_TETH,
  UC_JOINING_GROUP_WAW,
  UC_JOINING_GROUP_YEH,
  UC_JOINING_GROUP_YEH_BARREE,
  UC_JOINING_GROUP_YEH_WITH_TAIL,
  UC_JOINING_GROUP_YUDH,
  UC_JOINING_GROUP_YUDH_HE,
  UC_JOINING_GROUP_ZAIN,
  UC_JOINING_GROUP_ZHAIN,
  UC_JOINING_GROUP_ROHINGYA_YEH,
  UC_JOINING_GROUP_STRAIGHT_WAW,
  UC_JOINING_GROUP_MANICHAEAN_ALEPH,
  UC_JOINING_GROUP_MANICHAEAN_BETH,
  UC_JOINING_GROUP_MANICHAEAN_GIMEL,
  UC_JOINING_GROUP_MANICHAEAN_DALETH,
  UC_JOINING_GROUP_MANICHAEAN_WAW,
  UC_JOINING_GROUP_MANICHAEAN_ZAYIN,
  UC_JOINING_GROUP_MANICHAEAN_HETH,
  UC_JOINING_GROUP_MANICHAEAN_TETH,
  UC_JOINING_GROUP_MANICHAEAN_YODH,
  UC_JOINING_GROUP_MANICHAEAN_KAPH,
  UC_JOINING_GROUP_MANICHAEAN_LAMEDH,
  UC_JOINING_GROUP_MANICHAEAN_DHAMEDH,
  UC_JOINING_GROUP_MANICHAEAN_THAMEDH,
  UC_JOINING_GROUP_MANICHAEAN_MEM,
  UC_JOINING_GROUP_MANICHAEAN_NUN,
  UC_JOINING_GROUP_MANICHAEAN_SAMEKH,
  UC_JOINING_GROUP_MANICHAEAN_AYIN,
  UC_JOINING_GROUP_MANICHAEAN_PE,
  UC_JOINING_GROUP_MANICHAEAN_SADHE,
  UC_JOINING_GROUP_MANICHAEAN_QOPH,
  UC_JOINING_GROUP_MANICHAEAN_RESH,
  UC_JOINING_GROUP_MANICHAEAN_TAW,
  UC_JOINING_GROUP_MANICHAEAN_ONE,
  UC_JOINING_GROUP_MANICHAEAN_FIVE,
  UC_JOINING_GROUP_MANICHAEAN_TEN,
  UC_JOINING_GROUP_MANICHAEAN_TWENTY,
  UC_JOINING_GROUP_MANICHAEAN_HUNDRED,
  UC_JOINING_GROUP_AFRICAN_FEH,
  UC_JOINING_GROUP_AFRICAN_QAF,
  UC_JOINING_GROUP_AFRICAN_NOON
};

/* Joining group of each code point, indexed by code point.
   Entries default to UC_JOINING_GROUP_NONE (set by fill_arabicshaping).  */
static uint8_t unicode_joining_group[0x110000];
3924
3925 static void
3926 fill_arabicshaping (const char *arabicshaping_filename)
3927 {
3928 FILE *stream;
3929 unsigned int i;
3930 int lineno;
3931
3932 stream = fopen (arabicshaping_filename, "r");
3933 if (stream == NULL)
3934 {
3935 fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
3936 exit (1);
3937 }
3938
3939 for (i = 0; i < 0x110000; i++)
3940 {
3941 unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
3942 unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
3943 }
3944
3945 lineno = 0;
3946 for (;;)
3947 {
3948 char buf[200+1];
3949 char separator1[200+1];
3950 char schematic_name[200+1];
3951 char separator2[200+1];
3952 char joining_type_name[200+1];
3953 char separator3[200+1];
3954 char joining_group_name[200+1];
3955 int joining_type;
3956 int joining_group;
3957
3958 lineno++;
3959 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3960 break;
3961
3962 if (buf[0] == '\0' || buf[0] == '#')
3963 continue;
3964
3965 if (sscanf (buf, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
3966 &i, separator1, schematic_name, separator2, joining_type_name,
3967 separator3, joining_group_name) != 7)
3968 {
3969 fprintf (stderr, "parse error in '%s':%d\n",
3970 arabicshaping_filename, lineno);
3971 exit (1);
3972 }
3973 assert (i < 0x110000);
3974
3975 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
3976 if (false) {}
3977 TRY(UC_JOINING_TYPE_U)
3978 TRY(UC_JOINING_TYPE_T)
3979 TRY(UC_JOINING_TYPE_C)
3980 TRY(UC_JOINING_TYPE_L)
3981 TRY(UC_JOINING_TYPE_R)
3982 TRY(UC_JOINING_TYPE_D)
3983 #undef TRY
3984 else
3985 {
3986 fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
3987 joining_type_name, arabicshaping_filename, lineno);
3988 exit (1);
3989 }
3990
3991
3992 while (joining_group_name[0] != '\0'
3993 && joining_group_name[strlen (joining_group_name) - 1] == ' ')
3994 joining_group_name[strlen (joining_group_name) - 1] = '\0';
3995
3996 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
3997 if (false) {}
3998 TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
3999 TRY(UC_JOINING_GROUP_AIN, "AIN")
4000 TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
4001 TRY(UC_JOINING_GROUP_ALEF, "ALEF")
4002 TRY(UC_JOINING_GROUP_BEH, "BEH")
4003 TRY(UC_JOINING_GROUP_BETH, "BETH")
4004 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
4005 TRY(UC_JOINING_GROUP_DAL, "DAL")
4006 TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
4007 TRY(UC_JOINING_GROUP_E, "E")
4008 TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
4009 TRY(UC_JOINING_GROUP_FE, "FE")
4010 TRY(UC_JOINING_GROUP_FEH, "FEH")
4011 TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
4012 TRY(UC_JOINING_GROUP_GAF, "GAF")
4013 TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
4014 TRY(UC_JOINING_GROUP_HAH, "HAH")
4015 TRY(UC_JOINING_GROUP_HE, "HE")
4016 TRY(UC_JOINING_GROUP_HEH, "HEH")
4017 TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
4018 TRY(UC_JOINING_GROUP_HETH, "HETH")
4019 TRY(UC_JOINING_GROUP_KAF, "KAF")
4020 TRY(UC_JOINING_GROUP_KAPH, "KAPH")
4021 TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
4022 TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
4023 TRY(UC_JOINING_GROUP_LAM, "LAM")
4024 TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
4025 TRY(UC_JOINING_GROUP_MEEM, "MEEM")
4026 TRY(UC_JOINING_GROUP_MIM, "MIM")
4027 TRY(UC_JOINING_GROUP_NOON, "NOON")
4028 TRY(UC_JOINING_GROUP_NUN, "NUN")
4029 TRY(UC_JOINING_GROUP_NYA, "NYA")
4030 TRY(UC_JOINING_GROUP_PE, "PE")
4031 TRY(UC_JOINING_GROUP_QAF, "QAF")
4032 TRY(UC_JOINING_GROUP_QAPH, "QAPH")
4033 TRY(UC_JOINING_GROUP_REH, "REH")
4034 TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
4035 TRY(UC_JOINING_GROUP_SAD, "SAD")
4036 TRY(UC_JOINING_GROUP_SADHE, "SADHE")
4037 TRY(UC_JOINING_GROUP_SEEN, "SEEN")
4038 TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
4039 TRY(UC_JOINING_GROUP_SHIN, "SHIN")
4040 TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
4041 TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
4042 TRY(UC_JOINING_GROUP_TAH, "TAH")
4043 TRY(UC_JOINING_GROUP_TAW, "TAW")
4044 TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
4045 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
4046 TRY(UC_JOINING_GROUP_TETH, "TETH")
4047 TRY(UC_JOINING_GROUP_WAW, "WAW")
4048 TRY(UC_JOINING_GROUP_YEH, "YEH")
4049 TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
4050 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
4051 TRY(UC_JOINING_GROUP_YUDH, "YUDH")
4052 TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
4053 TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
4054 TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
4055 TRY(UC_JOINING_GROUP_ROHINGYA_YEH, "ROHINGYA YEH")
4056 TRY(UC_JOINING_GROUP_STRAIGHT_WAW, "STRAIGHT WAW")
4057 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH, "MANICHAEAN ALEPH")
4058 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH, "MANICHAEAN BETH")
4059 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL, "MANICHAEAN GIMEL")
4060 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH, "MANICHAEAN DALETH")
4061 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW, "MANICHAEAN WAW")
4062 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN, "MANICHAEAN ZAYIN")
4063 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH, "MANICHAEAN HETH")
4064 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH, "MANICHAEAN TETH")
4065 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH, "MANICHAEAN YODH")
4066 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH, "MANICHAEAN KAPH")
4067 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH, "MANICHAEAN LAMEDH")
4068 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, "MANICHAEAN DHAMEDH")
4069 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH, "MANICHAEAN THAMEDH")
4070 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM, "MANICHAEAN MEM")
4071 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN, "MANICHAEAN NUN")
4072 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH, "MANICHAEAN SAMEKH")
4073 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN, "MANICHAEAN AYIN")
4074 TRY(UC_JOINING_GROUP_MANICHAEAN_PE, "MANICHAEAN PE")
4075 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE, "MANICHAEAN SADHE")
4076 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH, "MANICHAEAN QOPH")
4077 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH, "MANICHAEAN RESH")
4078 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW, "MANICHAEAN TAW")
4079 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE, "MANICHAEAN ONE")
4080 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE, "MANICHAEAN FIVE")
4081 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN, "MANICHAEAN TEN")
4082 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY, "MANICHAEAN TWENTY")
4083 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED, "MANICHAEAN HUNDRED")
4084 TRY(UC_JOINING_GROUP_AFRICAN_FEH, "AFRICAN FEH")
4085 TRY(UC_JOINING_GROUP_AFRICAN_QAF, "AFRICAN QAF")
4086 TRY(UC_JOINING_GROUP_AFRICAN_NOON, "AFRICAN NOON")
4087 #undef TRY
4088 else
4089 {
4090 fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
4091 joining_group_name, arabicshaping_filename, lineno);
4092 exit (1);
4093 }
4094
4095 unicode_joining_type[i] = joining_type;
4096 unicode_joining_group[i] = joining_group;
4097 }
4098
4099 if (ferror (stream) || fclose (stream))
4100 {
4101 fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
4102 exit (1);
4103 }
4104 }
4105
4106
4107 static const char *
4108 joining_type_as_c_identifier (int joining_type)
4109 {
4110 #define TRY(value) if (joining_type == value) return #value;
4111 TRY(UC_JOINING_TYPE_U)
4112 TRY(UC_JOINING_TYPE_T)
4113 TRY(UC_JOINING_TYPE_C)
4114 TRY(UC_JOINING_TYPE_L)
4115 TRY(UC_JOINING_TYPE_R)
4116 TRY(UC_JOINING_TYPE_D)
4117 #undef TRY
4118 abort ();
4119 }
4120
4121 static void
4122 output_joining_type_test (const char *filename, const char *version)
4123 {
4124 FILE *stream;
4125 bool need_comma;
4126 unsigned int ch;
4127
4128 stream = fopen (filename, "w");
4129 if (stream == NULL)
4130 {
4131 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4132 exit (1);
4133 }
4134
4135 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4136 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4137 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4138 version);
4139 fprintf (stream, "\n");
4140
4141 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4142 fprintf (stream, "\n");
4143 output_tests_license (stream);
4144 fprintf (stream, "\n");
4145
4146 need_comma = false;
4147 for (ch = 0; ch < 0x110000; ch++)
4148 {
4149 int value = unicode_joining_type[ch];
4150
4151 if (value != (uint8_t)~(uint8_t)0)
4152 {
4153 if (need_comma)
4154 fprintf (stream, ",\n");
4155 fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
4156 need_comma = true;
4157 }
4158 }
4159 if (need_comma)
4160 fprintf (stream, "\n");
4161
4162 if (ferror (stream) || fclose (stream))
4163 {
4164 fprintf (stderr, "error writing to '%s'\n", filename);
4165 exit (1);
4166 }
4167 }
4168
4169
/* Parameters for including "3level.h" for the joining type table.
   output_joining_type, right below, uses struct joining_type_table and the
   functions joining_type_table_init, joining_type_table_add and
   joining_type_table_finalize; presumably "3level.h" defines them from
   TABLE, with ELEMENT as the element type and DEFAULT as the value of
   entries that were never added -- confirm in 3level.h.
   NOTE(review): xmalloc/xrealloc are mapped to plain malloc/realloc here,
   so allocation failures are presumably not checked.  */
#define TABLE joining_type_table
#define ELEMENT uint8_t
#define DEFAULT (uint8_t)~(uint8_t)0
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
4176
4177 static void
4178 output_joining_type (const char *filename, const char *version)
4179 {
4180 FILE *stream;
4181 unsigned int ch, i;
4182 struct joining_type_table t;
4183 unsigned int level1_offset, level2_offset, level3_offset;
4184 uint8_t *level3_packed;
4185
4186 stream = fopen (filename, "w");
4187 if (stream == NULL)
4188 {
4189 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4190 exit (1);
4191 }
4192
4193 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4194 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4195 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4196 version);
4197 fprintf (stream, "\n");
4198
4199 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4200 fprintf (stream, "\n");
4201 output_library_license (stream, true);
4202 fprintf (stream, "\n");
4203
4204 t.p = 7;
4205 t.q = 9;
4206 joining_type_table_init (&t);
4207
4208 for (ch = 0; ch < 0x110000; ch++)
4209 {
4210 uint8_t value = unicode_joining_type[ch];
4211
4212 assert (value == (uint8_t)~(uint8_t)0 || value <= 0x0f);
4213
4214 joining_type_table_add (&t, ch, value);
4215 }
4216
4217 joining_type_table_finalize (&t);
4218
4219
4220 level1_offset =
4221 5 * sizeof (uint32_t);
4222 level2_offset =
4223 5 * sizeof (uint32_t)
4224 + t.level1_size * sizeof (uint32_t);
4225 level3_offset =
4226 5 * sizeof (uint32_t)
4227 + t.level1_size * sizeof (uint32_t)
4228 + (t.level2_size << t.q) * sizeof (uint32_t);
4229
4230 for (i = 0; i < 5; i++)
4231 fprintf (stream, "#define joining_type_header_%d %d\n", i,
4232 ((uint32_t *) t.result)[i]);
4233 fprintf (stream, "static const\n");
4234 fprintf (stream, "struct\n");
4235 fprintf (stream, " {\n");
4236 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4237 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4238 fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
4239 (1 << t.p) * 4 / 8);
4240 fprintf (stream, " }\n");
4241 fprintf (stream, "u_joining_type =\n");
4242 fprintf (stream, "{\n");
4243 fprintf (stream, " {");
4244 if (t.level1_size > 8)
4245 fprintf (stream, "\n ");
4246 for (i = 0; i < t.level1_size; i++)
4247 {
4248 uint32_t offset;
4249 if (i > 0 && (i % 8) == 0)
4250 fprintf (stream, "\n ");
4251 offset = ((uint32_t *) (t.result + level1_offset))[i];
4252 if (offset == 0)
4253 fprintf (stream, " %5d", -1);
4254 else
4255 fprintf (stream, " %5zu",
4256 (offset - level2_offset) / sizeof (uint32_t));
4257 if (i+1 < t.level1_size)
4258 fprintf (stream, ",");
4259 }
4260 if (t.level1_size > 8)
4261 fprintf (stream, "\n ");
4262 fprintf (stream, " },\n");
4263 fprintf (stream, " {");
4264 if (t.level2_size << t.q > 8)
4265 fprintf (stream, "\n ");
4266 for (i = 0; i < t.level2_size << t.q; i++)
4267 {
4268 uint32_t offset;
4269 if (i > 0 && (i % 8) == 0)
4270 fprintf (stream, "\n ");
4271 offset = ((uint32_t *) (t.result + level2_offset))[i];
4272 if (offset == 0)
4273 fprintf (stream, " %5d", -1);
4274 else
4275 fprintf (stream, " %5zu",
4276 (offset - level3_offset) / sizeof (uint8_t));
4277 if (i+1 < t.level2_size << t.q)
4278 fprintf (stream, ",");
4279 }
4280 if (t.level2_size << t.q > 8)
4281 fprintf (stream, "\n ");
4282 fprintf (stream, " },\n");
4283
4284 level3_packed =
4285 (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
4286 for (i = 0; i < t.level3_size << t.p; i++)
4287 {
4288 unsigned int j = (i * 4) / 8;
4289 unsigned int k = (i * 4) % 8;
4290 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4291 level3_packed[j] |= (value << k);
4292 }
4293 fprintf (stream, " {");
4294 if ((t.level3_size << t.p) * 4 / 8 > 8)
4295 fprintf (stream, "\n ");
4296 for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4297 {
4298 if (i > 0 && (i % 8) == 0)
4299 fprintf (stream, "\n ");
4300 fprintf (stream, " 0x%02x", level3_packed[i]);
4301 if (i+1 < (t.level3_size << t.p) * 4 / 8)
4302 fprintf (stream, ",");
4303 }
4304 if ((t.level3_size << t.p) * 4 / 8 > 8)
4305 fprintf (stream, "\n ");
4306 fprintf (stream, " }\n");
4307 free (level3_packed);
4308 fprintf (stream, "};\n");
4309
4310 if (ferror (stream) || fclose (stream))
4311 {
4312 fprintf (stderr, "error writing to '%s'\n", filename);
4313 exit (1);
4314 }
4315 }
4316
4317
4318 static const char *
4319 joining_group_as_c_identifier (int joining_group)
4320 {
4321 #define TRY(value) if (joining_group == value) return #value;
4322 TRY(UC_JOINING_GROUP_NONE)
4323 TRY(UC_JOINING_GROUP_AIN)
4324 TRY(UC_JOINING_GROUP_ALAPH)
4325 TRY(UC_JOINING_GROUP_ALEF)
4326 TRY(UC_JOINING_GROUP_BEH)
4327 TRY(UC_JOINING_GROUP_BETH)
4328 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4329 TRY(UC_JOINING_GROUP_DAL)
4330 TRY(UC_JOINING_GROUP_DALATH_RISH)
4331 TRY(UC_JOINING_GROUP_E)
4332 TRY(UC_JOINING_GROUP_FARSI_YEH)
4333 TRY(UC_JOINING_GROUP_FE)
4334 TRY(UC_JOINING_GROUP_FEH)
4335 TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4336 TRY(UC_JOINING_GROUP_GAF)
4337 TRY(UC_JOINING_GROUP_GAMAL)
4338 TRY(UC_JOINING_GROUP_HAH)
4339 TRY(UC_JOINING_GROUP_HE)
4340 TRY(UC_JOINING_GROUP_HEH)
4341 TRY(UC_JOINING_GROUP_HEH_GOAL)
4342 TRY(UC_JOINING_GROUP_HETH)
4343 TRY(UC_JOINING_GROUP_KAF)
4344 TRY(UC_JOINING_GROUP_KAPH)
4345 TRY(UC_JOINING_GROUP_KHAPH)
4346 TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4347 TRY(UC_JOINING_GROUP_LAM)
4348 TRY(UC_JOINING_GROUP_LAMADH)
4349 TRY(UC_JOINING_GROUP_MEEM)
4350 TRY(UC_JOINING_GROUP_MIM)
4351 TRY(UC_JOINING_GROUP_NOON)
4352 TRY(UC_JOINING_GROUP_NUN)
4353 TRY(UC_JOINING_GROUP_NYA)
4354 TRY(UC_JOINING_GROUP_PE)
4355 TRY(UC_JOINING_GROUP_QAF)
4356 TRY(UC_JOINING_GROUP_QAPH)
4357 TRY(UC_JOINING_GROUP_REH)
4358 TRY(UC_JOINING_GROUP_REVERSED_PE)
4359 TRY(UC_JOINING_GROUP_SAD)
4360 TRY(UC_JOINING_GROUP_SADHE)
4361 TRY(UC_JOINING_GROUP_SEEN)
4362 TRY(UC_JOINING_GROUP_SEMKATH)
4363 TRY(UC_JOINING_GROUP_SHIN)
4364 TRY(UC_JOINING_GROUP_SWASH_KAF)
4365 TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4366 TRY(UC_JOINING_GROUP_TAH)
4367 TRY(UC_JOINING_GROUP_TAW)
4368 TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4369 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4370 TRY(UC_JOINING_GROUP_TETH)
4371 TRY(UC_JOINING_GROUP_WAW)
4372 TRY(UC_JOINING_GROUP_YEH)
4373 TRY(UC_JOINING_GROUP_YEH_BARREE)
4374 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4375 TRY(UC_JOINING_GROUP_YUDH)
4376 TRY(UC_JOINING_GROUP_YUDH_HE)
4377 TRY(UC_JOINING_GROUP_ZAIN)
4378 TRY(UC_JOINING_GROUP_ZHAIN)
4379 TRY(UC_JOINING_GROUP_ROHINGYA_YEH)
4380 TRY(UC_JOINING_GROUP_STRAIGHT_WAW)
4381 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH)
4382 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH)
4383 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL)
4384 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH)
4385 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW)
4386 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN)
4387 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH)
4388 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH)
4389 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH)
4390 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH)
4391 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH)
4392 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH)
4393 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH)
4394 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM)
4395 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN)
4396 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH)
4397 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN)
4398 TRY(UC_JOINING_GROUP_MANICHAEAN_PE)
4399 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE)
4400 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH)
4401 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH)
4402 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW)
4403 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE)
4404 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE)
4405 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
4406 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
4407 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
4408 TRY(UC_JOINING_GROUP_AFRICAN_FEH)
4409 TRY(UC_JOINING_GROUP_AFRICAN_QAF)
4410 TRY(UC_JOINING_GROUP_AFRICAN_NOON)
4411 #undef TRY
4412 abort ();
4413 }
4414
4415 static void
4416 output_joining_group_test (const char *filename, const char *version)
4417 {
4418 FILE *stream;
4419 bool need_comma;
4420 unsigned int ch;
4421
4422 stream = fopen (filename, "w");
4423 if (stream == NULL)
4424 {
4425 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4426 exit (1);
4427 }
4428
4429 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4430 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4431 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4432 version);
4433 fprintf (stream, "\n");
4434
4435 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4436 fprintf (stream, "\n");
4437 output_tests_license (stream);
4438 fprintf (stream, "\n");
4439
4440 need_comma = false;
4441 for (ch = 0; ch < 0x110000; ch++)
4442 {
4443 int value = unicode_joining_group[ch];
4444
4445 if (value != UC_JOINING_GROUP_NONE)
4446 {
4447 if (need_comma)
4448 fprintf (stream, ",\n");
4449 fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4450 need_comma = true;
4451 }
4452 }
4453 if (need_comma)
4454 fprintf (stream, "\n");
4455
4456 if (ferror (stream) || fclose (stream))
4457 {
4458 fprintf (stderr, "error writing to '%s'\n", filename);
4459 exit (1);
4460 }
4461 }
4462
4463
/* Parameters for the inclusion of "3level.h": the table is named
   joining_group_table, its elements are uint8_t and its default value is
   UC_JOINING_GROUP_NONE.  xmalloc and xrealloc are mapped onto plain malloc
   and realloc.
   NOTE(review): struct joining_group_table and the joining_group_table_init,
   _add and _finalize functions used by output_joining_group presumably come
   from this inclusion -- confirm against 3level.h.  */
#define TABLE joining_group_table
#define ELEMENT uint8_t
#define DEFAULT UC_JOINING_GROUP_NONE
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
4470
4471 static void
4472 output_joining_group (const char *filename, const char *version)
4473 {
4474 FILE *stream;
4475 unsigned int ch, i;
4476 struct joining_group_table t;
4477 unsigned int level1_offset, level2_offset, level3_offset;
4478 uint16_t *level3_packed;
4479
4480 stream = fopen (filename, "w");
4481 if (stream == NULL)
4482 {
4483 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4484 exit (1);
4485 }
4486
4487 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4488 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4489 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4490 version);
4491 fprintf (stream, "\n");
4492
4493 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4494 fprintf (stream, "\n");
4495 output_library_license (stream, false);
4496 fprintf (stream, "\n");
4497
4498 t.p = 7;
4499 t.q = 9;
4500 joining_group_table_init (&t);
4501
4502 for (ch = 0; ch < 0x110000; ch++)
4503 {
4504 uint8_t value = unicode_joining_group[ch];
4505
4506 assert (value <= 0x7f);
4507
4508 joining_group_table_add (&t, ch, value);
4509 }
4510
4511 joining_group_table_finalize (&t);
4512
4513
4514 level1_offset =
4515 5 * sizeof (uint32_t);
4516 level2_offset =
4517 5 * sizeof (uint32_t)
4518 + t.level1_size * sizeof (uint32_t);
4519 level3_offset =
4520 5 * sizeof (uint32_t)
4521 + t.level1_size * sizeof (uint32_t)
4522 + (t.level2_size << t.q) * sizeof (uint32_t);
4523
4524 for (i = 0; i < 5; i++)
4525 fprintf (stream, "#define joining_group_header_%d %d\n", i,
4526 ((uint32_t *) t.result)[i]);
4527 fprintf (stream, "static const\n");
4528 fprintf (stream, "struct\n");
4529 fprintf (stream, " {\n");
4530 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4531 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4532 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
4533 (1 << t.p) * 7 / 16);
4534 fprintf (stream, " }\n");
4535 fprintf (stream, "u_joining_group =\n");
4536 fprintf (stream, "{\n");
4537 fprintf (stream, " {");
4538 if (t.level1_size > 8)
4539 fprintf (stream, "\n ");
4540 for (i = 0; i < t.level1_size; i++)
4541 {
4542 uint32_t offset;
4543 if (i > 0 && (i % 8) == 0)
4544 fprintf (stream, "\n ");
4545 offset = ((uint32_t *) (t.result + level1_offset))[i];
4546 if (offset == 0)
4547 fprintf (stream, " %5d", -1);
4548 else
4549 fprintf (stream, " %5zu",
4550 (offset - level2_offset) / sizeof (uint32_t));
4551 if (i+1 < t.level1_size)
4552 fprintf (stream, ",");
4553 }
4554 if (t.level1_size > 8)
4555 fprintf (stream, "\n ");
4556 fprintf (stream, " },\n");
4557 fprintf (stream, " {");
4558 if (t.level2_size << t.q > 8)
4559 fprintf (stream, "\n ");
4560 for (i = 0; i < t.level2_size << t.q; i++)
4561 {
4562 uint32_t offset;
4563 if (i > 0 && (i % 8) == 0)
4564 fprintf (stream, "\n ");
4565 offset = ((uint32_t *) (t.result + level2_offset))[i];
4566 if (offset == 0)
4567 fprintf (stream, " %5d", -1);
4568 else
4569 fprintf (stream, " %5zu",
4570 (offset - level3_offset) / sizeof (uint8_t));
4571 if (i+1 < t.level2_size << t.q)
4572 fprintf (stream, ",");
4573 }
4574 if (t.level2_size << t.q > 8)
4575 fprintf (stream, "\n ");
4576 fprintf (stream, " },\n");
4577
4578
4579 level3_packed =
4580 (uint16_t *)
4581 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
4582 for (i = 0; i < t.level3_size << t.p; i++)
4583 {
4584 unsigned int j = (i * 7) / 16;
4585 unsigned int k = (i * 7) % 16;
4586 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
4587 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
4588 level3_packed[j] = value & 0xffff;
4589 level3_packed[j+1] = value >> 16;
4590 }
4591 fprintf (stream, " {");
4592 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4593 fprintf (stream, "\n ");
4594 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
4595 {
4596 if (i > 0 && (i % 8) == 0)
4597 fprintf (stream, "\n ");
4598 fprintf (stream, " 0x%04x", level3_packed[i]);
4599 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
4600 fprintf (stream, ",");
4601 }
4602 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4603 fprintf (stream, "\n ");
4604 fprintf (stream, " }\n");
4605 free (level3_packed);
4606 fprintf (stream, "};\n");
4607
4608 if (ferror (stream) || fclose (stream))
4609 {
4610 fprintf (stderr, "error writing to '%s'\n", filename);
4611 exit (1);
4612 }
4613 }
4614
4615
4616
4617
4618
/* Names of the distinct scripts, in order of first appearance in the input
   file; fill_scripts stores a strdup'ed copy of each new name here.  */
static const char *scripts[256];
/* Number of entries of scripts[] that are in use.  */
static unsigned int numscripts;

/* For each code point, the index into scripts[] of its script, or
   (uint8_t)~(uint8_t)0 if fill_scripts found no script for it.  */
static uint8_t unicode_scripts[0x110000];
4623
4624 static void
4625 fill_scripts (const char *scripts_filename)
4626 {
4627 FILE *stream;
4628 unsigned int i;
4629
4630 stream = fopen (scripts_filename, "r");
4631 if (stream == NULL)
4632 {
4633 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
4634 exit (1);
4635 }
4636
4637 numscripts = 0;
4638
4639 for (i = 0; i < 0x110000; i++)
4640 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
4641
4642 for (;;)
4643 {
4644 char buf[200+1];
4645 unsigned int i1, i2;
4646 char padding[200+1];
4647 char scriptname[200+1];
4648 int script;
4649
4650 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4651 break;
4652
4653 if (buf[0] == '\0' || buf[0] == '#')
4654 continue;
4655
4656 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
4657 {
4658 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
4659 {
4660 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
4661 exit (1);
4662 }
4663 i2 = i1;
4664 }
4665 assert (i2 >= i1);
4666 assert (i2 < 0x110000);
4667
4668 for (script = numscripts - 1; script >= 0; script--)
4669 if (strcmp (scripts[script], scriptname) == 0)
4670 break;
4671 if (script < 0)
4672 {
4673 scripts[numscripts] = strdup (scriptname);
4674 script = numscripts;
4675 numscripts++;
4676 assert (numscripts != 256);
4677 }
4678
4679 for (i = i1; i <= i2; i++)
4680 {
4681 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
4682 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
4683 unicode_scripts[i] = script;
4684 }
4685 }
4686
4687 if (ferror (stream) || fclose (stream))
4688 {
4689 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
4690 exit (1);
4691 }
4692 }
4693
4694
/* Parameters for the inclusion of "3level.h": the table is named
   script_table, its elements are uint8_t and its default value is
   (uint8_t)~(uint8_t)0, the same value that marks "no script" in
   unicode_scripts[].  xmalloc and xrealloc are mapped onto plain malloc and
   realloc.
   NOTE(review): struct script_table and the script_table_init, _add and
   _finalize functions used by output_scripts presumably come from this
   inclusion -- confirm against 3level.h.  */
#define TABLE script_table
#define ELEMENT uint8_t
#define DEFAULT (uint8_t)~(uint8_t)0
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
4701
4702 static void
4703 output_scripts (const char *version)
4704 {
4705 const char *filename = "unictype/scripts.h";
4706 FILE *stream;
4707 unsigned int ch, s, i;
4708 struct script_table t;
4709 unsigned int level1_offset, level2_offset, level3_offset;
4710
4711 typedef struct
4712 {
4713 const char *lowercase_name;
4714 }
4715 scriptinfo_t;
4716 scriptinfo_t scriptinfo[256];
4717
4718 stream = fopen (filename, "w");
4719 if (stream == NULL)
4720 {
4721 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4722 exit (1);
4723 }
4724
4725 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4726 fprintf (stream, "/* Unicode scripts. */\n");
4727 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4728 version);
4729 fprintf (stream, "\n");
4730
4731 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4732 fprintf (stream, "\n");
4733 output_library_license (stream, true);
4734 fprintf (stream, "\n");
4735
4736 for (s = 0; s < numscripts; s++)
4737 {
4738 char *lcp = strdup (scripts[s]);
4739 char *cp;
4740
4741 for (cp = lcp; *cp != '\0'; cp++)
4742 if (*cp >= 'A' && *cp <= 'Z')
4743 *cp += 'a' - 'A';
4744
4745 scriptinfo[s].lowercase_name = lcp;
4746 }
4747
4748 for (s = 0; s < numscripts; s++)
4749 {
4750 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
4751 scriptinfo[s].lowercase_name);
4752 fprintf (stream, "{\n");
4753 i = 0;
4754 for (ch = 0; ch < 0x110000; ch++)
4755 if (unicode_scripts[ch] == s)
4756 {
4757 unsigned int start;
4758 unsigned int end;
4759
4760 start = ch;
4761 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
4762 ch++;
4763 end = ch;
4764
4765 if (i > 0)
4766 fprintf (stream, ",\n");
4767 if (start == end)
4768 fprintf (stream, " { 0x%04X, 1, 1 }", start);
4769 else
4770 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
4771 start, end);
4772 i++;
4773 }
4774 fprintf (stream, "\n");
4775 fprintf (stream, "};\n");
4776 }
4777
4778 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
4779 fprintf (stream, "{\n");
4780 for (s = 0; s < numscripts; s++)
4781 {
4782 fprintf (stream, " {\n");
4783 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
4784 scriptinfo[s].lowercase_name);
4785 fprintf (stream, " script_%s_intervals,\n",
4786 scriptinfo[s].lowercase_name);
4787 fprintf (stream, " \"%s\"\n", scripts[s]);
4788 fprintf (stream, " }");
4789 if (s+1 < numscripts)
4790 fprintf (stream, ",");
4791 fprintf (stream, "\n");
4792 }
4793 fprintf (stream, "};\n");
4794
4795 t.p = 7;
4796 t.q = 9;
4797 script_table_init (&t);
4798
4799 for (ch = 0; ch < 0x110000; ch++)
4800 {
4801 unsigned int s = unicode_scripts[ch];
4802 if (s != (uint8_t)~(uint8_t)0)
4803 script_table_add (&t, ch, s);
4804 }
4805
4806 script_table_finalize (&t);
4807
4808
4809 level1_offset =
4810 5 * sizeof (uint32_t);
4811 level2_offset =
4812 5 * sizeof (uint32_t)
4813 + t.level1_size * sizeof (uint32_t);
4814 level3_offset =
4815 5 * sizeof (uint32_t)
4816 + t.level1_size * sizeof (uint32_t)
4817 + (t.level2_size << t.q) * sizeof (uint32_t);
4818
4819 for (i = 0; i < 5; i++)
4820 fprintf (stream, "#define script_header_%d %d\n", i,
4821 ((uint32_t *) t.result)[i]);
4822 fprintf (stream, "static const\n");
4823 fprintf (stream, "struct\n");
4824 fprintf (stream, " {\n");
4825 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4826 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4827 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
4828 fprintf (stream, " }\n");
4829 fprintf (stream, "u_script =\n");
4830 fprintf (stream, "{\n");
4831 fprintf (stream, " {");
4832 if (t.level1_size > 8)
4833 fprintf (stream, "\n ");
4834 for (i = 0; i < t.level1_size; i++)
4835 {
4836 uint32_t offset;
4837 if (i > 0 && (i % 8) == 0)
4838 fprintf (stream, "\n ");
4839 offset = ((uint32_t *) (t.result + level1_offset))[i];
4840 if (offset == 0)
4841 fprintf (stream, " %5d", -1);
4842 else
4843 fprintf (stream, " %5zu",
4844 (offset - level2_offset) / sizeof (uint32_t));
4845 if (i+1 < t.level1_size)
4846 fprintf (stream, ",");
4847 }
4848 if (t.level1_size > 8)
4849 fprintf (stream, "\n ");
4850 fprintf (stream, " },\n");
4851 fprintf (stream, " {");
4852 if (t.level2_size << t.q > 8)
4853 fprintf (stream, "\n ");
4854 for (i = 0; i < t.level2_size << t.q; i++)
4855 {
4856 uint32_t offset;
4857 if (i > 0 && (i % 8) == 0)
4858 fprintf (stream, "\n ");
4859 offset = ((uint32_t *) (t.result + level2_offset))[i];
4860 if (offset == 0)
4861 fprintf (stream, " %5d", -1);
4862 else
4863 fprintf (stream, " %5zu",
4864 (offset - level3_offset) / sizeof (uint8_t));
4865 if (i+1 < t.level2_size << t.q)
4866 fprintf (stream, ",");
4867 }
4868 if (t.level2_size << t.q > 8)
4869 fprintf (stream, "\n ");
4870 fprintf (stream, " },\n");
4871 fprintf (stream, " {");
4872 if (t.level3_size << t.p > 8)
4873 fprintf (stream, "\n ");
4874 for (i = 0; i < t.level3_size << t.p; i++)
4875 {
4876 if (i > 0 && (i % 8) == 0)
4877 fprintf (stream, "\n ");
4878 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
4879 if (i+1 < t.level3_size << t.p)
4880 fprintf (stream, ",");
4881 }
4882 if (t.level3_size << t.p > 8)
4883 fprintf (stream, "\n ");
4884 fprintf (stream, " }\n");
4885 fprintf (stream, "};\n");
4886
4887 if (ferror (stream) || fclose (stream))
4888 {
4889 fprintf (stderr, "error writing to '%s'\n", filename);
4890 exit (1);
4891 }
4892 }
4893
4894 static void
4895 output_scripts_byname (const char *version)
4896 {
4897 const char *filename = "unictype/scripts_byname.gperf";
4898 FILE *stream;
4899 unsigned int s;
4900
4901 stream = fopen (filename, "w");
4902 if (stream == NULL)
4903 {
4904 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4905 exit (1);
4906 }
4907
4908 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4909 fprintf (stream, "/* Unicode scripts. */\n");
4910 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4911 version);
4912 fprintf (stream, "\n");
4913
4914 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4915 fprintf (stream, "\n");
4916 output_library_license (stream, true);
4917 fprintf (stream, "\n");
4918
4919 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
4920 fprintf (stream, "%%struct-type\n");
4921 fprintf (stream, "%%language=ANSI-C\n");
4922 fprintf (stream, "%%define hash-function-name scripts_hash\n");
4923 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
4924 fprintf (stream, "%%readonly-tables\n");
4925 fprintf (stream, "%%global-table\n");
4926 fprintf (stream, "%%define word-array-name script_names\n");
4927 fprintf (stream, "%%pic\n");
4928 fprintf (stream, "%%define string-pool-name script_stringpool\n");
4929 fprintf (stream, "%%%%\n");
4930 for (s = 0; s < numscripts; s++)
4931 fprintf (stream, "%s, %u\n", scripts[s], s);
4932
4933 if (ferror (stream) || fclose (stream))
4934 {
4935 fprintf (stderr, "error writing to '%s'\n", filename);
4936 exit (1);
4937 }
4938 }
4939
4940
4941
4942
4943
/* One named block: the code point range START..END as parsed by fill_blocks
   (END itself counts as part of the block in block_first_index), and NAME,
   a strdup'ed copy of the block name.  */
typedef struct { unsigned int start; unsigned int end; const char *name; }
  block_t;
/* The blocks read by fill_blocks, in file order; fill_blocks asserts that
   each block starts after the end of the previous one.  */
static block_t blocks[384];
/* Number of entries of blocks[] that are in use.  */
static unsigned int numblocks;
4948
4949 static void
4950 fill_blocks (const char *blocks_filename)
4951 {
4952 FILE *stream;
4953
4954 stream = fopen (blocks_filename, "r");
4955 if (stream == NULL)
4956 {
4957 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
4958 exit (1);
4959 }
4960
4961 for (;;)
4962 {
4963 char buf[200+1];
4964 unsigned int i1, i2;
4965 char padding[200+1];
4966 char blockname[200+1];
4967
4968 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4969 break;
4970
4971 if (buf[0] == '\0' || buf[0] == '#')
4972 continue;
4973
4974 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
4975 {
4976 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
4977 exit (1);
4978 }
4979 blocks[numblocks].start = i1;
4980 blocks[numblocks].end = i2;
4981 blocks[numblocks].name = strdup (blockname);
4982
4983 assert (numblocks == 0 || blocks[numblocks-1].end < blocks[numblocks].start);
4984 numblocks++;
4985 assert (numblocks != SIZEOF (blocks));
4986 }
4987
4988 if (ferror (stream) || fclose (stream))
4989 {
4990 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
4991 exit (1);
4992 }
4993 }
4994
4995
4996 static unsigned int
4997 block_first_index (unsigned int ch)
4998 {
4999
5000 unsigned int lo = 0;
5001 unsigned int hi = numblocks;
5002
5003
5004
5005 while (lo < hi)
5006 {
5007 unsigned int mid = (lo + hi) / 2;
5008 if (blocks[mid].end < ch)
5009 lo = mid + 1;
5010 else
5011 hi = mid;
5012 }
5013 return hi;
5014 }
5015
5016
5017
5018 static unsigned int
5019 block_last_index (unsigned int ch)
5020 {
5021
5022 unsigned int lo = 0;
5023 unsigned int hi = numblocks;
5024
5025
5026
5027 while (lo < hi)
5028 {
5029 unsigned int mid = (lo + hi) / 2;
5030 if (blocks[mid].start <= ch)
5031 lo = mid + 1;
5032 else
5033 hi = mid;
5034 }
5035 return hi;
5036 }
5037
5038 static void
5039 output_blocks (const char *version)
5040 {
5041 const char *filename = "unictype/blocks.h";
5042 const unsigned int shift = 8;
5043 const unsigned int threshold = 0x28000;
5044 FILE *stream;
5045 unsigned int i;
5046 unsigned int i1;
5047
5048 stream = fopen (filename, "w");
5049 if (stream == NULL)
5050 {
5051 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5052 exit (1);
5053 }
5054
5055 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5056 fprintf (stream, "/* Unicode blocks. */\n");
5057 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5058 version);
5059 fprintf (stream, "\n");
5060
5061 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
5062 fprintf (stream, "\n");
5063 output_library_license (stream, false);
5064 fprintf (stream, "\n");
5065
5066 fprintf (stream, "static const uc_block_t blocks[] =\n");
5067 fprintf (stream, "{\n");
5068 for (i = 0; i < numblocks; i++)
5069 {
5070 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
5071 blocks[i].end, blocks[i].name);
5072 if (i+1 < numblocks)
5073 fprintf (stream, ",");
5074 fprintf (stream, "\n");
5075 }
5076 fprintf (stream, "};\n");
5077 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
5078 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
5079 fprintf (stream, "static const uint16_t blocks_level1[%d * 2] =\n",
5080 threshold >> shift);
5081 fprintf (stream, "{\n");
5082 for (i1 = 0; i1 < (threshold >> shift); i1++)
5083 {
5084 unsigned int first_index = block_first_index (i1 << shift);
5085 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
5086 fprintf (stream, " %3d, %3d", first_index, last_index);
5087 if (i1+1 < (threshold >> shift))
5088 fprintf (stream, ",");
5089 fprintf (stream, "\n");
5090 }
5091 fprintf (stream, "};\n");
5092 fprintf (stream, "#define blocks_upper_first_index %d\n",
5093 block_first_index (threshold));
5094 fprintf (stream, "#define blocks_upper_last_index %d\n",
5095 block_last_index (0x10FFFF));
5096
5097 if (ferror (stream) || fclose (stream))
5098 {
5099 fprintf (stderr, "error writing to '%s'\n", filename);
5100 exit (1);
5101 }
5102 }
5103
5104
5105
5106
5107
/* Identifier syntax categories, as returned by c_ident_category and
   java_ident_category.  output_ident_category asserts that a category is
   <= 0x03 and stores each one in 2 bits, so the values must stay within
   0..3.  */
enum
{
  /* Returned e.g. for ASCII letters and '_' by c_ident_category.  */
  UC_IDENTIFIER_START,
  /* Returned e.g. for the ASCII digits by c_ident_category.  */
  UC_IDENTIFIER_VALID,
  /* Returned for every character that matches none of the other cases;
     also the default value of the identsyntax table.  */
  UC_IDENTIFIER_INVALID,
  /* Returned by java_ident_category for some control characters and for
     characters of category Cf.  */
  UC_IDENTIFIER_IGNORABLE
};
5115
5116
/* Return true if CH is one of the six characters space, horizontal tab,
   newline, carriage return, vertical tab or form feed, false otherwise.  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\v':
    case '\f':
      return true;
    default:
      return false;
    }
}
5126
5127
5128 static int
5129 c_ident_category (unsigned int ch)
5130 {
5131
5132 if (ch >= '0' && ch <= '9')
5133 return UC_IDENTIFIER_VALID;
5134 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
5135 return UC_IDENTIFIER_START;
5136
5137 if (0
5138
5139 || (ch == 0x00AA)
5140 || (ch == 0x00BA)
5141 || (ch >= 0x00C0 && ch <= 0x00D6)
5142 || (ch >= 0x00D8 && ch <= 0x00F6)
5143 || (ch >= 0x00F8 && ch <= 0x01F5)
5144 || (ch >= 0x01FA && ch <= 0x0217)
5145 || (ch >= 0x0250 && ch <= 0x02A8)
5146 || (ch >= 0x1E00 && ch <= 0x1E9B)
5147 || (ch >= 0x1EA0 && ch <= 0x1EF9)
5148 || (ch == 0x207F)
5149
5150 || (ch == 0x0386)
5151 || (ch >= 0x0388 && ch <= 0x038A)
5152 || (ch == 0x038C)
5153 || (ch >= 0x038E && ch <= 0x03A1)
5154 || (ch >= 0x03A3 && ch <= 0x03CE)
5155 || (ch >= 0x03D0 && ch <= 0x03D6)
5156 || (ch == 0x03DA)
5157 || (ch == 0x03DC)
5158 || (ch == 0x03DE)
5159 || (ch == 0x03E0)
5160 || (ch >= 0x03E2 && ch <= 0x03F3)
5161 || (ch >= 0x1F00 && ch <= 0x1F15)
5162 || (ch >= 0x1F18 && ch <= 0x1F1D)
5163 || (ch >= 0x1F20 && ch <= 0x1F45)
5164 || (ch >= 0x1F48 && ch <= 0x1F4D)
5165 || (ch >= 0x1F50 && ch <= 0x1F57)
5166 || (ch == 0x1F59)
5167 || (ch == 0x1F5B)
5168 || (ch == 0x1F5D)
5169 || (ch >= 0x1F5F && ch <= 0x1F7D)
5170 || (ch >= 0x1F80 && ch <= 0x1FB4)
5171 || (ch >= 0x1FB6 && ch <= 0x1FBC)
5172 || (ch >= 0x1FC2 && ch <= 0x1FC4)
5173 || (ch >= 0x1FC6 && ch <= 0x1FCC)
5174 || (ch >= 0x1FD0 && ch <= 0x1FD3)
5175 || (ch >= 0x1FD6 && ch <= 0x1FDB)
5176 || (ch >= 0x1FE0 && ch <= 0x1FEC)
5177 || (ch >= 0x1FF2 && ch <= 0x1FF4)
5178 || (ch >= 0x1FF6 && ch <= 0x1FFC)
5179
5180 || (ch >= 0x0401 && ch <= 0x040C)
5181 || (ch >= 0x040E && ch <= 0x044F)
5182 || (ch >= 0x0451 && ch <= 0x045C)
5183 || (ch >= 0x045E && ch <= 0x0481)
5184 || (ch >= 0x0490 && ch <= 0x04C4)
5185 || (ch >= 0x04C7 && ch <= 0x04C8)
5186 || (ch >= 0x04CB && ch <= 0x04CC)
5187 || (ch >= 0x04D0 && ch <= 0x04EB)
5188 || (ch >= 0x04EE && ch <= 0x04F5)
5189 || (ch >= 0x04F8 && ch <= 0x04F9)
5190
5191 || (ch >= 0x0531 && ch <= 0x0556)
5192 || (ch >= 0x0561 && ch <= 0x0587)
5193
5194 || (ch >= 0x05B0 && ch <= 0x05B9)
5195 || (ch >= 0x05BB && ch <= 0x05BD)
5196 || (ch == 0x05BF)
5197 || (ch >= 0x05C1 && ch <= 0x05C2)
5198 || (ch >= 0x05D0 && ch <= 0x05EA)
5199 || (ch >= 0x05F0 && ch <= 0x05F2)
5200
5201 || (ch >= 0x0621 && ch <= 0x063A)
5202 || (ch >= 0x0640 && ch <= 0x0652)
5203 || (ch >= 0x0670 && ch <= 0x06B7)
5204 || (ch >= 0x06BA && ch <= 0x06BE)
5205 || (ch >= 0x06C0 && ch <= 0x06CE)
5206 || (ch >= 0x06D0 && ch <= 0x06DC)
5207 || (ch >= 0x06E5 && ch <= 0x06E8)
5208 || (ch >= 0x06EA && ch <= 0x06ED)
5209
5210 || (ch >= 0x0901 && ch <= 0x0903)
5211 || (ch >= 0x0905 && ch <= 0x0939)
5212 || (ch >= 0x093E && ch <= 0x094D)
5213 || (ch >= 0x0950 && ch <= 0x0952)
5214 || (ch >= 0x0958 && ch <= 0x0963)
5215
5216 || (ch >= 0x0981 && ch <= 0x0983)
5217 || (ch >= 0x0985 && ch <= 0x098C)
5218 || (ch >= 0x098F && ch <= 0x0990)
5219 || (ch >= 0x0993 && ch <= 0x09A8)
5220 || (ch >= 0x09AA && ch <= 0x09B0)
5221 || (ch == 0x09B2)
5222 || (ch >= 0x09B6 && ch <= 0x09B9)
5223 || (ch >= 0x09BE && ch <= 0x09C4)
5224 || (ch >= 0x09C7 && ch <= 0x09C8)
5225 || (ch >= 0x09CB && ch <= 0x09CD)
5226 || (ch >= 0x09DC && ch <= 0x09DD)
5227 || (ch >= 0x09DF && ch <= 0x09E3)
5228 || (ch >= 0x09F0 && ch <= 0x09F1)
5229
5230 || (ch == 0x0A02)
5231 || (ch >= 0x0A05 && ch <= 0x0A0A)
5232 || (ch >= 0x0A0F && ch <= 0x0A10)
5233 || (ch >= 0x0A13 && ch <= 0x0A28)
5234 || (ch >= 0x0A2A && ch <= 0x0A30)
5235 || (ch >= 0x0A32 && ch <= 0x0A33)
5236 || (ch >= 0x0A35 && ch <= 0x0A36)
5237 || (ch >= 0x0A38 && ch <= 0x0A39)
5238 || (ch >= 0x0A3E && ch <= 0x0A42)
5239 || (ch >= 0x0A47 && ch <= 0x0A48)
5240 || (ch >= 0x0A4B && ch <= 0x0A4D)
5241 || (ch >= 0x0A59 && ch <= 0x0A5C)
5242 || (ch == 0x0A5E)
5243 || (ch == 0x0A74)
5244
5245 || (ch >= 0x0A81 && ch <= 0x0A83)
5246 || (ch >= 0x0A85 && ch <= 0x0A8B)
5247 || (ch == 0x0A8D)
5248 || (ch >= 0x0A8F && ch <= 0x0A91)
5249 || (ch >= 0x0A93 && ch <= 0x0AA8)
5250 || (ch >= 0x0AAA && ch <= 0x0AB0)
5251 || (ch >= 0x0AB2 && ch <= 0x0AB3)
5252 || (ch >= 0x0AB5 && ch <= 0x0AB9)
5253 || (ch >= 0x0ABD && ch <= 0x0AC5)
5254 || (ch >= 0x0AC7 && ch <= 0x0AC9)
5255 || (ch >= 0x0ACB && ch <= 0x0ACD)
5256 || (ch == 0x0AD0)
5257 || (ch == 0x0AE0)
5258
5259 || (ch >= 0x0B01 && ch <= 0x0B03)
5260 || (ch >= 0x0B05 && ch <= 0x0B0C)
5261 || (ch >= 0x0B0F && ch <= 0x0B10)
5262 || (ch >= 0x0B13 && ch <= 0x0B28)
5263 || (ch >= 0x0B2A && ch <= 0x0B30)
5264 || (ch >= 0x0B32 && ch <= 0x0B33)
5265 || (ch >= 0x0B36 && ch <= 0x0B39)
5266 || (ch >= 0x0B3E && ch <= 0x0B43)
5267 || (ch >= 0x0B47 && ch <= 0x0B48)
5268 || (ch >= 0x0B4B && ch <= 0x0B4D)
5269 || (ch >= 0x0B5C && ch <= 0x0B5D)
5270 || (ch >= 0x0B5F && ch <= 0x0B61)
5271
5272 || (ch >= 0x0B82 && ch <= 0x0B83)
5273 || (ch >= 0x0B85 && ch <= 0x0B8A)
5274 || (ch >= 0x0B8E && ch <= 0x0B90)
5275 || (ch >= 0x0B92 && ch <= 0x0B95)
5276 || (ch >= 0x0B99 && ch <= 0x0B9A)
5277 || (ch == 0x0B9C)
5278 || (ch >= 0x0B9E && ch <= 0x0B9F)
5279 || (ch >= 0x0BA3 && ch <= 0x0BA4)
5280 || (ch >= 0x0BA8 && ch <= 0x0BAA)
5281 || (ch >= 0x0BAE && ch <= 0x0BB5)
5282 || (ch >= 0x0BB7 && ch <= 0x0BB9)
5283 || (ch >= 0x0BBE && ch <= 0x0BC2)
5284 || (ch >= 0x0BC6 && ch <= 0x0BC8)
5285 || (ch >= 0x0BCA && ch <= 0x0BCD)
5286
5287 || (ch >= 0x0C01 && ch <= 0x0C03)
5288 || (ch >= 0x0C05 && ch <= 0x0C0C)
5289 || (ch >= 0x0C0E && ch <= 0x0C10)
5290 || (ch >= 0x0C12 && ch <= 0x0C28)
5291 || (ch >= 0x0C2A && ch <= 0x0C33)
5292 || (ch >= 0x0C35 && ch <= 0x0C39)
5293 || (ch >= 0x0C3E && ch <= 0x0C44)
5294 || (ch >= 0x0C46 && ch <= 0x0C48)
5295 || (ch >= 0x0C4A && ch <= 0x0C4D)
5296 || (ch >= 0x0C60 && ch <= 0x0C61)
5297
5298 || (ch >= 0x0C82 && ch <= 0x0C83)
5299 || (ch >= 0x0C85 && ch <= 0x0C8C)
5300 || (ch >= 0x0C8E && ch <= 0x0C90)
5301 || (ch >= 0x0C92 && ch <= 0x0CA8)
5302 || (ch >= 0x0CAA && ch <= 0x0CB3)
5303 || (ch >= 0x0CB5 && ch <= 0x0CB9)
5304 || (ch >= 0x0CBE && ch <= 0x0CC4)
5305 || (ch >= 0x0CC6 && ch <= 0x0CC8)
5306 || (ch >= 0x0CCA && ch <= 0x0CCD)
5307 || (ch == 0x0CDE)
5308 || (ch >= 0x0CE0 && ch <= 0x0CE1)
5309
5310 || (ch >= 0x0D02 && ch <= 0x0D03)
5311 || (ch >= 0x0D05 && ch <= 0x0D0C)
5312 || (ch >= 0x0D0E && ch <= 0x0D10)
5313 || (ch >= 0x0D12 && ch <= 0x0D28)
5314 || (ch >= 0x0D2A && ch <= 0x0D39)
5315 || (ch >= 0x0D3E && ch <= 0x0D43)
5316 || (ch >= 0x0D46 && ch <= 0x0D48)
5317 || (ch >= 0x0D4A && ch <= 0x0D4D)
5318 || (ch >= 0x0D60 && ch <= 0x0D61)
5319
5320 || (ch >= 0x0E01 && ch <= 0x0E3A)
5321 || (ch >= 0x0E40 && ch <= 0x0E5B)
5322
5323 || (ch >= 0x0E81 && ch <= 0x0E82)
5324 || (ch == 0x0E84)
5325 || (ch >= 0x0E87 && ch <= 0x0E88)
5326 || (ch == 0x0E8A)
5327 || (ch == 0x0E8D)
5328 || (ch >= 0x0E94 && ch <= 0x0E97)
5329 || (ch >= 0x0E99 && ch <= 0x0E9F)
5330 || (ch >= 0x0EA1 && ch <= 0x0EA3)
5331 || (ch == 0x0EA5)
5332 || (ch == 0x0EA7)
5333 || (ch >= 0x0EAA && ch <= 0x0EAB)
5334 || (ch >= 0x0EAD && ch <= 0x0EAE)
5335 || (ch >= 0x0EB0 && ch <= 0x0EB9)
5336 || (ch >= 0x0EBB && ch <= 0x0EBD)
5337 || (ch >= 0x0EC0 && ch <= 0x0EC4)
5338 || (ch == 0x0EC6)
5339 || (ch >= 0x0EC8 && ch <= 0x0ECD)
5340 || (ch >= 0x0EDC && ch <= 0x0EDD)
5341
5342 || (ch == 0x0F00)
5343 || (ch >= 0x0F18 && ch <= 0x0F19)
5344 || (ch == 0x0F35)
5345 || (ch == 0x0F37)
5346 || (ch == 0x0F39)
5347 || (ch >= 0x0F3E && ch <= 0x0F47)
5348 || (ch >= 0x0F49 && ch <= 0x0F69)
5349 || (ch >= 0x0F71 && ch <= 0x0F84)
5350 || (ch >= 0x0F86 && ch <= 0x0F8B)
5351 || (ch >= 0x0F90 && ch <= 0x0F95)
5352 || (ch == 0x0F97)
5353 || (ch >= 0x0F99 && ch <= 0x0FAD)
5354 || (ch >= 0x0FB1 && ch <= 0x0FB7)
5355 || (ch == 0x0FB9)
5356
5357 || (ch >= 0x10A0 && ch <= 0x10C5)
5358 || (ch >= 0x10D0 && ch <= 0x10F6)
5359
5360 || (ch >= 0x3041 && ch <= 0x3093)
5361 || (ch >= 0x309B && ch <= 0x309C)
5362
5363 || (ch >= 0x30A1 && ch <= 0x30F6)
5364 || (ch >= 0x30FB && ch <= 0x30FC)
5365
5366 || (ch >= 0x3105 && ch <= 0x312C)
5367
5368 || (ch >= 0x4E00 && ch <= 0x9FA5)
5369
5370 || (ch >= 0xAC00 && ch <= 0xD7A3)
5371
5372 || (ch >= 0x0660 && ch <= 0x0669)
5373 || (ch >= 0x06F0 && ch <= 0x06F9)
5374 || (ch >= 0x0966 && ch <= 0x096F)
5375 || (ch >= 0x09E6 && ch <= 0x09EF)
5376 || (ch >= 0x0A66 && ch <= 0x0A6F)
5377 || (ch >= 0x0AE6 && ch <= 0x0AEF)
5378 || (ch >= 0x0B66 && ch <= 0x0B6F)
5379 || (ch >= 0x0BE7 && ch <= 0x0BEF)
5380 || (ch >= 0x0C66 && ch <= 0x0C6F)
5381 || (ch >= 0x0CE6 && ch <= 0x0CEF)
5382 || (ch >= 0x0D66 && ch <= 0x0D6F)
5383 || (ch >= 0x0E50 && ch <= 0x0E59)
5384 || (ch >= 0x0ED0 && ch <= 0x0ED9)
5385 || (ch >= 0x0F20 && ch <= 0x0F33)
5386
5387 || (ch == 0x00B5)
5388 || (ch == 0x00B7)
5389 || (ch >= 0x02B0 && ch <= 0x02B8)
5390 || (ch == 0x02BB)
5391 || (ch >= 0x02BD && ch <= 0x02C1)
5392 || (ch >= 0x02D0 && ch <= 0x02D1)
5393 || (ch >= 0x02E0 && ch <= 0x02E4)
5394 || (ch == 0x037A)
5395 || (ch == 0x0559)
5396 || (ch == 0x093D)
5397 || (ch == 0x0B3D)
5398 || (ch == 0x1FBE)
5399 || (ch >= 0x203F && ch <= 0x2040)
5400 || (ch == 0x2102)
5401 || (ch == 0x2107)
5402 || (ch >= 0x210A && ch <= 0x2113)
5403 || (ch == 0x2115)
5404 || (ch >= 0x2118 && ch <= 0x211D)
5405 || (ch == 0x2124)
5406 || (ch == 0x2126)
5407 || (ch == 0x2128)
5408 || (ch >= 0x212A && ch <= 0x2131)
5409 || (ch >= 0x2133 && ch <= 0x2138)
5410 || (ch >= 0x2160 && ch <= 0x2182)
5411 || (ch >= 0x3005 && ch <= 0x3007)
5412 || (ch >= 0x3021 && ch <= 0x3029)
5413 )
5414 return UC_IDENTIFIER_START;
5415 return UC_IDENTIFIER_INVALID;
5416 }
5417
5418
5419
/* Return true if CH is one of the five characters space, horizontal tab,
   form feed, newline or carriage return, false otherwise.  Unlike
   is_c_whitespace, the vertical tab is not accepted.  */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
5426
5427
5428
5429
5430 static int
5431 java_ident_category (unsigned int ch)
5432 {
5433
5434 if (is_category_L (ch)
5435 || is_category_Nl (ch)
5436 || is_category_Sc (ch)
5437 || is_category_Pc (ch)
5438 )
5439 return UC_IDENTIFIER_START;
5440 if (is_category_Nd (ch)
5441 || is_category_Mc (ch)
5442 || is_category_Mn (ch)
5443 )
5444 return UC_IDENTIFIER_VALID;
5445 if ((ch >= 0x0000 && ch <= 0x0008)
5446 || (ch >= 0x000E && ch <= 0x001B)
5447 || (ch >= 0x007F && ch <= 0x009F)
5448 || is_category_Cf (ch)
5449 )
5450 return UC_IDENTIFIER_IGNORABLE;
5451 return UC_IDENTIFIER_INVALID;
5452 }
5453
5454
/* Parameters for the inclusion of "3level.h": the table is named
   identsyntax_table, its elements are uint8_t and its default value is
   UC_IDENTIFIER_INVALID.  xmalloc and xrealloc are mapped onto plain malloc
   and realloc.
   NOTE(review): struct identsyntax_table and the identsyntax_table_init,
   _add and _finalize functions used by output_ident_category presumably
   come from this inclusion -- confirm against 3level.h.  */
#define TABLE identsyntax_table
#define ELEMENT uint8_t
#define DEFAULT UC_IDENTIFIER_INVALID
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
5461
5462
5463 static void
5464 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
5465 {
5466 FILE *stream;
5467 unsigned int ch, i;
5468 struct identsyntax_table t;
5469 unsigned int level1_offset, level2_offset, level3_offset;
5470
5471 stream = fopen (filename, "w");
5472 if (stream == NULL)
5473 {
5474 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5475 exit (1);
5476 }
5477
5478 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5479 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
5480 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5481 version);
5482 fprintf (stream, "\n");
5483
5484 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
5485 fprintf (stream, "\n");
5486 output_library_license (stream, false);
5487 fprintf (stream, "\n");
5488
5489 t.p = 7;
5490 t.q = 5;
5491 identsyntax_table_init (&t);
5492
5493 for (ch = 0; ch < 0x110000; ch++)
5494 {
5495 int syntaxcode = predicate (ch);
5496
5497 assert (syntaxcode <= 0x03);
5498
5499 if (syntaxcode != UC_IDENTIFIER_INVALID)
5500 identsyntax_table_add (&t, ch, syntaxcode);
5501 }
5502
5503 identsyntax_table_finalize (&t);
5504
5505
5506 level1_offset =
5507 5 * sizeof (uint32_t);
5508 level2_offset =
5509 5 * sizeof (uint32_t)
5510 + t.level1_size * sizeof (uint32_t);
5511 level3_offset =
5512 5 * sizeof (uint32_t)
5513 + t.level1_size * sizeof (uint32_t)
5514 + (t.level2_size << t.q) * sizeof (uint32_t);
5515
5516 for (i = 0; i < 5; i++)
5517 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
5518 ((uint32_t *) t.result)[i]);
5519 fprintf (stream, "static const\n");
5520 fprintf (stream, "struct\n");
5521 fprintf (stream, " {\n");
5522 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5523 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5524 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
5525 (1 << t.p) * 2 / 16);
5526 fprintf (stream, " }\n");
5527 fprintf (stream, "%s =\n", name);
5528 fprintf (stream, "{\n");
5529 fprintf (stream, " {");
5530 if (t.level1_size > 8)
5531 fprintf (stream, "\n ");
5532 for (i = 0; i < t.level1_size; i++)
5533 {
5534 uint32_t offset;
5535 if (i > 0 && (i % 8) == 0)
5536 fprintf (stream, "\n ");
5537 offset = ((uint32_t *) (t.result + level1_offset))[i];
5538 if (offset == 0)
5539 fprintf (stream, " %5d", -1);
5540 else
5541 fprintf (stream, " %5zu",
5542 (offset - level2_offset) / sizeof (uint32_t));
5543 if (i+1 < t.level1_size)
5544 fprintf (stream, ",");
5545 }
5546 if (t.level1_size > 8)
5547 fprintf (stream, "\n ");
5548 fprintf (stream, " },\n");
5549 fprintf (stream, " {");
5550 if (t.level2_size << t.q > 8)
5551 fprintf (stream, "\n ");
5552 for (i = 0; i < t.level2_size << t.q; i++)
5553 {
5554 uint32_t offset;
5555 if (i > 0 && (i % 8) == 0)
5556 fprintf (stream, "\n ");
5557 offset = ((uint32_t *) (t.result + level2_offset))[i];
5558 if (offset == 0)
5559 fprintf (stream, " %5d", -1);
5560 else
5561 fprintf (stream, " %5zu",
5562 (offset - level3_offset) / sizeof (uint8_t));
5563 if (i+1 < t.level2_size << t.q)
5564 fprintf (stream, ",");
5565 }
5566 if (t.level2_size << t.q > 8)
5567 fprintf (stream, "\n ");
5568 fprintf (stream, " },\n");
5569
5570 fprintf (stream, " {");
5571 if ((t.level3_size << t.p) * 2 / 16 > 8)
5572 fprintf (stream, "\n ");
5573 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
5574 {
5575 if (i > 0 && (i % 8) == 0)
5576 fprintf (stream, "\n ");
5577 fprintf (stream, " 0x%04x",
5578 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
5579 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
5580 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
5581 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
5582 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
5583 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
5584 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
5585 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
5586 if (i+1 < (t.level3_size << t.p) * 2 / 16)
5587 fprintf (stream, ",");
5588 }
5589 if ((t.level3_size << t.p) * 2 / 16 > 8)
5590 fprintf (stream, "\n ");
5591 fprintf (stream, " }\n");
5592 fprintf (stream, "};\n");
5593
5594 if (ferror (stream) || fclose (stream))
5595 {
5596 fprintf (stderr, "error writing to '%s'\n", filename);
5597 exit (1);
5598 }
5599 }
5600
5601 static void
5602 output_ident_properties (const char *version)
5603 {
5604 #define PROPERTY(P) \
5605 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
5606 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5607 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
5608 PROPERTY(c_whitespace)
5609 PROPERTY(java_whitespace)
5610 #undef PROPERTY
5611
5612 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
5613 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
5614 }
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624 static unsigned int
5625 to_upper (unsigned int ch)
5626 {
5627 if (unicode_attributes[ch].name != NULL
5628 && unicode_attributes[ch].upper != NONE)
5629 return unicode_attributes[ch].upper;
5630 else
5631 return ch;
5632 }
5633
5634 static unsigned int
5635 to_lower (unsigned int ch)
5636 {
5637 if (unicode_attributes[ch].name != NULL
5638 && unicode_attributes[ch].lower != NONE)
5639 return unicode_attributes[ch].lower;
5640 else
5641 return ch;
5642 }
5643
5644 static unsigned int
5645 to_title (unsigned int ch)
5646 {
5647 if (unicode_attributes[ch].name != NULL
5648 && unicode_attributes[ch].title != NONE)
5649 return unicode_attributes[ch].title;
5650 else
5651 return ch;
5652 }
5653
5654
5655
/* A code point counts as uppercase exactly when to_lower maps it to a
   different code point.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
5661
/* A code point counts as lowercase when to_upper maps it to a different
   code point.  U+00DF is additionally accepted regardless of its
   mapping.  */
static bool
is_lower (unsigned int ch)
{
  if (to_upper (ch) != ch)
    return true;
  return ch == 0x00DF;
}
5669
5670 static bool
5671 is_alpha (unsigned int ch)
5672 {
5673 return (unicode_attributes[ch].name != NULL
5674 && ((unicode_attributes[ch].category[0] == 'L'
5675
5676
5677 && (ch != 0x0E2F) && (ch != 0x0E46))
5678
5679
5680 || (ch == 0x0E31)
5681 || (ch >= 0x0E34 && ch <= 0x0E3A)
5682 || (ch >= 0x0E47 && ch <= 0x0E4E)
5683
5684 || (ch == 0x0345)
5685
5686 || (unicode_attributes[ch].category[0] == 'N'
5687 && unicode_attributes[ch].category[1] == 'l')
5688
5689 || (unicode_attributes[ch].category[0] == 'S'
5690 && unicode_attributes[ch].category[1] == 'o'
5691 && strstr (unicode_attributes[ch].name, " LETTER ")
5692 != NULL)
5693
5694
5695
5696 || (unicode_attributes[ch].category[0] == 'N'
5697 && unicode_attributes[ch].category[1] == 'd'
5698 && !(ch >= 0x0030 && ch <= 0x0039))));
5699 }
5700
/* Tells whether CH belongs to the "digit" class.
   Only the ten code points U+0030..U+0039 qualify.  A wider definition
   based on the general category "Nd" is deliberately not used here; is_alpha
   accepts the remaining "Nd" code points instead.  */
static bool
is_digit (unsigned int ch)
{
  return 0x0030 <= ch && ch <= 0x0039;
}
5722
/* The "alnum" class is the union of the "alpha" and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
5728
5729 static bool
5730 is_blank (unsigned int ch)
5731 {
5732 return (ch == 0x0009
5733
5734 || (unicode_attributes[ch].name != NULL
5735 && unicode_attributes[ch].category[0] == 'Z'
5736 && unicode_attributes[ch].category[1] == 's'
5737 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
5738 }
5739
5740 static bool
5741 is_space (unsigned int ch)
5742 {
5743
5744
5745 return (ch == 0x0020
5746 || ch == 0x000C
5747 || ch == 0x000A
5748 || ch == 0x000D
5749 || ch == 0x0009
5750 || ch == 0x000B
5751
5752 || (unicode_attributes[ch].name != NULL
5753 && unicode_attributes[ch].category[0] == 'Z'
5754 && (unicode_attributes[ch].category[1] == 'l'
5755 || unicode_attributes[ch].category[1] == 'p'
5756 || (unicode_attributes[ch].category[1] == 's'
5757 && !strstr (unicode_attributes[ch].decomposition,
5758 "<noBreak>")))));
5759 }
5760
5761 static bool
5762 is_cntrl (unsigned int ch)
5763 {
5764 return (unicode_attributes[ch].name != NULL
5765 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
5766
5767 || (unicode_attributes[ch].category[0] == 'Z'
5768 && (unicode_attributes[ch].category[1] == 'l'
5769 || unicode_attributes[ch].category[1] == 'p'))));
5770 }
5771
/* Tells whether CH belongs to the "xdigit" class.
   Exactly the 22 code points U+0030..U+0039, U+0041..U+0046 and
   U+0061..U+0066 qualify.  */
static bool
is_xdigit (unsigned int ch)
{
  if (0x0030 <= ch && ch <= 0x0039)
    return true;
  if (0x0041 <= ch && ch <= 0x0046)
    return true;
  return 0x0061 <= ch && ch <= 0x0066;
}
5793
5794 static bool
5795 is_graph (unsigned int ch)
5796 {
5797 return (unicode_attributes[ch].name != NULL
5798 && strcmp (unicode_attributes[ch].name, "<control>")
5799 && !is_space (ch));
5800 }
5801
5802 static bool
5803 is_print (unsigned int ch)
5804 {
5805 return (unicode_attributes[ch].name != NULL
5806 && strcmp (unicode_attributes[ch].name, "<control>")
5807
5808 && !(unicode_attributes[ch].name != NULL
5809 && unicode_attributes[ch].category[0] == 'Z'
5810 && (unicode_attributes[ch].category[1] == 'l'
5811 || unicode_attributes[ch].category[1] == 'p')));
5812 }
5813
/* Tells whether CH belongs to the "punct" class.
   The class is defined as every "graph" code point that is neither "alpha"
   nor "digit"; a definition based on general category 'P' is deliberately
   not used.  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;
  if (is_alpha (ch))
    return false;
  return !is_digit (ch);
}
5826
5827
5828 static void
5829 output_old_ctype (const char *version)
5830 {
5831 #define PROPERTY(P) \
5832 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
5833 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5834 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
5835 PROPERTY(alnum)
5836 PROPERTY(alpha)
5837 PROPERTY(cntrl)
5838 PROPERTY(digit)
5839 PROPERTY(graph)
5840 PROPERTY(lower)
5841 PROPERTY(print)
5842 PROPERTY(punct)
5843 PROPERTY(space)
5844 PROPERTY(upper)
5845 PROPERTY(xdigit)
5846 PROPERTY(blank)
5847 #undef PROPERTY
5848 }
5849
5850 #if 0
5851
5852 static bool
5853 is_combining (unsigned int ch)
5854 {
5855
5856
5857
5858
5859 return (unicode_attributes[ch].name != NULL
5860 && unicode_attributes[ch].category[0] == 'M'
5861 && (unicode_attributes[ch].category[1] == 'n'
5862 || unicode_attributes[ch].category[1] == 'c'
5863 || unicode_attributes[ch].category[1] == 'e'));
5864 }
5865
5866 static bool
5867 is_combining_level3 (unsigned int ch)
5868 {
5869 return is_combining (ch)
5870 && !(unicode_attributes[ch].combining[0] != '\0'
5871 && unicode_attributes[ch].combining[0] != '0'
5872 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
5873 }
5874
5875
/* Formats the code point I as "<UXXXX>" (four hex digits) when I is below
   0x10000 and as "<UXXXXXXXX>" (eight hex digits) otherwise.
   The result lives in a static buffer that is overwritten by the next
   call, so callers must copy it before calling again.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];
  const char *format;

  if (i < 0x10000)
    format = "<U%04X>";
  else
    format = "<U%08X>";

  sprintf (buf, format, i);
  return buf;
}
5884
5885
/* Formats the range LOW..HIGH as the symbol of LOW, the separator "..",
   and the symbol of HIGH.
   The result lives in a static buffer that is overwritten by the next
   call.  The two ucs_symbol calls are made one after the other because
   ucs_symbol itself returns a shared static buffer.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  char *p = buf;

  p += sprintf (p, "%s..", ucs_symbol (low));
  sprintf (p, "%s", ucs_symbol (high));
  return buf;
}
5896
5897
5898
/* Writes one character class line to STREAM.
   The line starts with CLASSNAME and a space, followed by every maximal
   run of code points for which FUNC returns true, written as a single
   symbol or as a symbol range.  Entries are separated by ';'.  A
   continuation ("/", newline, space) is inserted whenever the next entry
   would push the column past 75.  */
static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  char member[0x110000];
  const int max_column = 75;
  bool need_semicolon = false;
  /* Start beyond max_column so that the first entry goes onto a
     continuation line.  */
  int column = 1000;
  unsigned int i;

  /* Evaluate the predicate once per code point.  */
  for (i = 0; i < 0x110000; i++)
    member[i] = (int) func (i);

  fprintf (stream, "%s ", classname);

  i = 0;
  while (i < 0x110000)
    {
      unsigned int low, high;
      char buf[25];
      size_t len;

      if (!member[i])
        {
          i++;
          continue;
        }

      /* Extend the run as far as the predicate stays true.  */
      low = i;
      for (i++; i < 0x110000 && member[i]; i++)
        ;
      high = i - 1;

      strcpy (buf, (low == high
                    ? ucs_symbol (low)
                    : ucs_symbol_range (low, high)));

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      len = strlen (buf);
      if (column + len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }

  fprintf (stream, "\n");
}
5954
5955
5956
/* Writes one character mapping line to STREAM.
   The line starts with MAPNAME and a space, followed by one
   "(<from>,<to>)" pair for every code point that FUNC maps to a different
   code point.  Pairs are separated by ';'.  A continuation ("/", newline,
   space) is inserted whenever the next pair would push the column past
   75.  */
static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  char changed[0x110000];
  const int max_column = 75;
  bool need_semicolon = false;
  /* Start beyond max_column so that the first pair goes onto a
     continuation line.  */
  int column = 1000;
  unsigned int i;

  for (i = 0; i < 0x110000; i++)
    changed[i] = (func (i) != i);

  fprintf (stream, "%s ", mapname);

  for (i = 0; i < 0x110000; i++)
    {
      char buf[25+1];
      char *p = buf;
      size_t len;

      if (!changed[i])
        continue;

      /* ucs_symbol returns a shared static buffer, so each symbol is
         copied out before the next one is produced.  */
      p += sprintf (p, "(%s,", ucs_symbol (i));
      sprintf (p, "%s)", ucs_symbol (func (i)));

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      len = strlen (buf);
      if (column + len > max_column)
        {
          fprintf (stream, "/\n ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }

  fprintf (stream, "\n");
}
6002
6003
6004
/* Placeholder for a width map section: writes nothing to STREAM.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
6009
6010
6011
6012 static void
6013 output_tables (const char *filename, const char *version)
6014 {
6015 FILE *stream;
6016 unsigned int ch;
6017
6018 stream = fopen (filename, "w");
6019 if (stream == NULL)
6020 {
6021 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6022 exit (1);
6023 }
6024
6025 fprintf (stream, "escape_char /\n");
6026 fprintf (stream, "comment_char %%\n");
6027 fprintf (stream, "\n");
6028 fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
6029 version);
6030 fprintf (stream, "\n");
6031
6032 fprintf (stream, "LC_IDENTIFICATION\n");
6033 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
6034 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
6035 fprintf (stream, "address \"\"\n");
6036 fprintf (stream, "contact \"\"\n");
6037 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
6038 fprintf (stream, "tel \"\"\n");
6039 fprintf (stream, "fax \"\"\n");
6040 fprintf (stream, "language \"\"\n");
6041 fprintf (stream, "territory \"Earth\"\n");
6042 fprintf (stream, "revision \"%s\"\n", version);
6043 {
6044 time_t now;
6045 char date[11];
6046 now = time (NULL);
6047 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
6048 fprintf (stream, "date \"%s\"\n", date);
6049 }
6050 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
6051 fprintf (stream, "END LC_IDENTIFICATION\n");
6052 fprintf (stream, "\n");
6053
6054
6055 for (ch = 0; ch < 0x110000; ch++)
6056 {
6057
6058
6059 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
6060 fprintf (stderr,
6061 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
6062 ucs_symbol (ch), ch, to_upper (ch));
6063
6064
6065
6066 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
6067 fprintf (stderr,
6068 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
6069 ucs_symbol (ch), ch, to_lower (ch));
6070
6071
6072
6073 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
6074 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
6075
6076
6077
6078 if (is_alpha (ch) && is_cntrl (ch))
6079 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
6080 if (is_alpha (ch) && is_digit (ch))
6081 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
6082 if (is_alpha (ch) && is_punct (ch))
6083 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
6084 if (is_alpha (ch) && is_space (ch))
6085 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
6086
6087
6088
6089
6090 if (is_space (ch) && is_digit (ch))
6091 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
6092 if (is_space (ch) && is_graph (ch))
6093 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
6094 if (is_space (ch) && is_xdigit (ch))
6095 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
6096
6097
6098
6099
6100 if (is_cntrl (ch) && is_digit (ch))
6101 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
6102 if (is_cntrl (ch) && is_punct (ch))
6103 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
6104 if (is_cntrl (ch) && is_graph (ch))
6105 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
6106 if (is_cntrl (ch) && is_print (ch))
6107 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
6108 if (is_cntrl (ch) && is_xdigit (ch))
6109 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
6110
6111
6112
6113
6114 if (is_punct (ch) && is_digit (ch))
6115 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
6116 if (is_punct (ch) && is_xdigit (ch))
6117 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
6118 if (is_punct (ch) && (ch == 0x0020))
6119 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131 if (is_print (ch) && !(is_graph (ch) || is_space (ch)))
6132 fprintf (stderr,
6133 "%s is print but not graph|<space>\n", ucs_symbol (ch));
6134 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
6135 fprintf (stderr,
6136 "%s is graph|<space> but not print\n", ucs_symbol (ch));
6137 }
6138
6139 fprintf (stream, "LC_CTYPE\n");
6140 output_charclass (stream, "upper", is_upper);
6141 output_charclass (stream, "lower", is_lower);
6142 output_charclass (stream, "alpha", is_alpha);
6143 output_charclass (stream, "digit", is_digit);
6144 output_charclass (stream, "outdigit", is_outdigit);
6145 output_charclass (stream, "blank", is_blank);
6146 output_charclass (stream, "space", is_space);
6147 output_charclass (stream, "cntrl", is_cntrl);
6148 output_charclass (stream, "punct", is_punct);
6149 output_charclass (stream, "xdigit", is_xdigit);
6150 output_charclass (stream, "graph", is_graph);
6151 output_charclass (stream, "print", is_print);
6152 output_charclass (stream, "class \"combining\";", is_combining);
6153 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
6154 output_charmap (stream, "toupper", to_upper);
6155 output_charmap (stream, "tolower", to_lower);
6156 output_charmap (stream, "map \"totitle\";", to_title);
6157 output_widthmap (stream);
6158 fprintf (stream, "END LC_CTYPE\n");
6159
6160 if (ferror (stream) || fclose (stream))
6161 {
6162 fprintf (stderr, "error writing to '%s'\n", filename);
6163 exit (1);
6164 }
6165 }
6166
6167 #endif
6168
6169
6170
6171
6172
6173 const char * unicode_width[0x110000];
6174
6175
6176
6177 static void
6178 fill_width (const char *width_filename)
6179 {
6180 unsigned int i, j;
6181 FILE *stream;
6182 char field0[FIELDLEN];
6183 char field1[FIELDLEN];
6184 char field2[FIELDLEN];
6185 int lineno = 0;
6186
6187 for (i = 0; i < 0x110000; i++)
6188 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
6189
6190 stream = fopen (width_filename, "r");
6191 if (stream == NULL)
6192 {
6193 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
6194 exit (1);
6195 }
6196
6197 for (;;)
6198 {
6199 int n;
6200 int c;
6201
6202 lineno++;
6203 c = getc (stream);
6204 if (c == EOF)
6205 break;
6206 if (c == '#')
6207 {
6208 do c = getc (stream); while (c != EOF && c != '\n');
6209 continue;
6210 }
6211 ungetc (c, stream);
6212 n = getfield (stream, field0, ';');
6213 n += getfield (stream, field1, ' ');
6214 n += getfield (stream, field2, '\n');
6215 if (n == 0)
6216 break;
6217 if (n != 3)
6218 {
6219 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
6220 exit (1);
6221 }
6222 i = strtoul (field0, NULL, 16);
6223 if (strstr (field0, "..") != NULL)
6224 {
6225
6226 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6227 for (; i <= j; i++)
6228 unicode_width[i] = strdup (field1);
6229 }
6230 else
6231 {
6232
6233 unicode_width[i] = strdup (field1);
6234 }
6235 }
6236
6237 if (ferror (stream) || fclose (stream))
6238 {
6239 fprintf (stderr, "error reading from '%s'\n", width_filename);
6240 exit (1);
6241 }
6242 }
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257 static bool
6258 is_nonspacing (unsigned int ch)
6259 {
6260 return (unicode_attributes[ch].name != NULL
6261 && (get_bidi_category (ch) == UC_BIDI_NSM
6262 || is_category_Cc (ch) || is_category_Cf (ch)
6263 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
6264 }
6265
/* Writes the non-spacing property as a two-level bit table to FILENAME.
   The code space is divided into blocks of 0x200 code points.  Each block
   that contains at least one is_nonspacing code point gets 64 bytes in
   nonspacing_table_data (one bit per code point, least significant bit
   first).  nonspacing_table_ind maps a block number to the index of its
   data, or to -1 for a block without data.  The block starting at 0xE0000
   is always emitted as -1, whatever it contains.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_nonspacing_property (const char *filename)
{
  FILE *stream;
  int ind[0x110000 / 0x200];
  unsigned int i;
  unsigned int i_max;
  unsigned int ind_count;
  int next_ind;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* First pass: assign a data index to every nontrivial block.  */
  next_ind = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = false;
      unsigned int ch;

      if (i != 0xe0000 / 0x200)
        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
          if (is_nonspacing (ch))
            {
              nontrivial = true;
              break;
            }
      if (nontrivial)
        ind[i] = next_ind++;
      else
        ind[i] = -1;
    }

  /* Second pass: emit the bit data of the nontrivial blocks.  */
  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
           next_ind);
  i_max = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = (ind[i] >= 0);

      if (nontrivial)
        {
          unsigned int j;

          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
          for (j = 0; j < 8; j++)
            {
              unsigned int k;

              fprintf (stream, " ");
              for (k = 0; k < 8; k++)
                {
                  unsigned int l;
                  unsigned char bits = 0;

                  for (l = 0; l < 8; l++)
                    {
                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;

                      if (is_nonspacing (ch))
                        bits |= 1 << l;
                    }
                  /* No comma after the very last byte of the table.  */
                  fprintf (stream, " 0x%02x%c", bits,
                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
                }
              fprintf (stream, " /* 0x%04x-0x%04x */\n",
                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
            }
          /* Remember the last block that has data.  */
          i_max = i;
        }
    }
  fprintf (stream, "};\n");

  /* The index table must include entry i_max itself, so i_max + 1 entries
     are needed, rounded up to a multiple of 8 for the row layout below.
     Rounding i_max instead would drop the last nontrivial block whenever
     i_max is a multiple of 8.  The result never exceeds the size of ind[],
     which is itself a multiple of 8.  */
  ind_count = ((i_max + 1 + 8 - 1) / 8) * 8;
  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
           ind_count);
  {
    unsigned int j;

    for (j = 0; j < ind_count / 8; j++)
      {
        unsigned int k;

        fprintf (stream, " ");
        for (k = 0; k < 8; k++)
          {
            i = j * 8 + k;
            /* No comma after the very last entry of the table.  */
            fprintf (stream, " %2d%c", ind[i],
                     j == ind_count / 8 - 1 && k == 8 - 1 ? ' ' : ',');
          }
        fprintf (stream, " /* 0x%04x-0x%04x */\n",
                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
      }
  }
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6370
6371
6372 static char
6373 symbolic_width (unsigned int ch)
6374 {
6375
6376 if (is_property_unassigned_code_value (ch))
6377 {
6378
6379 if (ch >= 0xE000 && ch <= 0xF8FF)
6380 return 'A';
6381 if ((ch >= 0x4E00 && ch <= 0x9FFF)
6382 || (ch >= 0x3400 && ch <= 0x4DBF)
6383 || (ch >= 0xF900 && ch <= 0xFAFF)
6384 || (ch >= 0x20000 && ch <= 0x2FFFF)
6385 || (ch >= 0x30000 && ch <= 0x3FFFF) )
6386 return '2';
6387 return 0;
6388 }
6389 else
6390 {
6391
6392 if (is_category_Cc (ch) && ch < 0x00A0)
6393 return 0;
6394 if (is_nonspacing (ch))
6395 return '0';
6396
6397 if (unicode_width[ch] != NULL
6398 && (strcmp (unicode_width[ch], "W") == 0
6399 || strcmp (unicode_width[ch], "F") == 0))
6400 return '2';
6401
6402 if (unicode_width[ch] != NULL
6403 && strcmp (unicode_width[ch], "H") == 0)
6404 return '1';
6405 }
6406
6407
6408 if (ch >= 0x00A1 && ch < 0x10000)
6409 return 'A';
6410 return '1';
6411 }
6412
/* Writes one line for the interval START..END with width code VALUE to
   STREAM.  A single code point is written without the ".." range
   notation.  */
static void
write_width_interval (FILE *stream, unsigned int start, unsigned int end,
                      char value)
{
  if (end == start)
    fprintf (stream, "%04X\t\t%c\n", start, value);
  else
    fprintf (stream, "%04X..%04X\t%c\n", start, end, value);
}

/* Writes the width codes of all code points to FILENAME, one line per
   interval of equal symbolic_width value.  Code points for which
   symbolic_width returns 0 are skipped and do not end the current
   interval.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_width_property_test (const char *filename)
{
  FILE *stream;
  unsigned int interval_start = 0;
  unsigned int interval_end = 0;
  char interval_value = 0;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    {
      char value = symbolic_width (ch);

      if (value == 0)
        continue;

      if (value == interval_value)
        {
          /* Same value as before: extend the current interval.  */
          interval_end = ch;
          continue;
        }

      /* The value changed: flush the previous interval, if there is one,
         and start a new one at CH.  */
      if (interval_value != 0)
        write_width_interval (stream, interval_start, interval_end,
                              interval_value);
      interval_start = interval_end = ch;
      interval_value = value;
    }

  /* Flush the last interval.  */
  if (interval_value != 0)
    write_width_interval (stream, interval_start, interval_end,
                          interval_value);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6468
6469
6470
6471
6472
6473
/* Line breaking property codes, listed in increasing numeric order.
   get_lbp uses each value as a bit position in an int64_t mask, so all
   values must stay below 63.
   NOTE(review): the names appear to follow the line breaking class
   abbreviations of Unicode UAX #14 — confirm.  */
enum
{
  LBP_WJ  =  0,
  LBP_GL  =  1,
  LBP_B2  =  2,
  LBP_BA  =  3,
  LBP_BB  =  4,
  LBP_HY  =  5,
  LBP_CL  =  6,
  LBP_CP  =  7,
  LBP_EX  =  8,
  LBP_IN  =  9,
  LBP_NS  = 10,
  LBP_OP  = 11,
  LBP_QU  = 12,
  LBP_IS  = 13,
  LBP_NU  = 14,
  LBP_PO  = 15,
  LBP_PR  = 16,
  LBP_SY  = 17,
  LBP_AL  = 18,
  LBP_H2  = 19,
  LBP_H3  = 20,
  LBP_ID  = 21,
  LBP_JL  = 22,
  LBP_JV  = 23,
  LBP_JT  = 24,
  LBP_HL  = 25,
  LBP_RI  = 26,
  LBP_ZWJ = 27,
  LBP_EB  = 28,
  LBP_EM  = 29,
  LBP_BK  = 30,
  LBP_CM  = 31,
  LBP_ZW  = 32,
  LBP_SP  = 33,
  LBP_CB  = 34,
  LBP_AI  = 35,
  LBP_SA  = 36,
  LBP_XX  = 37
};
6521
6522
6523 static int64_t
6524 get_lbp (unsigned int ch)
6525 {
6526 int64_t attr = 0;
6527
6528
6529 if (unicode_attributes[ch].name == NULL && (ch >= 0x20BC && ch <= 0x20CF))
6530 return (int64_t) 1 << LBP_PR;
6531
6532 if (unicode_attributes[ch].name != NULL)
6533 {
6534
6535 if (ch == 0x000A || ch == 0x000D || ch == 0x0085
6536 || ch == 0x000C
6537 || ch == 0x000B
6538 || ch == 0x2028
6539 || ch == 0x2029 )
6540 attr |= (int64_t) 1 << LBP_BK;
6541
6542 if (ch == 0x2060
6543 || ch == 0xFEFF )
6544 attr |= (int64_t) 1 << LBP_WJ;
6545
6546
6547 if (ch == 0x200B )
6548 attr |= (int64_t) 1 << LBP_ZW;
6549
6550
6551 if (ch == 0x200D )
6552 attr |= (int64_t) 1 << LBP_ZWJ;
6553
6554
6555 if (ch == 0x261D
6556 || ch == 0x26F9
6557 || (ch >= 0x270A && ch <= 0x270D)
6558 || ch == 0x1F385
6559 || (ch >= 0x1F3C3 && ch <= 0x1F3C4)
6560 || (ch >= 0x1F3CA && ch <= 0x1F3CB)
6561 || (ch >= 0x1F442 && ch <= 0x1F443)
6562 || (ch >= 0x1F446 && ch <= 0x1F450)
6563 || (ch >= 0x1F466 && ch <= 0x1F469)
6564 || ch == 0x1F46E
6565 || (ch >= 0x1F470 && ch <= 0x1F478)
6566 || ch == 0x1F47C
6567 || (ch >= 0x1F481 && ch <= 0x1F483)
6568 || (ch >= 0x1F485 && ch <= 0x1F487)
6569 || ch == 0x1F4AA
6570 || ch == 0x1F575
6571 || ch == 0x1F57A
6572 || ch == 0x1F590
6573 || (ch >= 0x1F595 && ch <= 0x1F596)
6574 || (ch >= 0x1F645 && ch <= 0x1F647)
6575 || (ch >= 0x1F64B && ch <= 0x1F64F)
6576 || ch == 0x1F6A3
6577 || (ch >= 0x1F6B4 && ch <= 0x1F6B6)
6578 || ch == 0x1F6C0
6579 || (ch >= 0x1F918 && ch <= 0x1F91E)
6580 || ch == 0x1F926
6581 || ch == 0x1F930
6582 || (ch >= 0x1F933 && ch <= 0x1F939)
6583 || (ch >= 0x1F93C && ch <= 0x1F93E) )
6584 attr |= (int64_t) 1 << LBP_EB;
6585
6586 if ((ch >= 0x1F3FB && ch <= 0x1F3FF) )
6587 attr |= (int64_t) 1 << LBP_EM;
6588
6589
6590 if (ch == 0x00A0
6591 || ch == 0x202F
6592 || ch == 0x180E
6593 || ch == 0x034F
6594 || ch == 0x2007
6595 || ch == 0x2011
6596 || ch == 0x0F08
6597 || ch == 0x0F0C
6598 || ch == 0x0F12
6599 || (ch >= 0x035C && ch <= 0x0362)
6600
6601 || ch == 0x0FD9
6602 || ch == 0x0FDA )
6603 attr |= (int64_t) 1 << LBP_GL;
6604
6605
6606 if (ch == 0x0020 )
6607 attr |= (int64_t) 1 << LBP_SP;
6608
6609
6610 if (ch == 0x2014
6611 || ch == 0x2E3A
6612 || ch == 0x2E3B )
6613 attr |= (int64_t) 1 << LBP_B2;
6614
6615
6616 if (
6617 ch == 0x1680
6618 || ch == 0x2000
6619 || ch == 0x2001
6620 || ch == 0x2002
6621 || ch == 0x2003
6622 || ch == 0x2004
6623 || ch == 0x2005
6624 || ch == 0x2006
6625 || ch == 0x2008
6626 || ch == 0x2009
6627 || ch == 0x200A
6628 || ch == 0x205F
6629 || ch == 0x3000
6630
6631 || ch == 0x0009
6632
6633 || ch == 0x00AD
6634
6635 || ch == 0x058A
6636 || ch == 0x1400
6637 || ch == 0x2010
6638 || ch == 0x2012
6639 || ch == 0x2013
6640
6641 || ch == 0x05BE
6642 || ch == 0x0F0B
6643 || ch == 0x1361
6644 || ch == 0x17D8
6645 || ch == 0x17DA
6646 || ch == 0x2027
6647 || ch == 0x007C
6648
6649 || ch == 0x16EB
6650 || ch == 0x16EC
6651 || ch == 0x16ED
6652 || ch == 0x2056
6653 || ch == 0x2058
6654 || ch == 0x2059
6655 || ch == 0x205A
6656 || ch == 0x205B
6657 || ch == 0x205D
6658 || ch == 0x205E
6659 || ch == 0x2E19
6660 || ch == 0x2E2A
6661 || ch == 0x2E2B
6662 || ch == 0x2E2C
6663 || ch == 0x2E2D
6664 || ch == 0x2E30
6665 || ch == 0x2E31
6666 || ch == 0x2E33
6667 || ch == 0x2E34
6668 || ch == 0x10100
6669 || ch == 0x10101
6670 || ch == 0x10102
6671 || ch == 0x1039F
6672 || ch == 0x103D0
6673 || ch == 0x1091F
6674 || ch == 0x12470
6675
6676 || ch == 0x0964
6677 || ch == 0x0965
6678 || ch == 0x0E5A
6679 || ch == 0x0E5B
6680 || ch == 0x104A
6681 || ch == 0x104B
6682 || ch == 0x1735
6683 || ch == 0x1736
6684 || ch == 0x17D4
6685 || ch == 0x17D5
6686 || ch == 0x1B5E
6687 || ch == 0x1B5F
6688 || ch == 0xA8CE
6689 || ch == 0xA8CF
6690 || ch == 0xAA5D
6691 || ch == 0xAA5E
6692 || ch == 0xAA5F
6693 || ch == 0x10A56
6694 || ch == 0x10A57
6695
6696 || ch == 0x0F34
6697 || ch == 0x0F7F
6698 || ch == 0x0F85
6699 || ch == 0x0FBE
6700 || ch == 0x0FBF
6701 || ch == 0x0FD2
6702
6703 || ch == 0x1804
6704 || ch == 0x1805
6705 || ch == 0x1B5A
6706 || ch == 0x1B5B
6707 || ch == 0x1B5D
6708 || ch == 0x1B60
6709 || ch == 0x1C3B
6710 || ch == 0x1C3C
6711 || ch == 0x1C3D
6712 || ch == 0x1C3E
6713 || ch == 0x1C3F
6714 || ch == 0x1C7E
6715 || ch == 0x1C7F
6716 || ch == 0x2CFA
6717 || ch == 0x2CFB
6718 || ch == 0x2CFC
6719 || ch == 0x2CFF
6720 || (ch >= 0x2E0E && ch <= 0x2E15)
6721 || ch == 0x2E17
6722 || ch == 0x2E43
6723 || ch == 0x2E44
6724 || ch == 0x2E3C
6725 || ch == 0x2E3D
6726 || ch == 0x2E3E
6727 || ch == 0x2E40
6728 || ch == 0x2E41
6729 || ch == 0xA60D
6730 || ch == 0xA60F
6731 || ch == 0xA92E
6732 || ch == 0xA92F
6733 || ch == 0x10A50
6734 || ch == 0x10A51
6735 || ch == 0x10A52
6736 || ch == 0x10A53
6737 || ch == 0x10A54
6738 || ch == 0x10A55
6739
6740 || ch == 0x2D70
6741 || ch == 0xA4FE
6742 || ch == 0xA4FF
6743 || ch == 0xA6F3
6744 || ch == 0xA6F4
6745 || ch == 0xA6F5
6746 || ch == 0xA6F6
6747 || ch == 0xA6F7
6748 || ch == 0xA9C7
6749 || ch == 0xA9C8
6750 || ch == 0xA9C9
6751 || ch == 0xAAF0
6752 || ch == 0xAAF1
6753 || ch == 0xABEB
6754 || ch == 0x10857
6755 || (ch >= 0x10AF0 && ch <= 0x10AF5)
6756 || ch == 0x10B39
6757 || ch == 0x10B3A
6758 || ch == 0x10B3B
6759 || ch == 0x10B3C
6760 || ch == 0x10B3D
6761 || ch == 0x10B3E
6762 || ch == 0x10B3F
6763 || ch == 0x11047
6764 || ch == 0x11048
6765 || ch == 0x110BE
6766 || ch == 0x110BF
6767 || ch == 0x110C0
6768 || ch == 0x110C1
6769 || ch == 0x11140
6770 || ch == 0x11141
6771 || ch == 0x11142
6772 || ch == 0x11143
6773 || ch == 0x111C5
6774 || ch == 0x111C6
6775 || ch == 0x111C8
6776 || (ch >= 0x111DD && ch <= 0x111DF)
6777 || ch == 0x11238
6778 || ch == 0x11239
6779 || ch == 0x1123B
6780 || ch == 0x1123C
6781 || ch == 0x112A9
6782 || (ch >= 0x1144B && ch <= 0x1144E)
6783 || ch == 0x1145B
6784 || ch == 0x115C2
6785 || ch == 0x115C3
6786 || (ch >= 0x115C9 && ch <= 0x115D7)
6787 || ch == 0x11641
6788 || ch == 0x11642
6789 || (ch >= 0x1173C && ch <= 0x1173E)
6790 || (ch >= 0x11C41 && ch <= 0x11C45)
6791 || ch == 0x12471
6792 || ch == 0x12472
6793 || ch == 0x12473
6794 || ch == 0x12474
6795 || ch == 0x16A6E
6796 || ch == 0x16A6F
6797 || ch == 0x16AF5
6798 || ch == 0x16B37
6799 || ch == 0x16B38
6800 || ch == 0x16B39
6801 || ch == 0x16B44
6802 || ch == 0x1BC9F
6803 || (ch >= 0x1DA87 && ch <= 0x1DA8A) )
6804 attr |= (int64_t) 1 << LBP_BA;
6805
6806
6807 if (ch == 0x00B4
6808 || ch == 0x1FFD
6809 || ch == 0x02DF
6810 || ch == 0x02C8
6811 || ch == 0x02CC
6812 || ch == 0x0F01
6813 || ch == 0x0F02
6814 || ch == 0x0F03
6815 || ch == 0x0F04
6816 || ch == 0x0F06
6817 || ch == 0x0F07
6818 || ch == 0x0F09
6819 || ch == 0x0F0A
6820 || ch == 0x0FD0
6821 || ch == 0x0FD1
6822 || ch == 0x0FD3
6823 || ch == 0xA874
6824 || ch == 0xA875
6825 || ch == 0xA8FC
6826 || ch == 0x1806
6827 || ch == 0x11175
6828 || ch == 0x111DB
6829 || ch == 0x115C1
6830 || (ch >= 0x11660 && ch <= 0x1166C)
6831 || ch == 0x11C70 )
6832 attr |= (int64_t) 1 << LBP_BB;
6833
6834
6835 if (ch == 0x002D )
6836 attr |= (int64_t) 1 << LBP_HY;
6837
6838
6839 if (ch == 0xFFFC )
6840 attr |= (int64_t) 1 << LBP_CB;
6841
6842
6843 if (ch == 0x0029
6844 || ch == 0x005D )
6845 attr |= (int64_t) 1 << LBP_CP;
6846
6847
6848 if ((unicode_attributes[ch].category[0] == 'P'
6849 && unicode_attributes[ch].category[1] == 'e'
6850 && !(attr & ((int64_t) 1 << LBP_CP)))
6851 || ch == 0x3001
6852 || ch == 0x3002
6853 || ch == 0xFE11
6854 || ch == 0xFE12
6855 || ch == 0xFE50
6856 || ch == 0xFE52
6857 || ch == 0xFF0C
6858 || ch == 0xFF0E
6859 || ch == 0xFF61
6860 || ch == 0xFF64
6861
6862 || ch == 0x1325B
6863 || ch == 0x1325C
6864 || ch == 0x1325D
6865 || ch == 0x13282
6866 || ch == 0x13287
6867 || ch == 0x13289
6868 || ch == 0x1337A
6869 || ch == 0x1337B
6870 || ch == 0x145CF )
6871 attr |= (int64_t) 1 << LBP_CL;
6872
6873
6874 if (ch == 0x0021
6875 || ch == 0x003F
6876 || ch == 0x05C6
6877 || ch == 0x061B
6878 || ch == 0x061E
6879 || ch == 0x061F
6880 || ch == 0x06D4
6881 || ch == 0x07F9
6882 || ch == 0x0F0D
6883 || ch == 0x0F0E
6884 || ch == 0x0F0F
6885 || ch == 0x0F10
6886 || ch == 0x0F11
6887 || ch == 0x0F14
6888 || ch == 0x1802
6889 || ch == 0x1803
6890 || ch == 0x1808
6891 || ch == 0x1809
6892 || ch == 0x1944
6893 || ch == 0x1945
6894 || ch == 0x2762
6895 || ch == 0x2763
6896 || ch == 0x2CF9
6897 || ch == 0x2CFE
6898 || ch == 0x2E2E
6899 || ch == 0xA60E
6900 || ch == 0xA876
6901 || ch == 0xA877
6902 || ch == 0xFE15
6903 || ch == 0xFE16
6904 || ch == 0xFE56
6905 || ch == 0xFE57
6906 || ch == 0xFF01
6907 || ch == 0xFF1F
6908 || ch == 0x115C4
6909 || ch == 0x115C5
6910 || ch == 0x11C71 )
6911 attr |= (int64_t) 1 << LBP_EX;
6912
6913
6914 if (ch == 0x2024
6915 || ch == 0x2025
6916 || ch == 0x2026
6917 || ch == 0x22EF
6918 || ch == 0xFE19
6919 || ch == 0x10AF6 )
6920 attr |= (int64_t) 1 << LBP_IN;
6921
6922
6923 if (ch == 0x17D6
6924 || ch == 0x203C
6925 || ch == 0x203D
6926 || ch == 0x2047
6927 || ch == 0x2048
6928 || ch == 0x2049
6929 || ch == 0x3005
6930 || ch == 0x301C
6931 || ch == 0x303C
6932 || ch == 0x303B
6933 || ch == 0x309B
6934 || ch == 0x309C
6935 || ch == 0x309D
6936 || ch == 0x309E
6937 || ch == 0x30A0
6938 || ch == 0x30FB
6939 || ch == 0x30FC
6940 || ch == 0x30FD
6941 || ch == 0x30FE
6942 || ch == 0xA015
6943 || ch == 0xFE54
6944 || ch == 0xFE55
6945 || ch == 0xFF1A
6946 || ch == 0xFF1B
6947 || ch == 0xFF65
6948 || ch == 0xFF70
6949 || ch == 0xFF9E
6950 || ch == 0xFF9F
6951 || ch == 0x16FE0
6952 || ch == 0x1F679
6953 || ch == 0x1F67A
6954 || ch == 0x1F67B
6955 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
6956 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
6957 attr |= (int64_t) 1 << LBP_NS;
6958
6959
6960 if ((unicode_attributes[ch].category[0] == 'P'
6961 && unicode_attributes[ch].category[1] == 's')
6962 || ch == 0x00A1
6963 || ch == 0x00BF
6964 || ch == 0x2E18
6965
6966 || ch == 0x13258
6967 || ch == 0x13259
6968 || ch == 0x1325A
6969 || ch == 0x13286
6970 || ch == 0x13288
6971 || ch == 0x13379
6972 || ch == 0x145CE
6973 || (ch >= 0x1E95E && ch <= 0x1E95F) )
6974 attr |= (int64_t) 1 << LBP_OP;
6975
6976
6977 if ((unicode_attributes[ch].category[0] == 'P'
6978 && (unicode_attributes[ch].category[1] == 'f'
6979 || unicode_attributes[ch].category[1] == 'i'))
6980 || ch == 0x0022
6981 || ch == 0x0027
6982 || ch == 0x275B
6983 || ch == 0x275C
6984 || ch == 0x275D
6985 || ch == 0x275E
6986 || ch == 0x275F
6987 || ch == 0x2760
6988 || ch == 0x2E00
6989 || ch == 0x2E01
6990 || ch == 0x2E06
6991 || ch == 0x2E07
6992 || ch == 0x2E08
6993 || ch == 0x2E0B
6994 || ch == 0x1F676
6995 || ch == 0x1F677
6996 || ch == 0x1F678 )
6997 attr |= (int64_t) 1 << LBP_QU;
6998
6999
7000 if (ch == 0x002C
7001 || ch == 0x002E
7002 || ch == 0x003A
7003 || ch == 0x003B
7004 || ch == 0x037E
7005 || ch == 0x0589
7006 || ch == 0x060C
7007 || ch == 0x060D
7008 || ch == 0x07F8
7009 || ch == 0x2044
7010 || ch == 0xFE10
7011 || ch == 0xFE13
7012 || ch == 0xFE14 )
7013 attr |= (int64_t) 1 << LBP_IS;
7014
7015
7016 if ((unicode_attributes[ch].category[0] == 'N'
7017 && unicode_attributes[ch].category[1] == 'd'
7018 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
7019 || ch == 0x066B
7020 || ch == 0x066C )
7021 attr |= (int64_t) 1 << LBP_NU;
7022
7023
7024 if (ch == 0x0025
7025 || ch == 0x00A2
7026 || ch == 0x00B0
7027 || ch == 0x060B
7028 || ch == 0x066A
7029 || ch == 0x2030
7030 || ch == 0x2031
7031 || ch == 0x2032
7032 || ch == 0x2033
7033 || ch == 0x2034
7034 || ch == 0x2035
7035 || ch == 0x2036
7036 || ch == 0x2037
7037 || ch == 0x20A7
7038 || ch == 0x20BB
7039 || ch == 0x2103
7040 || ch == 0x2109
7041 || ch == 0xFDFC
7042 || ch == 0xFE6A
7043 || ch == 0xFF05
7044 || ch == 0xFFE0
7045
7046 || ch == 0x0609
7047 || ch == 0x060A
7048 || ch == 0x09F2
7049 || ch == 0x09F3
7050 || ch == 0x09F9
7051 || ch == 0x0D79
7052 || ch == 0x20B6
7053 || ch == 0x20BE
7054 || ch == 0xA838 )
7055 attr |= (int64_t) 1 << LBP_PO;
7056
7057
7058 if ((unicode_attributes[ch].category[0] == 'S'
7059 && unicode_attributes[ch].category[1] == 'c')
7060 || ch == 0x002B
7061 || ch == 0x005C
7062 || ch == 0x00B1
7063 || ch == 0x2116
7064 || ch == 0x2212
7065 || ch == 0x2213 )
7066 if (!(attr & ((int64_t) 1 << LBP_PO)))
7067 attr |= (int64_t) 1 << LBP_PR;
7068
7069
7070 if (ch == 0x002F )
7071 attr |= (int64_t) 1 << LBP_SY;
7072
7073 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
7074 attr |= (int64_t) 1 << LBP_H2;
7075
7076 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
7077 attr |= (int64_t) 1 << LBP_H3;
7078
7079 if ((ch >= 0x05D0 && ch <= 0x05F2) || ch == 0xFB1D
7080 || (ch >= 0xFB1F && ch <= 0xFB28) || (ch >= 0xFB2A && ch <= 0xFB4F))
7081 attr |= (int64_t) 1 << LBP_HL;
7082
7083 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
7084 attr |= (int64_t) 1 << LBP_JL;
7085
7086 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
7087 attr |= (int64_t) 1 << LBP_JV;
7088
7089 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
7090 attr |= (int64_t) 1 << LBP_JT;
7091
7092
7093 if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
7094 attr |= (int64_t) 1 << LBP_RI;
7095
7096
7097 if (((unicode_attributes[ch].category[0] == 'C'
7098 && unicode_attributes[ch].category[1] == 'f')
7099 || (unicode_attributes[ch].category[0] == 'L'
7100 && (unicode_attributes[ch].category[1] == 'm'
7101 || unicode_attributes[ch].category[1] == 'o'))
7102 || (unicode_attributes[ch].category[0] == 'M'
7103 && (unicode_attributes[ch].category[1] == 'c'
7104 || unicode_attributes[ch].category[1] == 'n')
7105 && ch != 0x1A7F )
7106
7107 || ch == 0x109E
7108 || ch == 0x109F
7109 || ch == 0x19DA
7110 || ch == 0x19DE
7111 || ch == 0x19DF
7112 || (ch >= 0x1AA0 && ch <= 0x1AAD)
7113 || (ch >= 0xA9E0 && ch <= 0xA9EF)
7114 || (ch >= 0xA9FA && ch <= 0xA9FE)
7115 || (ch >= 0xAA77 && ch <= 0xAA79)
7116 || (ch >= 0xAADE && ch <= 0xAADF)
7117 || (ch >= 0x1173A && ch <= 0x1173B)
7118 || ch == 0x1173F )
7119 && ((ch >= 0x0E00 && ch <= 0x0EFF)
7120 || (ch >= 0x1000 && ch <= 0x109F)
7121 || (ch >= 0x1780 && ch <= 0x17FF)
7122 || (ch >= 0x1950 && ch <= 0x19DF)
7123 || (ch >= 0x1A20 && ch <= 0x1AAF)
7124 || (ch >= 0xA9E0 && ch <= 0xA9EF)
7125 || (ch >= 0xA9FA && ch <= 0xA9FE)
7126 || (ch >= 0xAA60 && ch <= 0xAADF)
7127 || (ch >= 0x11700 && ch <= 0x11719)
7128 || (ch >= 0x1171D && ch <= 0x1172B)
7129 || (ch >= 0x1173A && ch <= 0x1173B)
7130 || ch == 0x1173F ))
7131 attr |= (int64_t) 1 << LBP_SA;
7132
7133
7134 if ((unicode_attributes[ch].category[0] == 'M'
7135 && (unicode_attributes[ch].category[1] == 'c'
7136 || unicode_attributes[ch].category[1] == 'e'
7137 || unicode_attributes[ch].category[1] == 'n'))
7138 || (unicode_attributes[ch].category[0] == 'C'
7139 && (unicode_attributes[ch].category[1] == 'c'
7140 || unicode_attributes[ch].category[1] == 'f')
7141 && ch != 0x110BD
7142 && ch != 0x08E2 )
7143 || ch == 0x3035 )
7144 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW) | ((int64_t) 1 << LBP_ZWJ))))
7145 attr |= (int64_t) 1 << LBP_CM;
7146
7147
7148 if (ch == 0x231A
7149 || ch == 0x231B
7150 || ch == 0x23F0
7151 || ch == 0x23F1
7152 || ch == 0x23F2
7153 || ch == 0x23F3
7154 || ch == 0x2600
7155 || ch == 0x2601
7156 || ch == 0x2602
7157 || ch == 0x2603
7158 || ch == 0x2614
7159 || ch == 0x2615
7160 || ch == 0x2618
7161 || ch == 0x261A
7162 || ch == 0x261B
7163 || ch == 0x261C
7164 || ch == 0x261D
7165 || ch == 0x261E
7166 || ch == 0x261F
7167 || ch == 0x2639
7168 || ch == 0x263A
7169 || ch == 0x263B
7170 || ch == 0x2668
7171 || ch == 0x267F
7172 || ch == 0x26BD
7173 || ch == 0x26BE
7174 || ch == 0x26BF
7175 || ch == 0x26C0
7176 || ch == 0x26C1
7177 || ch == 0x26C2
7178 || ch == 0x26C3
7179 || ch == 0x26C4
7180 || ch == 0x26C5
7181 || ch == 0x26C6
7182 || ch == 0x26C7
7183 || ch == 0x26C8
7184 || ch == 0x26CD
7185 || ch == 0x26CF
7186 || ch == 0x26D0
7187 || ch == 0x26D1
7188 || ch == 0x26D3
7189 || ch == 0x26D4
7190 || ch == 0x26D8
7191 || ch == 0x26D9
7192 || ch == 0x26DC
7193 || ch == 0x26DF
7194 || ch == 0x26E0
7195 || ch == 0x26E1
7196 || ch == 0x26EA
7197 || ch == 0x26F1
7198 || ch == 0x26F2
7199 || ch == 0x26F3
7200 || ch == 0x26F4
7201 || ch == 0x26F5
7202 || ch == 0x26F7
7203 || ch == 0x26F8
7204 || ch == 0x26F9
7205 || ch == 0x26FA
7206 || ch == 0x26FD
7207 || ch == 0x26FE
7208 || ch == 0x26FF
7209 || ch == 0x2700
7210 || ch == 0x2701
7211 || ch == 0x2702
7212 || ch == 0x2703
7213 || ch == 0x2704
7214 || ch == 0x2708
7215 || ch == 0x2709
7216 || ch == 0x270A
7217 || ch == 0x270B
7218 || ch == 0x270C
7219 || ch == 0x270D
7220 || ch == 0x2764
7221 || (ch >= 0x2E80 && ch <= 0x2FFF)
7222 || (ch >= 0x3040 && ch <= 0x309F)
7223 || (ch >= 0x30A0 && ch <= 0x30FF)
7224 || (ch >= 0x3400 && ch <= 0x4DBF)
7225 || (ch >= 0x4E00 && ch <= 0x9FFF)
7226 || (ch >= 0xF900 && ch <= 0xFAD9)
7227 || (ch >= 0xA000 && ch <= 0xA48F)
7228 || (ch >= 0xA490 && ch <= 0xA4CF)
7229 || ch == 0xFE62
7230 || ch == 0xFE63
7231 || ch == 0xFE64
7232 || ch == 0xFE65
7233 || ch == 0xFE66
7234 || (ch >= 0xFF10 && ch <= 0xFF19)
7235 || (ch >= 0x20000 && ch <= 0x2A6D6)
7236 || (ch >= 0x2F800 && ch <= 0x2FA1D)
7237 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
7238 || (ch >= 0x3000 && ch <= 0x33FF
7239 && !(attr & (((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
7240
7241 || ch == 0xFE30
7242 || ch == 0xFE31
7243 || ch == 0xFE32
7244 || ch == 0xFE33
7245 || ch == 0xFE34
7246 || ch == 0xFE45
7247 || ch == 0xFE46
7248 || ch == 0xFE49
7249 || ch == 0xFE4A
7250 || ch == 0xFE4B
7251 || ch == 0xFE4C
7252 || ch == 0xFE4D
7253 || ch == 0xFE4E
7254 || ch == 0xFE4F
7255 || ch == 0xFE51
7256 || ch == 0xFE58
7257 || ch == 0xFE5F
7258 || ch == 0xFE60
7259 || ch == 0xFE61
7260 || ch == 0xFE68
7261 || ch == 0xFE6B
7262 || ch == 0xFF02
7263 || ch == 0xFF03
7264 || ch == 0xFF06
7265 || ch == 0xFF07
7266 || ch == 0xFF0A
7267 || ch == 0xFF0B
7268 || ch == 0xFF0D
7269 || ch == 0xFF0F
7270 || ch == 0xFF1C
7271 || ch == 0xFF1D
7272 || ch == 0xFF1E
7273 || ch == 0xFF20
7274 || ch == 0xFF3C
7275 || ch == 0xFF3E
7276 || ch == 0xFF3F
7277 || ch == 0xFF40
7278 || ch == 0xFF5C
7279 || ch == 0xFF5E
7280 || ch == 0xFFE2
7281 || ch == 0xFFE3
7282 || ch == 0xFFE4
7283
7284 || ch == 0xFF66
7285 || (ch >= 0xFF71 && ch <= 0xFF9D)
7286 || (ch >= 0xFFA0 && ch <= 0xFFBE)
7287 || (ch >= 0xFFC2 && ch <= 0xFFC7)
7288 || (ch >= 0xFFCA && ch <= 0xFFCF)
7289 || (ch >= 0xFFD2 && ch <= 0xFFD7)
7290 || (ch >= 0xFFDA && ch <= 0xFFDC)
7291 || (ch >= 0x17000 && ch <= 0x187EC)
7292 || (ch >= 0x18800 && ch <= 0x18AF2)
7293 || (ch >= 0x1B000 && ch <= 0x1B001)
7294 || (ch >= 0x1F000 && ch <= 0x1F02B)
7295 || (ch >= 0x1F030 && ch <= 0x1F093)
7296 || (ch >= 0x1F0A0 && ch <= 0x1F0F5)
7297 || (ch >= 0x1F200 && ch <= 0x1F248)
7298 || (ch >= 0x1F250 && ch <= 0x1F251)
7299 || (ch >= 0x1F300 && ch <= 0x1F5FF
7300 && ch != 0x1F3B5 && ch != 0x1F3B6 && ch != 0x1F3BC
7301 && ch != 0x1F4A0 && ch != 0x1F4A2 && ch != 0x1F4A4
7302 && ch != 0x1F4AF && ch != 0x1F4B1 && ch != 0x1F4B2
7303 && !(ch >= 0x1F39C && ch <= 0x1F39D)
7304 && !(ch >= 0x1F3FB && ch <= 0x1F3FF)
7305 && !(ch >= 0x1F500 && ch <= 0x1F506)
7306 && !(ch >= 0x1F517 && ch <= 0x1F524)
7307 && !(ch >= 0x1F532 && ch <= 0x1F549)
7308 && !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
7309 && !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
7310 || (ch >= 0x1F600 && ch <= 0x1F64F)
7311 || (ch >= 0x1F680 && ch <= 0x1F6DF)
7312 || (ch >= 0x1F6E0 && ch <= 0x1F6EC)
7313 || (ch >= 0x1F6F0 && ch <= 0x1F6F6)
7314 || (ch >= 0x1F900 && ch <= 0x1F9FF)
7315 || (ch >= 0x2A700 && ch <= 0x2B734)
7316 || (ch >= 0x2B740 && ch <= 0x2B81D)
7317 || (ch >= 0x2B820 && ch <= 0x2CEAF) )
7318 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_EB))))
7319 {
7320
7321 if ((unicode_width[ch] != NULL
7322 && unicode_width[ch][0] == 'A'
7323 && ch >= 0x2000
7324 && ch != 0x2614
7325 && ch != 0x2615
7326 && ch != 0x261C
7327 && ch != 0x261E
7328 && ch != 0x2668
7329 && ch != 0x26BE
7330 && ch != 0x26BF
7331 && !(ch >= 0x26C4 && ch <= 0x26C8)
7332 && ch != 0x26CD
7333 && ch != 0x26CF
7334 && ch != 0x26D0
7335 && ch != 0x26D1
7336 && ch != 0x26D3
7337 && ch != 0x26D4
7338 && ch != 0x26D8
7339 && ch != 0x26D9
7340 && ch != 0x26DC
7341 && ch != 0x26DF
7342 && ch != 0x26E0
7343 && ch != 0x26E1
7344 && ch != 0x26EA
7345 && !(ch >= 0x26F1 && ch <= 0x26F5)
7346 && !(ch >= 0x26F7 && ch <= 0x26FA)
7347 && !(ch >= 0x26FD && ch <= 0x26FF))
7348 || ch == 0x24EA
7349 || (ch >= 0x2780 && ch <= 0x2793) )
7350 attr |= (int64_t) 1 << LBP_AI;
7351 else
7352 attr |= (int64_t) 1 << LBP_ID;
7353 }
7354
7355
7356 if ((unicode_attributes[ch].category[0] == 'L'
7357 && (unicode_attributes[ch].category[1] == 'u'
7358 || unicode_attributes[ch].category[1] == 'l'
7359 || unicode_attributes[ch].category[1] == 't'
7360 || unicode_attributes[ch].category[1] == 'm'
7361 || unicode_attributes[ch].category[1] == 'o'))
7362 || (unicode_attributes[ch].category[0] == 'S'
7363 && (unicode_attributes[ch].category[1] == 'm'
7364 || unicode_attributes[ch].category[1] == 'k'
7365 || unicode_attributes[ch].category[1] == 'o'))
7366 || (unicode_attributes[ch].category[0] == 'N'
7367 && (unicode_attributes[ch].category[1] == 'l'
7368 || unicode_attributes[ch].category[1] == 'o'))
7369 || (unicode_attributes[ch].category[0] == 'P'
7370 && (unicode_attributes[ch].category[1] == 'c'
7371 || unicode_attributes[ch].category[1] == 'd'
7372 || unicode_attributes[ch].category[1] == 'o'))
7373 || ch == 0x0600
7374 || ch == 0x0601
7375 || ch == 0x0602
7376 || ch == 0x0603
7377 || ch == 0x0604
7378 || ch == 0x0605
7379 || ch == 0x06DD
7380 || ch == 0x070F
7381 || ch == 0x08E2
7382 || ch == 0x2061
7383 || ch == 0x2062
7384 || ch == 0x2063
7385 || ch == 0x2064
7386
7387 || ch == 0x110BD )
7388 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM)))
7389 && ch != 0x3035 )
7390 {
7391
7392 if ((unicode_width[ch] != NULL
7393 && unicode_width[ch][0] == 'A'
7394 && ch >= 0x2000
7395
7396 && ch != 0x2022
7397 && ch != 0x203E
7398 && ch != 0x2126
7399 && ch != 0x2153
7400 && ch != 0x215C
7401 && ch != 0x215D
7402 && ch != 0x21B8
7403 && ch != 0x21B9
7404 && ch != 0x21E7
7405 && ch != 0x24FF
7406 && ch != 0x273D )
7407 || ch == 0x00A7
7408 || ch == 0x00A8
7409 || ch == 0x00AA
7410 || ch == 0x00B2
7411 || ch == 0x00B3
7412 || ch == 0x00B6
7413 || ch == 0x00B7
7414 || ch == 0x00B8
7415 || ch == 0x00B9
7416 || ch == 0x00BA
7417 || ch == 0x00BC
7418 || ch == 0x00BD
7419 || ch == 0x00BE
7420 || ch == 0x00D7
7421 || ch == 0x00F7
7422 || ch == 0x02C7
7423 || ch == 0x02C9
7424 || ch == 0x02CA
7425 || ch == 0x02CB
7426 || ch == 0x02CD
7427 || ch == 0x02D0
7428 || ch == 0x02D8
7429 || ch == 0x02D9
7430 || ch == 0x02DA
7431 || ch == 0x02DB
7432 || ch == 0x02DD
7433 || ch == 0x24EA
7434 || (ch >= 0x2780 && ch <= 0x2793)
7435
7436 || ch == 0x2155
7437 || ch == 0x2574
7438 || ch == 0x2616
7439 || ch == 0x2617
7440 || ch == 0x2757
7441 || ch == 0x2B55
7442 || ch == 0x1F10B
7443 || ch == 0x1F18E
7444 || (ch >= 0x1F191 && ch <= 0x1F19A)
7445 || ch == 0x1F10C )
7446 attr |= (int64_t) 1 << LBP_AI;
7447 else
7448 attr |= (int64_t) 1 << LBP_AL;
7449 attr &= ~((int64_t) 1 << LBP_CM);
7450 }
7451 }
7452 else
7453 {
7454
7455 if ((ch >= 0x3400 && ch <= 0x4DBF)
7456 || (ch >= 0x4E00 && ch <= 0x9FFF)
7457 || (ch >= 0xF900 && ch <= 0xFAFF)
7458 || (ch >= 0x1F02C && ch <= 0x1F02F)
7459 || (ch >= 0x1F094 && ch <= 0x1F09F)
7460 || (ch >= 0x1F0AF && ch <= 0x1F0B0)
7461 || ch == 0x1F0C0
7462 || ch == 0x1F0D0
7463 || (ch >= 0x1F0F6 && ch <= 0x1F0FF)
7464 || (ch >= 0x1F10D && ch <= 0x1F10F)
7465 || ch == 0x1F12F
7466 || (ch >= 0x1F16C && ch <= 0x1F16F)
7467 || (ch >= 0x1F1AD && ch <= 0x1F1E5)
7468 || (ch >= 0x1F203 && ch <= 0x1F20F)
7469 || (ch >= 0x1F23C && ch <= 0x1F23F)
7470 || (ch >= 0x1F249 && ch <= 0x1F24F)
7471 || (ch >= 0x1F252 && ch <= 0x1F2FF)
7472 || (ch >= 0x1F6D3 && ch <= 0x1F6DF)
7473 || (ch >= 0x1F6ED && ch <= 0x1F6EF)
7474 || (ch >= 0x1F6F7 && ch <= 0x1F6FF)
7475 || (ch >= 0x1F774 && ch <= 0x1F77F)
7476 || (ch >= 0x1F7D5 && ch <= 0x1F7FF)
7477 || (ch >= 0x1F80C && ch <= 0x1F80F)
7478 || (ch >= 0x1F848 && ch <= 0x1F84F)
7479 || (ch >= 0x1F85A && ch <= 0x1F85F)
7480 || (ch >= 0x1F888 && ch <= 0x1F88F)
7481 || (ch >= 0x1F8AE && ch <= 0x1F90F)
7482 || ch == 0x1F91F
7483 || ch == 0x1F93F
7484 || (ch >= 0x1F928 && ch <= 0x1F92F)
7485 || (ch >= 0x1F931 && ch <= 0x1F932)
7486 || (ch >= 0x1F94C && ch <= 0x1F94F)
7487 || (ch >= 0x1F95F && ch <= 0x1F97F)
7488 || (ch >= 0x1F992 && ch <= 0x1F9BF)
7489 || (ch >= 0x1F9C1 && ch <= 0x1FFFD)
7490 || (ch >= 0x20000 && ch <= 0x2A6FF)
7491 || (ch >= 0x2A700 && ch <= 0x2F7FF)
7492
7493 || (ch >= 0x2F800 && ch <= 0x2FFFD)
7494
7495 || (ch >= 0x30000 && ch <= 0x3FFFD) )
7496 attr |= (int64_t) 1 << LBP_ID;
7497 }
7498
7499 if (attr == 0)
7500
7501 attr |= (int64_t) 1 << LBP_XX;
7502
7503 return attr;
7504 }
7505
7506
7507 static void
7508 debug_output_lbp (FILE *stream)
7509 {
7510 unsigned int i;
7511
7512 for (i = 0; i < 0x110000; i++)
7513 {
7514 int64_t attr = get_lbp (i);
7515 if (attr != (int64_t) 1 << LBP_XX)
7516 {
7517 fprintf (stream, "0x%04X", i);
7518 #define PRINT_BIT(attr,bit) \
7519 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
7520 PRINT_BIT(attr,LBP_BK);
7521 PRINT_BIT(attr,LBP_CM);
7522 PRINT_BIT(attr,LBP_WJ);
7523 PRINT_BIT(attr,LBP_ZW);
7524 PRINT_BIT(attr,LBP_GL);
7525 PRINT_BIT(attr,LBP_SP);
7526 PRINT_BIT(attr,LBP_B2);
7527 PRINT_BIT(attr,LBP_BA);
7528 PRINT_BIT(attr,LBP_BB);
7529 PRINT_BIT(attr,LBP_HY);
7530 PRINT_BIT(attr,LBP_CB);
7531 PRINT_BIT(attr,LBP_CL);
7532 PRINT_BIT(attr,LBP_CP);
7533 PRINT_BIT(attr,LBP_EX);
7534 PRINT_BIT(attr,LBP_IN);
7535 PRINT_BIT(attr,LBP_NS);
7536 PRINT_BIT(attr,LBP_OP);
7537 PRINT_BIT(attr,LBP_QU);
7538 PRINT_BIT(attr,LBP_IS);
7539 PRINT_BIT(attr,LBP_NU);
7540 PRINT_BIT(attr,LBP_PO);
7541 PRINT_BIT(attr,LBP_PR);
7542 PRINT_BIT(attr,LBP_SY);
7543 PRINT_BIT(attr,LBP_AI);
7544 PRINT_BIT(attr,LBP_AL);
7545 PRINT_BIT(attr,LBP_H2);
7546 PRINT_BIT(attr,LBP_H3);
7547 PRINT_BIT(attr,LBP_HL);
7548 PRINT_BIT(attr,LBP_ID);
7549 PRINT_BIT(attr,LBP_JL);
7550 PRINT_BIT(attr,LBP_JV);
7551 PRINT_BIT(attr,LBP_JT);
7552 PRINT_BIT(attr,LBP_RI);
7553 PRINT_BIT(attr,LBP_SA);
7554 PRINT_BIT(attr,LBP_ZWJ);
7555 PRINT_BIT(attr,LBP_EB);
7556 PRINT_BIT(attr,LBP_EM);
7557 PRINT_BIT(attr,LBP_XX);
7558 #undef PRINT_BIT
7559 fprintf (stream, "\n");
7560 }
7561 }
7562 }
7563
/* Create (or truncate) the file FILENAME and fill it with the output of
   debug_output_lbp.  If the file cannot be opened, or a write error is
   detected, print a message to stderr and exit with status 1.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");
  int failed;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* As before, the stream is only closed when no earlier error is
     pending on it.  */
  failed = ferror (out);
  if (!failed)
    failed = fclose (out);
  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7584
7585
7586 int unicode_org_lbp[0x110000];
7587
7588
7589
7590 static void
7591 fill_org_lbp (const char *linebreak_filename)
7592 {
7593 unsigned int i, j;
7594 FILE *stream;
7595 char field0[FIELDLEN];
7596 char field1[FIELDLEN];
7597 char field2[FIELDLEN];
7598 int lineno = 0;
7599
7600 for (i = 0; i < 0x110000; i++)
7601 unicode_org_lbp[i] = LBP_XX;
7602
7603 stream = fopen (linebreak_filename, "r");
7604 if (stream == NULL)
7605 {
7606 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
7607 exit (1);
7608 }
7609
7610 for (;;)
7611 {
7612 int n;
7613 int c;
7614 int value;
7615
7616 lineno++;
7617 c = getc (stream);
7618 if (c == EOF)
7619 break;
7620 if (c == '#')
7621 {
7622 do c = getc (stream); while (c != EOF && c != '\n');
7623 continue;
7624 }
7625 ungetc (c, stream);
7626 n = getfield (stream, field0, ';');
7627 n += getfield (stream, field1, ' ');
7628 n += getfield (stream, field2, '\n');
7629 if (n == 0)
7630 break;
7631 if (n != 3)
7632 {
7633 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
7634 lineno);
7635 exit (1);
7636 }
7637 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
7638 if (false) {}
7639 TRY(LBP_BK)
7640 TRY(LBP_CM)
7641 TRY(LBP_WJ)
7642 TRY(LBP_ZW)
7643 TRY(LBP_GL)
7644 TRY(LBP_SP)
7645 TRY(LBP_B2)
7646 TRY(LBP_BA)
7647 TRY(LBP_BB)
7648 TRY(LBP_HY)
7649 TRY(LBP_CB)
7650 TRY(LBP_CL)
7651 TRY(LBP_CP)
7652 TRY(LBP_EX)
7653 TRY(LBP_IN)
7654 TRY(LBP_NS)
7655 TRY(LBP_OP)
7656 TRY(LBP_QU)
7657 TRY(LBP_IS)
7658 TRY(LBP_NU)
7659 TRY(LBP_PO)
7660 TRY(LBP_PR)
7661 TRY(LBP_SY)
7662 TRY(LBP_AI)
7663 TRY(LBP_AL)
7664 TRY(LBP_H2)
7665 TRY(LBP_H3)
7666 TRY(LBP_HL)
7667 TRY(LBP_ID)
7668 TRY(LBP_JL)
7669 TRY(LBP_JV)
7670 TRY(LBP_JT)
7671 TRY(LBP_RI)
7672 TRY(LBP_SA)
7673 TRY(LBP_ZWJ)
7674 TRY(LBP_EB)
7675 TRY(LBP_EM)
7676 TRY(LBP_XX)
7677 #undef TRY
7678 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
7679 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
7680 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
7681 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
7682 else if (strcmp (field1, "CJ") == 0) value = LBP_NS;
7683 else
7684 {
7685 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
7686 field1, linebreak_filename, lineno);
7687 exit (1);
7688 }
7689 i = strtoul (field0, NULL, 16);
7690 if (strstr (field0, "..") != NULL)
7691 {
7692
7693 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
7694 for (; i <= j; i++)
7695 unicode_org_lbp[i] = value;
7696 }
7697 else
7698 {
7699
7700 unicode_org_lbp[i] = value;
7701 }
7702 }
7703
7704 if (ferror (stream) || fclose (stream))
7705 {
7706 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
7707 exit (1);
7708 }
7709 }
7710
7711
7712 static void
7713 debug_output_org_lbp (FILE *stream)
7714 {
7715 unsigned int i;
7716
7717 for (i = 0; i < 0x110000; i++)
7718 {
7719 int attr = unicode_org_lbp[i];
7720 if (attr != LBP_XX)
7721 {
7722 fprintf (stream, "0x%04X", i);
7723 #define PRINT_BIT(attr,bit) \
7724 if (attr == bit) fprintf (stream, " " #bit);
7725 PRINT_BIT(attr,LBP_BK);
7726 PRINT_BIT(attr,LBP_CM);
7727 PRINT_BIT(attr,LBP_WJ);
7728 PRINT_BIT(attr,LBP_ZW);
7729 PRINT_BIT(attr,LBP_GL);
7730 PRINT_BIT(attr,LBP_SP);
7731 PRINT_BIT(attr,LBP_B2);
7732 PRINT_BIT(attr,LBP_BA);
7733 PRINT_BIT(attr,LBP_BB);
7734 PRINT_BIT(attr,LBP_HY);
7735 PRINT_BIT(attr,LBP_CB);
7736 PRINT_BIT(attr,LBP_CL);
7737 PRINT_BIT(attr,LBP_CP);
7738 PRINT_BIT(attr,LBP_EX);
7739 PRINT_BIT(attr,LBP_IN);
7740 PRINT_BIT(attr,LBP_NS);
7741 PRINT_BIT(attr,LBP_OP);
7742 PRINT_BIT(attr,LBP_QU);
7743 PRINT_BIT(attr,LBP_IS);
7744 PRINT_BIT(attr,LBP_NU);
7745 PRINT_BIT(attr,LBP_PO);
7746 PRINT_BIT(attr,LBP_PR);
7747 PRINT_BIT(attr,LBP_SY);
7748 PRINT_BIT(attr,LBP_AI);
7749 PRINT_BIT(attr,LBP_AL);
7750 PRINT_BIT(attr,LBP_H2);
7751 PRINT_BIT(attr,LBP_H3);
7752 PRINT_BIT(attr,LBP_HL);
7753 PRINT_BIT(attr,LBP_ID);
7754 PRINT_BIT(attr,LBP_JL);
7755 PRINT_BIT(attr,LBP_JV);
7756 PRINT_BIT(attr,LBP_JT);
7757 PRINT_BIT(attr,LBP_RI);
7758 PRINT_BIT(attr,LBP_SA);
7759 PRINT_BIT(attr,LBP_ZWJ);
7760 PRINT_BIT(attr,LBP_EB);
7761 PRINT_BIT(attr,LBP_EM);
7762 PRINT_BIT(attr,LBP_XX);
7763 #undef PRINT_BIT
7764 fprintf (stream, "\n");
7765 }
7766 }
7767 }
7768
/* Create (or truncate) the file FILENAME and fill it with the output of
   debug_output_org_lbp.  If the file cannot be opened, or a write error
   is detected, print a message to stderr and exit with status 1.  */
static void
debug_output_org_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");
  int failed;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* As before, the stream is only closed when no earlier error is
     pending on it.  */
  failed = ferror (out);
  if (!failed)
    failed = fclose (out);
  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7789
7790
7791 #define TABLE lbp_table
7792 #define ELEMENT unsigned char
7793 #define DEFAULT LBP_XX
7794 #define xmalloc malloc
7795 #define xrealloc realloc
7796 #include "3level.h"
7797
7798 static void
7799 output_lbp (FILE *stream1, FILE *stream2)
7800 {
7801 unsigned int i;
7802 struct lbp_table t;
7803 unsigned int level1_offset, level2_offset, level3_offset;
7804
7805 t.p = 7;
7806 t.q = 9;
7807 lbp_table_init (&t);
7808
7809 for (i = 0; i < 0x110000; i++)
7810 {
7811 int64_t attr = get_lbp (i);
7812
7813
7814 assert (attr != 0 && (attr & (attr - 1)) == 0);
7815
7816 if (attr != (int64_t) 1 << LBP_XX)
7817 {
7818 unsigned int log2_attr;
7819 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7820
7821 lbp_table_add (&t, i, log2_attr);
7822 }
7823 }
7824
7825 lbp_table_finalize (&t);
7826
7827 level1_offset =
7828 5 * sizeof (uint32_t);
7829 level2_offset =
7830 5 * sizeof (uint32_t)
7831 + t.level1_size * sizeof (uint32_t);
7832 level3_offset =
7833 5 * sizeof (uint32_t)
7834 + t.level1_size * sizeof (uint32_t)
7835 + (t.level2_size << t.q) * sizeof (uint32_t);
7836
7837 for (i = 0; i < 5; i++)
7838 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
7839 ((uint32_t *) t.result)[i]);
7840 fprintf (stream1, "\n");
7841 fprintf (stream1, "typedef struct\n");
7842 fprintf (stream1, " {\n");
7843 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7844 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7845 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7846 fprintf (stream1, " }\n");
7847 fprintf (stream1, "lbrkprop_t;\n");
7848 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
7849
7850 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
7851 fprintf (stream2, "{\n");
7852 fprintf (stream2, " {");
7853 if (t.level1_size > 8)
7854 fprintf (stream2, "\n ");
7855 for (i = 0; i < t.level1_size; i++)
7856 {
7857 uint32_t offset;
7858 if (i > 0 && (i % 8) == 0)
7859 fprintf (stream2, "\n ");
7860 offset = ((uint32_t *) (t.result + level1_offset))[i];
7861 if (offset == 0)
7862 fprintf (stream2, " %5d", -1);
7863 else
7864 fprintf (stream2, " %5zu",
7865 (offset - level2_offset) / sizeof (uint32_t));
7866 if (i+1 < t.level1_size)
7867 fprintf (stream2, ",");
7868 }
7869 if (t.level1_size > 8)
7870 fprintf (stream2, "\n ");
7871 fprintf (stream2, " },\n");
7872 fprintf (stream2, " {");
7873 if (t.level2_size << t.q > 8)
7874 fprintf (stream2, "\n ");
7875 for (i = 0; i < t.level2_size << t.q; i++)
7876 {
7877 uint32_t offset;
7878 if (i > 0 && (i % 8) == 0)
7879 fprintf (stream2, "\n ");
7880 offset = ((uint32_t *) (t.result + level2_offset))[i];
7881 if (offset == 0)
7882 fprintf (stream2, " %5d", -1);
7883 else
7884 fprintf (stream2, " %5zu",
7885 (offset - level3_offset) / sizeof (unsigned char));
7886 if (i+1 < t.level2_size << t.q)
7887 fprintf (stream2, ",");
7888 }
7889 if (t.level2_size << t.q > 8)
7890 fprintf (stream2, "\n ");
7891 fprintf (stream2, " },\n");
7892 fprintf (stream2, " {");
7893 if (t.level3_size << t.p > 8)
7894 fprintf (stream2, "\n ");
7895 for (i = 0; i < t.level3_size << t.p; i++)
7896 {
7897 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7898 const char *value_string;
7899 switch (value)
7900 {
7901 #define CASE(x) case x: value_string = #x; break;
7902 CASE(LBP_BK);
7903 CASE(LBP_CM);
7904 CASE(LBP_WJ);
7905 CASE(LBP_ZW);
7906 CASE(LBP_GL);
7907 CASE(LBP_SP);
7908 CASE(LBP_B2);
7909 CASE(LBP_BA);
7910 CASE(LBP_BB);
7911 CASE(LBP_HY);
7912 CASE(LBP_CB);
7913 CASE(LBP_CL);
7914 CASE(LBP_CP);
7915 CASE(LBP_EX);
7916 CASE(LBP_IN);
7917 CASE(LBP_NS);
7918 CASE(LBP_OP);
7919 CASE(LBP_QU);
7920 CASE(LBP_IS);
7921 CASE(LBP_NU);
7922 CASE(LBP_PO);
7923 CASE(LBP_PR);
7924 CASE(LBP_SY);
7925 CASE(LBP_AI);
7926 CASE(LBP_AL);
7927 CASE(LBP_H2);
7928 CASE(LBP_H3);
7929 CASE(LBP_HL);
7930 CASE(LBP_ID);
7931 CASE(LBP_JL);
7932 CASE(LBP_JV);
7933 CASE(LBP_JT);
7934 CASE(LBP_RI);
7935 CASE(LBP_SA);
7936 CASE(LBP_ZWJ);
7937 CASE(LBP_EB);
7938 CASE(LBP_EM);
7939 CASE(LBP_XX);
7940 #undef CASE
7941 default:
7942 abort ();
7943 }
7944 if (i > 0 && (i % 8) == 0)
7945 fprintf (stream2, "\n ");
7946 fprintf (stream2, " %s%s", value_string,
7947 (i+1 < t.level3_size << t.p ? "," : ""));
7948 }
7949 if (t.level3_size << t.p > 8)
7950 fprintf (stream2, "\n ");
7951 fprintf (stream2, " }\n");
7952 fprintf (stream2, "};\n");
7953 }
7954
/* Generate the two line breaking table files.  FILENAME1 and FILENAME2
   are created (or truncated); each gets the same leading comment, which
   mentions VERSION, followed by the text printed by
   output_library_license.  output_lbp then writes its first stream to
   FILENAME1 and its second stream to FILENAME2.  If a file cannot be
   opened or a write error is detected, a message is printed to stderr
   and the program exits with status 1.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *paths[2];
  FILE *outs[2];
  size_t k;

  paths[0] = filename1;
  paths[1] = filename2;

  /* Open both files before anything is written to either of them.  */
  for (k = 0; k < 2; k++)
    {
      outs[k] = fopen (paths[k], "w");
      if (outs[k] != NULL)
        continue;
      fprintf (stderr, "cannot open '%s' for writing\n", paths[k]);
      exit (1);
    }

  /* Identical preamble for both files.  */
  for (k = 0; k < 2; k++)
    {
      FILE *out = outs[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Line breaking properties of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);

      /* The comment opened here is not closed by this function;
         presumably output_library_license emits the closing part.  */
      fputs ("/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n", out);
      fputs ("\n", out);
      output_library_license (out, false);
      fputs ("\n", out);
    }

  output_lbp (outs[0], outs[1]);

  for (k = 0; k < 2; k++)
    {
      /* As before, a stream is only closed when no earlier error is
         pending on it.  */
      int failed = ferror (outs[k]);
      if (!failed)
        failed = fclose (outs[k]);
      if (failed)
        {
          fprintf (stderr, "error writing to '%s'\n", paths[k]);
          exit (1);
        }
    }
}
8002
8003
8004
8005
8006
8007
8008
/* Bit numbers of the flags that get_wbp combines into its result (each
   one is used there as 1 << WBP_xxx).  Listed in increasing numeric
   order; the values themselves are fixed.  */
enum
{
  WBP_OTHER        = 0,
  WBP_KATAKANA     = 1,
  WBP_ALETTER      = 2,
  WBP_MIDNUMLET    = 3,
  WBP_MIDLETTER    = 4,
  WBP_MIDNUM       = 5,
  WBP_NUMERIC      = 6,
  WBP_EXTENDNUMLET = 7,
  WBP_EXTEND       = 8,
  WBP_FORMAT       = 9,
  WBP_NEWLINE      = 10,
  WBP_CR           = 11,
  WBP_LF           = 12,
  WBP_RI           = 13,
  WBP_DQ           = 14,
  WBP_SQ           = 15,
  WBP_HL           = 16,
  WBP_ZWJ          = 17,
  WBP_EB           = 18,
  WBP_EM           = 19,
  WBP_GAZ          = 20,
  WBP_EBG          = 21
};
8034
8035
8036 static int
8037 get_wbp (unsigned int ch)
8038 {
8039 int attr = 0;
8040
8041 if (unicode_attributes[ch].name != NULL)
8042 {
8043 if (ch == 0x000D)
8044 attr |= 1 << WBP_CR;
8045
8046 if (ch == 0x000A)
8047 attr |= 1 << WBP_LF;
8048
8049 if (ch == 0x000B || ch == 0x000C
8050 || ch == 0x0085
8051 || ch == 0x2028 || ch == 0x2029)
8052 attr |= 1 << WBP_NEWLINE;
8053
8054 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
8055 || ((unicode_properties[ch] >> PROP_OTHER_GRAPHEME_EXTEND) & 1) != 0
8056 || (unicode_attributes[ch].category != NULL
8057 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
8058 attr |= 1 << WBP_EXTEND;
8059
8060 if (unicode_attributes[ch].category != NULL
8061 && strcmp (unicode_attributes[ch].category, "Cf") == 0
8062 && ch != 0x200B && ch != 0x200C && ch != 0x200D
8063 && !(ch >= 0xe0020 && ch <= 0xe007f))
8064 attr |= 1 << WBP_FORMAT;
8065
8066 if ((unicode_scripts[ch] < numscripts
8067 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
8068 || (ch >= 0x3031 && ch <= 0x3035)
8069 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
8070 || ch == 0xFF70)
8071 attr |= 1 << WBP_KATAKANA;
8072
8073 if ((unicode_scripts[ch] < numscripts
8074 && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
8075 && strcmp (unicode_attributes[ch].category, "Lo") == 0)
8076 attr |= 1 << WBP_HL;
8077
8078 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
8079 || ch == 0x05F3)
8080 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
8081 && (attr & (1 << WBP_KATAKANA)) == 0
8082 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
8083 && !(unicode_scripts[ch] < numscripts
8084 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
8085 && (attr & (1 << WBP_EXTEND)) == 0
8086 && (attr & (1 << WBP_HL)) == 0)
8087 attr |= 1 << WBP_ALETTER;
8088
8089 if (is_WBP_MIDNUMLET (ch))
8090 attr |= 1 << WBP_MIDNUMLET;
8091
8092 if (is_WBP_MIDLETTER (ch))
8093 attr |= 1 << WBP_MIDLETTER;
8094
8095 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
8096 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
8097 || ch == 0xFF1B)
8098 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
8099 attr |= 1 << WBP_MIDNUM;
8100
8101 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
8102 && ch != 0x066C)
8103 attr |= 1 << WBP_NUMERIC;
8104
8105 if ((unicode_attributes[ch].category != NULL
8106 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
8107 || ch == 0x202F )
8108 attr |= 1 << WBP_EXTENDNUMLET;
8109
8110 if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
8111 attr |= 1 << WBP_RI;
8112
8113 if (ch == 0x0022)
8114 attr |= 1 << WBP_DQ;
8115
8116 if (ch == 0x0027)
8117 attr |= 1 << WBP_SQ;
8118
8119 if (ch == 0x200D)
8120 attr |= 1 << WBP_ZWJ;
8121
8122 if (ch >= 0x1F466 && ch <= 0x1F469)
8123 attr |= 1 << WBP_EBG;
8124 else if (((get_lbp (ch) >> LBP_EB) & 1) != 0)
8125 attr |= 1 << WBP_EB;
8126
8127 if (((get_lbp (ch) >> LBP_EM) & 1) != 0)
8128 attr |= 1 << WBP_EM;
8129
8130 if (ch == 0x2764 || ch == 0x1F48B || ch == 0x1F5E8)
8131 attr |= 1 << WBP_GAZ;
8132 }
8133
8134 if (attr == 0)
8135
8136 attr |= 1 << WBP_OTHER;
8137
8138 return attr;
8139 }
8140
8141
8142 static void
8143 debug_output_wbp (FILE *stream)
8144 {
8145 unsigned int i;
8146
8147 for (i = 0; i < 0x110000; i++)
8148 {
8149 int attr = get_wbp (i);
8150 if (attr != 1 << WBP_OTHER)
8151 {
8152 fprintf (stream, "0x%04X", i);
8153 if (attr & (1 << WBP_CR))
8154 fprintf (stream, " CR");
8155 if (attr & (1 << WBP_LF))
8156 fprintf (stream, " LF");
8157 if (attr & (1 << WBP_NEWLINE))
8158 fprintf (stream, " Newline");
8159 if (attr & (1 << WBP_EXTEND))
8160 fprintf (stream, " Extend");
8161 if (attr & (1 << WBP_FORMAT))
8162 fprintf (stream, " Format");
8163 if (attr & (1 << WBP_KATAKANA))
8164 fprintf (stream, " Katakana");
8165 if (attr & (1 << WBP_ALETTER))
8166 fprintf (stream, " ALetter");
8167 if (attr & (1 << WBP_MIDNUMLET))
8168 fprintf (stream, " MidNumLet");
8169 if (attr & (1 << WBP_MIDLETTER))
8170 fprintf (stream, " MidLetter");
8171 if (attr & (1 << WBP_MIDNUM))
8172 fprintf (stream, " MidNum");
8173 if (attr & (1 << WBP_NUMERIC))
8174 fprintf (stream, " Numeric");
8175 if (attr & (1 << WBP_EXTENDNUMLET))
8176 fprintf (stream, " ExtendNumLet");
8177 if (attr & (1 << WBP_RI))
8178 fprintf (stream, " Regional_Indicator");
8179 if (attr & (1 << WBP_DQ))
8180 fprintf (stream, " Double_Quote");
8181 if (attr & (1 << WBP_SQ))
8182 fprintf (stream, " Single_Quote");
8183 if (attr & (1 << WBP_HL))
8184 fprintf (stream, " Hebrew_Letter");
8185 if (attr & (1 << WBP_ZWJ))
8186 fprintf (stream, " ZWJ");
8187 if (attr & (1 << WBP_EB))
8188 fprintf (stream, " E_Base");
8189 if (attr & (1 << WBP_EM))
8190 fprintf (stream, " E_Modifier");
8191 if (attr & (1 << WBP_GAZ))
8192 fprintf (stream, " Glue_After_Zwj");
8193 if (attr & (1 << WBP_EBG))
8194 fprintf (stream, " E_Base_GAZ");
8195 fprintf (stream, "\n");
8196 }
8197 }
8198 }
8199
/* Creates (or truncates) FILENAME and writes the debug_output_wbp dump
   into it.  Exits with status 1 if the file cannot be opened or written.  */
static void
debug_output_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_wbp (out);

  /* fclose is attempted only when no write error has been recorded.  */
  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8220
8221
/* Word break property (a WBP_* value, not a bit mask) of every code point,
   as read from the word break property data file by fill_org_wbp.
   Code points not mentioned in that file hold WBP_OTHER.  */
int unicode_org_wbp[0x110000];
8223
8224
8225
8226 static void
8227 fill_org_wbp (const char *wordbreakproperty_filename)
8228 {
8229 unsigned int i;
8230 FILE *stream;
8231
8232 for (i = 0; i < 0x110000; i++)
8233 unicode_org_wbp[i] = WBP_OTHER;
8234
8235 stream = fopen (wordbreakproperty_filename, "r");
8236 if (stream == NULL)
8237 {
8238 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
8239 exit (1);
8240 }
8241
8242 for (;;)
8243 {
8244 char buf[200+1];
8245 unsigned int i1, i2;
8246 char padding[200+1];
8247 char propname[200+1];
8248 int propvalue;
8249
8250 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8251 break;
8252
8253 if (buf[0] == '\0' || buf[0] == '#')
8254 continue;
8255
8256 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8257 {
8258 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8259 {
8260 fprintf (stderr, "parse error in '%s'\n",
8261 wordbreakproperty_filename);
8262 exit (1);
8263 }
8264 i2 = i1;
8265 }
8266 #define PROP(name,value) \
8267 if (strcmp (propname, name) == 0) propvalue = value; else
8268 PROP ("CR", WBP_CR)
8269 PROP ("LF", WBP_LF)
8270 PROP ("Newline", WBP_NEWLINE)
8271 PROP ("Extend", WBP_EXTEND)
8272 PROP ("Format", WBP_FORMAT)
8273 PROP ("Katakana", WBP_KATAKANA)
8274 PROP ("ALetter", WBP_ALETTER)
8275 PROP ("MidNumLet", WBP_MIDNUMLET)
8276 PROP ("MidLetter", WBP_MIDLETTER)
8277 PROP ("MidNum", WBP_MIDNUM)
8278 PROP ("Numeric", WBP_NUMERIC)
8279 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
8280 PROP ("Regional_Indicator", WBP_RI)
8281 PROP ("Double_Quote", WBP_DQ)
8282 PROP ("Single_Quote", WBP_SQ)
8283 PROP ("Hebrew_Letter", WBP_HL)
8284 PROP ("ZWJ", WBP_ZWJ)
8285 PROP ("E_Base", WBP_EB)
8286 PROP ("E_Modifier", WBP_EM)
8287 PROP ("Glue_After_Zwj", WBP_GAZ)
8288 PROP ("E_Base_GAZ", WBP_EBG)
8289 #undef PROP
8290 {
8291 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
8292 wordbreakproperty_filename);
8293 exit (1);
8294 }
8295 assert (i1 <= i2 && i2 < 0x110000);
8296
8297 for (i = i1; i <= i2; i++)
8298 unicode_org_wbp[i] = propvalue;
8299 }
8300
8301 if (ferror (stream) || fclose (stream))
8302 {
8303 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
8304 exit (1);
8305 }
8306 }
8307
8308
8309 static void
8310 debug_output_org_wbp (FILE *stream)
8311 {
8312 unsigned int i;
8313
8314 for (i = 0; i < 0x110000; i++)
8315 {
8316 int propvalue = unicode_org_wbp[i];
8317 if (propvalue != WBP_OTHER)
8318 {
8319 fprintf (stream, "0x%04X", i);
8320 #define PROP(name,value) \
8321 if (propvalue == value) fprintf (stream, " " name); else
8322 PROP ("CR", WBP_CR)
8323 PROP ("LF", WBP_LF)
8324 PROP ("Newline", WBP_NEWLINE)
8325 PROP ("Extend", WBP_EXTEND)
8326 PROP ("Format", WBP_FORMAT)
8327 PROP ("Katakana", WBP_KATAKANA)
8328 PROP ("ALetter", WBP_ALETTER)
8329 PROP ("MidNumLet", WBP_MIDNUMLET)
8330 PROP ("MidLetter", WBP_MIDLETTER)
8331 PROP ("MidNum", WBP_MIDNUM)
8332 PROP ("Numeric", WBP_NUMERIC)
8333 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
8334 PROP ("Regional_Indicator", WBP_RI)
8335 PROP ("Double_Quote", WBP_DQ)
8336 PROP ("Single_Quote", WBP_SQ)
8337 PROP ("Hebrew_Letter", WBP_HL)
8338 PROP ("ZWJ", WBP_ZWJ)
8339 PROP ("E_Base", WBP_EB)
8340 PROP ("E_Modifier", WBP_EM)
8341 PROP ("Glue_After_Zwj", WBP_GAZ)
8342 PROP ("E_Base_GAZ", WBP_EBG)
8343 #undef PROP
8344 fprintf (stream, " ??");
8345 fprintf (stream, "\n");
8346 }
8347 }
8348 }
8349
/* Creates (or truncates) FILENAME and writes the debug_output_org_wbp dump
   into it.  Exits with status 1 if the file cannot be opened or written.  */
static void
debug_output_org_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_wbp (out);

  /* fclose is attempted only when no write error has been recorded.  */
  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8370
8371
/* Instantiate the table template of "3level.h" for the word break
   property.  output_wbp uses the resulting 'struct wbp_table' together
   with wbp_table_init, wbp_table_add and wbp_table_finalize.
   Elements are single bytes and unset entries read as WBP_OTHER.
   xmalloc/xrealloc are mapped to plain malloc/realloc, i.e. without an
   out-of-memory check added here.
   NOTE(review): presumably 3level.h consumes TABLE, ELEMENT and DEFAULT;
   confirm against that header.  */
#define TABLE wbp_table
#define ELEMENT unsigned char
#define DEFAULT WBP_OTHER
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
8378
8379 static void
8380 output_wbp (FILE *stream)
8381 {
8382 unsigned int i;
8383 struct wbp_table t;
8384 unsigned int level1_offset, level2_offset, level3_offset;
8385
8386 t.p = 7;
8387 t.q = 9;
8388 wbp_table_init (&t);
8389
8390 for (i = 0; i < 0x110000; i++)
8391 {
8392 int attr = get_wbp (i);
8393
8394
8395 assert (attr != 0 && (attr & (attr - 1)) == 0);
8396
8397 if (attr != 1 << WBP_OTHER)
8398 {
8399 unsigned int log2_attr;
8400 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
8401
8402 wbp_table_add (&t, i, log2_attr);
8403 }
8404 }
8405
8406 wbp_table_finalize (&t);
8407
8408 level1_offset =
8409 5 * sizeof (uint32_t);
8410 level2_offset =
8411 5 * sizeof (uint32_t)
8412 + t.level1_size * sizeof (uint32_t);
8413 level3_offset =
8414 5 * sizeof (uint32_t)
8415 + t.level1_size * sizeof (uint32_t)
8416 + (t.level2_size << t.q) * sizeof (uint32_t);
8417
8418 for (i = 0; i < 5; i++)
8419 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
8420 ((uint32_t *) t.result)[i]);
8421 fprintf (stream, "\n");
8422 fprintf (stream, "typedef struct\n");
8423 fprintf (stream, " {\n");
8424 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8425 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
8426 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
8427 fprintf (stream, " }\n");
8428 fprintf (stream, "wbrkprop_t;\n");
8429 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
8430 fprintf (stream, "{\n");
8431 fprintf (stream, " {");
8432 if (t.level1_size > 8)
8433 fprintf (stream, "\n ");
8434 for (i = 0; i < t.level1_size; i++)
8435 {
8436 uint32_t offset;
8437 if (i > 0 && (i % 8) == 0)
8438 fprintf (stream, "\n ");
8439 offset = ((uint32_t *) (t.result + level1_offset))[i];
8440 if (offset == 0)
8441 fprintf (stream, " %5d", -1);
8442 else
8443 fprintf (stream, " %5zu",
8444 (offset - level2_offset) / sizeof (uint32_t));
8445 if (i+1 < t.level1_size)
8446 fprintf (stream, ",");
8447 }
8448 if (t.level1_size > 8)
8449 fprintf (stream, "\n ");
8450 fprintf (stream, " },\n");
8451 fprintf (stream, " {");
8452 if (t.level2_size << t.q > 8)
8453 fprintf (stream, "\n ");
8454 for (i = 0; i < t.level2_size << t.q; i++)
8455 {
8456 uint32_t offset;
8457 if (i > 0 && (i % 8) == 0)
8458 fprintf (stream, "\n ");
8459 offset = ((uint32_t *) (t.result + level2_offset))[i];
8460 if (offset == 0)
8461 fprintf (stream, " %5d", -1);
8462 else
8463 fprintf (stream, " %5zu",
8464 (offset - level3_offset) / sizeof (unsigned char));
8465 if (i+1 < t.level2_size << t.q)
8466 fprintf (stream, ",");
8467 }
8468 if (t.level2_size << t.q > 8)
8469 fprintf (stream, "\n ");
8470 fprintf (stream, " },\n");
8471 fprintf (stream, " {");
8472 if (t.level3_size << t.p > 4)
8473 fprintf (stream, "\n ");
8474 for (i = 0; i < t.level3_size << t.p; i++)
8475 {
8476 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
8477 const char *value_string;
8478 switch (value)
8479 {
8480 #define CASE(x) case x: value_string = #x; break;
8481 CASE(WBP_OTHER);
8482 CASE(WBP_CR);
8483 CASE(WBP_LF);
8484 CASE(WBP_NEWLINE);
8485 CASE(WBP_EXTEND);
8486 CASE(WBP_FORMAT);
8487 CASE(WBP_KATAKANA);
8488 CASE(WBP_ALETTER);
8489 CASE(WBP_MIDNUMLET);
8490 CASE(WBP_MIDLETTER);
8491 CASE(WBP_MIDNUM);
8492 CASE(WBP_NUMERIC);
8493 CASE(WBP_EXTENDNUMLET);
8494 CASE(WBP_RI);
8495 CASE(WBP_DQ);
8496 CASE(WBP_SQ);
8497 CASE(WBP_HL);
8498 CASE(WBP_ZWJ);
8499 CASE(WBP_EB);
8500 CASE(WBP_EM);
8501 CASE(WBP_GAZ);
8502 CASE(WBP_EBG);
8503 #undef CASE
8504 default:
8505 abort ();
8506 }
8507 if (i > 0 && (i % 4) == 0)
8508 fprintf (stream, "\n ");
8509 fprintf (stream, " %s%s", value_string,
8510 (i+1 < t.level3_size << t.p ? "," : ""));
8511 }
8512 if (t.level3_size << t.p > 4)
8513 fprintf (stream, "\n ");
8514 fprintf (stream, " }\n");
8515 fprintf (stream, "};\n");
8516 }
8517
/* Creates (or truncates) FILENAME and writes the generated word break
   property table source into it: a "do not edit" banner mentioning the
   Unicode VERSION, the copyright line, the library license text and the
   output of output_wbp.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Word breaking properties of Unicode characters. */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", out);

  /* The comment opened here is continued by output_library_license.  */
  fputs ("/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_library_license (out, false);
  fputs ("\n", out);

  output_wbp (out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8549
8550
8551
8552
8553
8554
8555
/* Grapheme break property values.  The trailing comments give the name
   under which fill_org_gbp recognizes each value in the data file.  */
enum
{
  GBP_OTHER       =  0,
  GBP_CR          =  1,  /* CR */
  GBP_LF          =  2,  /* LF */
  GBP_CONTROL     =  3,  /* Control */
  GBP_EXTEND      =  4,  /* Extend */
  GBP_PREPEND     =  5,  /* Prepend */
  GBP_SPACINGMARK =  6,  /* SpacingMark */
  GBP_L           =  7,  /* L */
  GBP_V           =  8,  /* V */
  GBP_T           =  9,  /* T */
  GBP_LV          = 10,  /* LV */
  GBP_LVT         = 11,  /* LVT */
  GBP_RI          = 12,  /* Regional_Indicator */
  GBP_ZWJ         = 13,  /* ZWJ */
  GBP_EB          = 14,  /* E_Base */
  GBP_EM          = 15,  /* E_Modifier */
  GBP_GAZ         = 16,  /* Glue_After_Zwj */
  GBP_EBG         = 17   /* E_Base_GAZ */
};
8577
8578
/* Instantiate the table template of "3level.h" for the grapheme break
   property.  output_gbp_table uses the resulting 'struct gbp_table'
   together with gbp_table_init, gbp_table_add and gbp_table_finalize.
   Elements are single bytes and unset entries read as GBP_OTHER.
   xmalloc/xrealloc are mapped to plain malloc/realloc, i.e. without an
   out-of-memory check added here.
   NOTE(review): presumably 3level.h consumes TABLE, ELEMENT and DEFAULT;
   confirm against that header.  */
#define TABLE gbp_table
#define ELEMENT unsigned char
#define DEFAULT GBP_OTHER
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
8585
8586
/* Grapheme break property (a GBP_* value) of every code point, as read
   from the grapheme break property data file by fill_org_gbp.
   Code points not mentioned in that file hold GBP_OTHER.  */
int unicode_org_gbp[0x110000];
8588
8589
8590 static void
8591 output_gbp_test (const char *filename)
8592 {
8593 FILE *stream;
8594 bool need_comma;
8595 unsigned int ch;
8596
8597 stream = fopen (filename, "w");
8598 if (stream == NULL)
8599 {
8600 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8601 exit (1);
8602 }
8603
8604 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8605 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
8606 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
8607 fprintf (stream, "\n");
8608 output_tests_license (stream);
8609 fprintf (stream, "\n");
8610
8611 need_comma = false;
8612 for (ch = 0; ch < 0x110000; ch++)
8613 {
8614 int gbp = unicode_org_gbp[ch];
8615 const char *gbp_string;
8616
8617 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
8618 ch++;
8619
8620 switch (gbp)
8621 {
8622 #define CASE(x) case x: gbp_string = #x; break;
8623 CASE (GBP_OTHER)
8624 CASE (GBP_CR)
8625 CASE (GBP_LF)
8626 CASE (GBP_CONTROL)
8627 CASE (GBP_EXTEND)
8628 CASE (GBP_PREPEND)
8629 CASE (GBP_SPACINGMARK)
8630 CASE (GBP_L)
8631 CASE (GBP_V)
8632 CASE (GBP_T)
8633 CASE (GBP_LV)
8634 CASE (GBP_LVT)
8635 CASE (GBP_RI)
8636 CASE (GBP_ZWJ)
8637 CASE (GBP_EB)
8638 CASE (GBP_EM)
8639 CASE (GBP_GAZ)
8640 CASE (GBP_EBG)
8641 #undef CASE
8642 default:
8643 abort ();
8644 }
8645
8646 if (need_comma)
8647 fprintf (stream, ",\n");
8648 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
8649
8650 need_comma = true;
8651 }
8652 fprintf (stream, "\n");
8653
8654 if (ferror (stream) || fclose (stream))
8655 {
8656 fprintf (stderr, "error writing to '%s'\n", filename);
8657 exit (1);
8658 }
8659 }
8660
8661
8662 static void
8663 output_gbp_table (const char *filename, const char *version)
8664 {
8665 FILE *stream;
8666 unsigned int ch, i;
8667 struct gbp_table t;
8668 unsigned int level1_offset, level2_offset, level3_offset;
8669
8670 stream = fopen (filename, "w");
8671 if (stream == NULL)
8672 {
8673 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8674 exit (1);
8675 }
8676
8677 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8678 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
8679 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8680 version);
8681 fprintf (stream, "\n");
8682
8683 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
8684 fprintf (stream, "\n");
8685 output_library_license (stream, false);
8686 fprintf (stream, "\n");
8687
8688 t.p = 7;
8689 t.q = 9;
8690 gbp_table_init (&t);
8691
8692 for (ch = 0; ch < 0x110000; ch++)
8693 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
8694
8695 gbp_table_finalize (&t);
8696
8697
8698 level1_offset =
8699 5 * sizeof (uint32_t);
8700 level2_offset =
8701 5 * sizeof (uint32_t)
8702 + t.level1_size * sizeof (uint32_t);
8703 level3_offset =
8704 5 * sizeof (uint32_t)
8705 + t.level1_size * sizeof (uint32_t)
8706 + (t.level2_size << t.q) * sizeof (uint32_t);
8707
8708 for (i = 0; i < 5; i++)
8709 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
8710 ((uint32_t *) t.result)[i]);
8711 fprintf (stream, "static const\n");
8712 fprintf (stream, "struct\n");
8713 fprintf (stream, " {\n");
8714 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8715 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
8716 fprintf (stream, " unsigned char level3[%zu << %d];\n",
8717 t.level3_size, t.p);
8718 fprintf (stream, " }\n");
8719 fprintf (stream, "unigbrkprop =\n");
8720 fprintf (stream, "{\n");
8721 fprintf (stream, " {");
8722 if (t.level1_size > 8)
8723 fprintf (stream, "\n ");
8724 for (i = 0; i < t.level1_size; i++)
8725 {
8726 uint32_t offset;
8727 if (i > 0 && (i % 8) == 0)
8728 fprintf (stream, "\n ");
8729 offset = ((uint32_t *) (t.result + level1_offset))[i];
8730 if (offset == 0)
8731 fprintf (stream, " %5d", -1);
8732 else
8733 fprintf (stream, " %5zu",
8734 (offset - level2_offset) / sizeof (uint32_t));
8735 if (i+1 < t.level1_size)
8736 fprintf (stream, ",");
8737 }
8738 if (t.level1_size > 8)
8739 fprintf (stream, "\n ");
8740 fprintf (stream, " },\n");
8741 fprintf (stream, " {");
8742 if (t.level2_size << t.q > 8)
8743 fprintf (stream, "\n ");
8744 for (i = 0; i < t.level2_size << t.q; i++)
8745 {
8746 uint32_t offset;
8747 if (i > 0 && (i % 8) == 0)
8748 fprintf (stream, "\n ");
8749 offset = ((uint32_t *) (t.result + level2_offset))[i];
8750 if (offset == 0)
8751 fprintf (stream, " %5d", -1);
8752 else
8753 fprintf (stream, " %5zu",
8754 (offset - level3_offset) / sizeof (uint8_t));
8755 if (i+1 < t.level2_size << t.q)
8756 fprintf (stream, ",");
8757 }
8758 if (t.level2_size << t.q > 8)
8759 fprintf (stream, "\n ");
8760 fprintf (stream, " },\n");
8761 fprintf (stream, " {");
8762 if (t.level3_size << t.p > 4)
8763 fprintf (stream, "\n ");
8764 for (i = 0; i < t.level3_size << t.p; i++)
8765 {
8766 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
8767 const char *value_string;
8768 switch (value)
8769 {
8770 #define CASE(x) case x: value_string = #x; break;
8771 CASE (GBP_OTHER)
8772 CASE (GBP_CR)
8773 CASE (GBP_LF)
8774 CASE (GBP_CONTROL)
8775 CASE (GBP_EXTEND)
8776 CASE (GBP_PREPEND)
8777 CASE (GBP_SPACINGMARK)
8778 CASE (GBP_L)
8779 CASE (GBP_V)
8780 CASE (GBP_T)
8781 CASE (GBP_LV)
8782 CASE (GBP_LVT)
8783 CASE (GBP_RI)
8784 CASE (GBP_ZWJ)
8785 CASE (GBP_EB)
8786 CASE (GBP_EM)
8787 CASE (GBP_GAZ)
8788 CASE (GBP_EBG)
8789 #undef CASE
8790 default:
8791 abort ();
8792 }
8793 if (i > 0 && (i % 4) == 0)
8794 fprintf (stream, "\n ");
8795 fprintf (stream, " %s%s", value_string,
8796 (i+1 < t.level3_size << t.p ? "," : ""));
8797 }
8798 if (t.level3_size << t.p > 4)
8799 fprintf (stream, "\n ");
8800 fprintf (stream, " }\n");
8801 fprintf (stream, "};\n");
8802
8803 if (ferror (stream) || fclose (stream))
8804 {
8805 fprintf (stderr, "error writing to '%s'\n", filename);
8806 exit (1);
8807 }
8808 }
8809
8810
8811
8812 static void
8813 fill_org_gbp (const char *graphemebreakproperty_filename)
8814 {
8815 unsigned int i;
8816 FILE *stream;
8817 int lineno = 0;
8818
8819 for (i = 0; i < 0x110000; i++)
8820 unicode_org_gbp[i] = GBP_OTHER;
8821
8822 stream = fopen (graphemebreakproperty_filename, "r");
8823 if (stream == NULL)
8824 {
8825 fprintf (stderr, "error during fopen of '%s'\n",
8826 graphemebreakproperty_filename);
8827 exit (1);
8828 }
8829
8830 for (;;)
8831 {
8832 char buf[200+1];
8833 unsigned int i1, i2;
8834 char padding[200+1];
8835 char propname[200+1];
8836 int propvalue;
8837
8838 lineno++;
8839 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8840 break;
8841
8842 if (buf[0] == '\0' || buf[0] == '#')
8843 continue;
8844
8845 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8846 {
8847 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8848 {
8849 fprintf (stderr, "parse error in '%s'\n",
8850 graphemebreakproperty_filename);
8851 exit (1);
8852 }
8853 i2 = i1;
8854 }
8855 #define PROP(name,value) \
8856 if (strcmp (propname, name) == 0) propvalue = value; else
8857 PROP ("CR", GBP_CR)
8858 PROP ("LF", GBP_LF)
8859 PROP ("Control", GBP_CONTROL)
8860 PROP ("Extend", GBP_EXTEND)
8861 PROP ("Prepend", GBP_PREPEND)
8862 PROP ("SpacingMark", GBP_SPACINGMARK)
8863 PROP ("L", GBP_L)
8864 PROP ("V", GBP_V)
8865 PROP ("T", GBP_T)
8866 PROP ("LV", GBP_LV)
8867 PROP ("LVT", GBP_LVT)
8868 PROP ("Regional_Indicator", GBP_RI)
8869 PROP ("ZWJ", GBP_ZWJ)
8870 PROP ("E_Base", GBP_EB)
8871 PROP ("E_Modifier", GBP_EM)
8872 PROP ("Glue_After_Zwj", GBP_GAZ)
8873 PROP ("E_Base_GAZ", GBP_EBG)
8874 #undef PROP
8875 {
8876 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
8877 graphemebreakproperty_filename, lineno);
8878 exit (1);
8879 }
8880 assert (i1 <= i2 && i2 < 0x110000);
8881
8882 for (i = i1; i <= i2; i++)
8883 unicode_org_gbp[i] = propvalue;
8884 }
8885
8886 if (ferror (stream) || fclose (stream))
8887 {
8888 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
8889 exit (1);
8890 }
8891 }
8892
8893
8894
8895
8896
8897
8898
8899
/* Capacity of the code point arrays that receive a decomposition.
   get_decomposition stops parsing after this many code points and asserts
   that the whole decomposition string was consumed, so this value has to be
   at least the length of the longest decomposition in the input data.  */
#define MAX_DECOMP_LENGTH 18
8901
/* Decomposition types, as returned by get_decomposition.  The trailing
   comments give the tag that selects each type in the decomposition field;
   a field without a tag is canonical.  output_decomposition stores the type
   in the bits above bit 18 of the first 3-byte entry.  */
enum
{
  UC_DECOMP_CANONICAL =  0,
  UC_DECOMP_FONT      =  1,  /* <font> */
  UC_DECOMP_NOBREAK   =  2,  /* <noBreak> */
  UC_DECOMP_INITIAL   =  3,  /* <initial> */
  UC_DECOMP_MEDIAL    =  4,  /* <medial> */
  UC_DECOMP_FINAL     =  5,  /* <final> */
  UC_DECOMP_ISOLATED  =  6,  /* <isolated> */
  UC_DECOMP_CIRCLE    =  7,  /* <circle> */
  UC_DECOMP_SUPER     =  8,  /* <super> */
  UC_DECOMP_SUB       =  9,  /* <sub> */
  UC_DECOMP_VERTICAL  = 10,  /* <vertical> */
  UC_DECOMP_WIDE      = 11,  /* <wide> */
  UC_DECOMP_NARROW    = 12,  /* <narrow> */
  UC_DECOMP_SMALL     = 13,  /* <small> */
  UC_DECOMP_SQUARE    = 14,  /* <square> */
  UC_DECOMP_FRACTION  = 15,  /* <fraction> */
  UC_DECOMP_COMPAT    = 16   /* <compat> */
};
8922
8923
8924
8925 static int
8926 get_decomposition (unsigned int ch,
8927 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
8928 {
8929 const char *decomposition = unicode_attributes[ch].decomposition;
8930
8931 if (decomposition != NULL && decomposition[0] != '\0')
8932 {
8933 int type = UC_DECOMP_CANONICAL;
8934 unsigned int length;
8935 char *endptr;
8936
8937 if (decomposition[0] == '<')
8938 {
8939 const char *rangle;
8940 size_t typelen;
8941
8942 rangle = strchr (decomposition + 1, '>');
8943 assert (rangle != NULL);
8944 typelen = rangle + 1 - decomposition;
8945 #define TYPE(t1,t2) \
8946 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
8947 type = t2; \
8948 else
8949 TYPE ("<font>", UC_DECOMP_FONT)
8950 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
8951 TYPE ("<initial>", UC_DECOMP_INITIAL)
8952 TYPE ("<medial>", UC_DECOMP_MEDIAL)
8953 TYPE ("<final>", UC_DECOMP_FINAL)
8954 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
8955 TYPE ("<circle>", UC_DECOMP_CIRCLE)
8956 TYPE ("<super>", UC_DECOMP_SUPER)
8957 TYPE ("<sub>", UC_DECOMP_SUB)
8958 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
8959 TYPE ("<wide>", UC_DECOMP_WIDE)
8960 TYPE ("<narrow>", UC_DECOMP_NARROW)
8961 TYPE ("<small>", UC_DECOMP_SMALL)
8962 TYPE ("<square>", UC_DECOMP_SQUARE)
8963 TYPE ("<fraction>", UC_DECOMP_FRACTION)
8964 TYPE ("<compat>", UC_DECOMP_COMPAT)
8965 {
8966 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
8967 exit (1);
8968 }
8969 #undef TYPE
8970 decomposition = rangle + 1;
8971 if (decomposition[0] == ' ')
8972 decomposition++;
8973 }
8974 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
8975 {
8976 decomposed[length] = strtoul (decomposition, &endptr, 16);
8977 if (endptr == decomposition)
8978 break;
8979 decomposition = endptr;
8980 if (decomposition[0] == ' ')
8981 decomposition++;
8982 }
8983
8984
8985 assert (*decomposition == '\0');
8986
8987 *lengthp = length;
8988 return type;
8989 }
8990 else
8991 return -1;
8992 }
8993
8994
/* Instantiate the table template of "3level.h" for the decomposition
   index.  output_decomposition uses the resulting 'struct decomp_table'
   together with decomp_table_init, decomp_table_add and
   decomp_table_finalize.
   Elements are 16-bit and unset entries read as (uint16_t)(-1), which
   output_decomposition prints as -1.
   xmalloc/xrealloc are mapped to plain malloc/realloc, i.e. without an
   out-of-memory check added here.
   NOTE(review): presumably 3level.h consumes TABLE, ELEMENT and DEFAULT;
   confirm against that header.  */
#define TABLE decomp_table
#define ELEMENT uint16_t
#define DEFAULT (uint16_t)(-1)
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
9001
9002 static void
9003 output_decomposition (FILE *stream1, FILE *stream2)
9004 {
9005 struct decomp_table t;
9006 unsigned int level1_offset, level2_offset, level3_offset;
9007 unsigned int offset;
9008 unsigned int ch;
9009 unsigned int i;
9010
9011 t.p = 5;
9012 t.q = 5;
9013 decomp_table_init (&t);
9014
9015 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
9016 fprintf (stream1, "\n");
9017 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
9018 offset = 0;
9019
9020 for (ch = 0; ch < 0x110000; ch++)
9021 {
9022 unsigned int length;
9023 unsigned int decomposed[MAX_DECOMP_LENGTH];
9024 int type = get_decomposition (ch, &length, decomposed);
9025
9026 if (type >= 0)
9027 {
9028 assert (offset < (1 << 15));
9029 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
9030
9031
9032
9033 assert (length != 0);
9034 for (i = 0; i < length; i++)
9035 {
9036 if (offset > 0)
9037 fprintf (stream2, ",");
9038 if ((offset % 4) == 0)
9039 fprintf (stream2, "\n ");
9040 assert (decomposed[i] < (1 << 18));
9041 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
9042 (((i+1 < length ? (1 << 23) : 0)
9043 | (i == 0 ? (type << 18) : 0)
9044 | decomposed[i]) >> 16) & 0xff,
9045 (decomposed[i] >> 8) & 0xff,
9046 decomposed[i] & 0xff);
9047 offset++;
9048 }
9049 }
9050 }
9051
9052 fprintf (stream2, "\n};\n");
9053 fprintf (stream2, "\n");
9054
9055 decomp_table_finalize (&t);
9056
9057 level1_offset =
9058 5 * sizeof (uint32_t);
9059 level2_offset =
9060 5 * sizeof (uint32_t)
9061 + t.level1_size * sizeof (uint32_t);
9062 level3_offset =
9063 5 * sizeof (uint32_t)
9064 + t.level1_size * sizeof (uint32_t)
9065 + (t.level2_size << t.q) * sizeof (uint32_t);
9066
9067 for (i = 0; i < 5; i++)
9068 fprintf (stream1, "#define decomp_header_%d %d\n", i,
9069 ((uint32_t *) t.result)[i]);
9070 fprintf (stream1, "\n");
9071 fprintf (stream1, "typedef struct\n");
9072 fprintf (stream1, " {\n");
9073 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
9074 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
9075 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
9076 fprintf (stream1, " }\n");
9077 fprintf (stream1, "decomp_index_table_t;\n");
9078 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
9079 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
9080 fprintf (stream2, "{\n");
9081 fprintf (stream2, " {");
9082 if (t.level1_size > 8)
9083 fprintf (stream2, "\n ");
9084 for (i = 0; i < t.level1_size; i++)
9085 {
9086 uint32_t offset;
9087 if (i > 0 && (i % 8) == 0)
9088 fprintf (stream2, "\n ");
9089 offset = ((uint32_t *) (t.result + level1_offset))[i];
9090 if (offset == 0)
9091 fprintf (stream2, " %5d", -1);
9092 else
9093 fprintf (stream2, " %5zu",
9094 (offset - level2_offset) / sizeof (uint32_t));
9095 if (i+1 < t.level1_size)
9096 fprintf (stream2, ",");
9097 }
9098 if (t.level1_size > 8)
9099 fprintf (stream2, "\n ");
9100 fprintf (stream2, " },\n");
9101 fprintf (stream2, " {");
9102 if (t.level2_size << t.q > 8)
9103 fprintf (stream2, "\n ");
9104 for (i = 0; i < t.level2_size << t.q; i++)
9105 {
9106 uint32_t offset;
9107 if (i > 0 && (i % 8) == 0)
9108 fprintf (stream2, "\n ");
9109 offset = ((uint32_t *) (t.result + level2_offset))[i];
9110 if (offset == 0)
9111 fprintf (stream2, " %5d", -1);
9112 else
9113 fprintf (stream2, " %5zu",
9114 (offset - level3_offset) / sizeof (uint16_t));
9115 if (i+1 < t.level2_size << t.q)
9116 fprintf (stream2, ",");
9117 }
9118 if (t.level2_size << t.q > 8)
9119 fprintf (stream2, "\n ");
9120 fprintf (stream2, " },\n");
9121 fprintf (stream2, " {");
9122 if (t.level3_size << t.p > 8)
9123 fprintf (stream2, "\n ");
9124 for (i = 0; i < t.level3_size << t.p; i++)
9125 {
9126 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
9127 if (i > 0 && (i % 8) == 0)
9128 fprintf (stream2, "\n ");
9129 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
9130 if (i+1 < t.level3_size << t.p)
9131 fprintf (stream2, ",");
9132 }
9133 if (t.level3_size << t.p > 8)
9134 fprintf (stream2, "\n ");
9135 fprintf (stream2, " }\n");
9136 fprintf (stream2, "};\n");
9137 }
9138
/* Creates (or truncates) FILENAME1 and FILENAME2 and writes the
   decomposition tables into them: both files get the same banner
   (mentioning the Unicode VERSION) and license header; then
   output_decomposition writes the declarations to FILENAME1 and the
   definitions to FILENAME2.
   Exits with status 1 if a file cannot be opened or written.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2];
  FILE *streams[2];
  size_t k;

  filenames[0] = filename1;
  filenames[1] = filename2;

  /* Open both files before anything is written.  */
  for (k = 0; k < 2; k++)
    {
      streams[k] = fopen (filenames[k], "w");
      if (streams[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
    }

  for (k = 0; k < 2; k++)
    {
      FILE *out = streams[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Decomposition of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);

      /* The comment opened here is continued by output_library_license.  */
      fputs ("/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n", out);
      fputs ("\n", out);
      output_library_license (out, true);
      fputs ("\n", out);
    }

  output_decomposition (streams[0], streams[1]);

  for (k = 0; k < 2; k++)
    if (ferror (streams[k]) || fclose (streams[k]))
      {
        fprintf (stderr, "error writing to '%s'\n", filenames[k]);
        exit (1);
      }
}
9186
9187
/* Flag per code point: 1 if the code point is listed in the composition
   exclusions file read by fill_composition_exclusions, 0 otherwise.  */
char unicode_composition_exclusions[0x110000];
9189
9190 static void
9191 fill_composition_exclusions (const char *compositionexclusions_filename)
9192 {
9193 FILE *stream;
9194 unsigned int i;
9195
9196 stream = fopen (compositionexclusions_filename, "r");
9197 if (stream == NULL)
9198 {
9199 fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
9200 exit (1);
9201 }
9202
9203 for (i = 0; i < 0x110000; i++)
9204 unicode_composition_exclusions[i] = 0;
9205
9206 for (;;)
9207 {
9208 char buf[200+1];
9209 unsigned int i;
9210
9211 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9212 break;
9213
9214 if (buf[0] == '\0' || buf[0] == '#')
9215 continue;
9216
9217 if (sscanf (buf, "%X", &i) != 1)
9218 {
9219 fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
9220 exit (1);
9221 }
9222 assert (i < 0x110000);
9223
9224 unicode_composition_exclusions[i] = 1;
9225 }
9226
9227 if (ferror (stream) || fclose (stream))
9228 {
9229 fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
9230 exit (1);
9231 }
9232 }
9233
9234 static void
9235 debug_output_composition_tables (const char *filename)
9236 {
9237 FILE *stream;
9238 unsigned int ch;
9239
9240 stream = fopen (filename, "w");
9241 if (stream == NULL)
9242 {
9243 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9244 exit (1);
9245 }
9246
9247 for (ch = 0; ch < 0x110000; ch++)
9248 {
9249 unsigned int length;
9250 unsigned int decomposed[MAX_DECOMP_LENGTH];
9251 int type = get_decomposition (ch, &length, decomposed);
9252
9253 if (type == UC_DECOMP_CANONICAL
9254
9255
9256 && length == 2)
9257 {
9258 unsigned int code1 = decomposed[0];
9259 unsigned int code2 = decomposed[1];
9260 unsigned int combined = ch;
9261
9262
9263
9264 if (strcmp (unicode_attributes[code1].combining, "0") == 0
9265
9266 && !unicode_composition_exclusions[combined])
9267 {
9268
9269
9270 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
9271
9272 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
9273 code1,
9274 code2,
9275 combined,
9276 unicode_attributes[code2].combining);
9277 }
9278 }
9279 }
9280
9281 if (ferror (stream) || fclose (stream))
9282 {
9283 fprintf (stderr, "error writing to '%s'\n", filename);
9284 exit (1);
9285 }
9286 }
9287
9288 static void
9289 output_composition_tables (const char *filename, const char *version)
9290 {
9291 FILE *stream;
9292 unsigned int ch;
9293
9294 stream = fopen (filename, "w");
9295 if (stream == NULL)
9296 {
9297 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9298 exit (1);
9299 }
9300
9301 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9302 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
9303 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9304 version);
9305 fprintf (stream, "\n");
9306
9307 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
9308 fprintf (stream, "\n");
9309 output_library_license (stream, true);
9310 fprintf (stream, "\n");
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
9332 fprintf (stream, "%%struct-type\n");
9333 fprintf (stream, "%%language=ANSI-C\n");
9334 fprintf (stream, "%%define slot-name codes\n");
9335 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
9336 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
9337 fprintf (stream, "%%compare-lengths\n");
9338 fprintf (stream, "%%compare-strncmp\n");
9339 fprintf (stream, "%%readonly-tables\n");
9340 fprintf (stream, "%%omit-struct-type\n");
9341 fprintf (stream, "%%%%\n");
9342
9343 for (ch = 0; ch < 0x110000; ch++)
9344 {
9345 unsigned int length;
9346 unsigned int decomposed[MAX_DECOMP_LENGTH];
9347 int type = get_decomposition (ch, &length, decomposed);
9348
9349 if (type == UC_DECOMP_CANONICAL
9350
9351
9352 && length == 2)
9353 {
9354 unsigned int code1 = decomposed[0];
9355 unsigned int code2 = decomposed[1];
9356 unsigned int combined = ch;
9357
9358
9359
9360 if (strcmp (unicode_attributes[code1].combining, "0") == 0
9361
9362 && !unicode_composition_exclusions[combined])
9363 {
9364
9365
9366 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
9367
9368 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
9369 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
9370 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
9371 combined);
9372 }
9373 }
9374 }
9375
9376 if (ferror (stream) || fclose (stream))
9377 {
9378 fprintf (stderr, "error writing to '%s'\n", filename);
9379 exit (1);
9380 }
9381 }
9382
9383
9384
9385
9386
/* Writes to FILENAME a generated C test file for the mapping function
   named FUNCTION_NAME, for Unicode version VERSION.
   The file includes "test-mapping-part1.h", then lists a "{ ch, FUNC (ch) }"
   pair for every code point below 0x110000 that FUNC does not map to
   itself (pairs are separated by ",\n"), then defines MAP(c) as a call to
   FUNCTION_NAME and includes "test-mapping-part2.h".
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *out = fopen (filename, "w");
  unsigned int ch;
  /* Number of pairs written so far; decides whether a separator or a
     final newline is needed.  */
  unsigned int emitted = 0;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (out, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (out, "/* Test the Unicode character mapping functions.\n");
  fprintf (out, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
  fprintf (out, "\n");
  output_tests_license (out);
  fprintf (out, "\n");
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (out, "\n");
  fprintf (out, "#include \"test-mapping-part1.h\"\n");
  fprintf (out, "\n");

  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int mapped = func (ch);

      /* Identity mappings are not listed.  */
      if (mapped == ch)
        continue;

      if (emitted > 0)
        fprintf (out, ",\n");
      fprintf (out, " { 0x%04X, 0x%04X }", ch, mapped);
      emitted++;
    }
  if (emitted > 0)
    fprintf (out, "\n");

  fprintf (out, "\n");
  fprintf (out, "#define MAP(c) %s (c)\n", function_name);
  fprintf (out, "#include \"test-mapping-part2.h\"\n");

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
9442
9443
/* Parameterize and include the generic code from "3level.h".
   The code that follows uses `struct mapping_table' and the functions
   mapping_table_init, mapping_table_add and mapping_table_finalize;
   presumably "3level.h" derives these names from TABLE and stores values
   of type ELEMENT -- confirm against "3level.h".
   xmalloc and xrealloc are mapped to plain malloc and realloc here, so
   these names add no allocation failure check of their own.  */
#define TABLE mapping_table
#define ELEMENT int32_t
#define DEFAULT 0
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
9450
9451
9452
9453 static void
9454 output_simple_mapping (const char *filename,
9455 unsigned int (*func) (unsigned int),
9456 const char *version)
9457 {
9458 FILE *stream;
9459 unsigned int ch, i;
9460 struct mapping_table t;
9461 unsigned int level1_offset, level2_offset, level3_offset;
9462
9463 stream = fopen (filename, "w");
9464 if (stream == NULL)
9465 {
9466 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9467 exit (1);
9468 }
9469
9470 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9471 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
9472 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9473 version);
9474 fprintf (stream, "\n");
9475
9476 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
9477 fprintf (stream, "\n");
9478 output_library_license (stream, false);
9479 fprintf (stream, "\n");
9480
9481 t.p = 7;
9482 t.q = 9;
9483 mapping_table_init (&t);
9484
9485 for (ch = 0; ch < 0x110000; ch++)
9486 {
9487 int value = (int) func (ch) - (int) ch;
9488
9489 mapping_table_add (&t, ch, value);
9490 }
9491
9492 mapping_table_finalize (&t);
9493
9494
9495 level1_offset =
9496 5 * sizeof (uint32_t);
9497 level2_offset =
9498 5 * sizeof (uint32_t)
9499 + t.level1_size * sizeof (uint32_t);
9500 level3_offset =
9501 5 * sizeof (uint32_t)
9502 + t.level1_size * sizeof (uint32_t)
9503 + (t.level2_size << t.q) * sizeof (uint32_t);
9504
9505 for (i = 0; i < 5; i++)
9506 fprintf (stream, "#define mapping_header_%d %d\n", i,
9507 ((uint32_t *) t.result)[i]);
9508 fprintf (stream, "static const\n");
9509 fprintf (stream, "struct\n");
9510 fprintf (stream, " {\n");
9511 fprintf (stream, " int level1[%zu];\n", t.level1_size);
9512 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
9513 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
9514 fprintf (stream, " }\n");
9515 fprintf (stream, "u_mapping =\n");
9516 fprintf (stream, "{\n");
9517 fprintf (stream, " {");
9518 if (t.level1_size > 8)
9519 fprintf (stream, "\n ");
9520 for (i = 0; i < t.level1_size; i++)
9521 {
9522 uint32_t offset;
9523 if (i > 0 && (i % 8) == 0)
9524 fprintf (stream, "\n ");
9525 offset = ((uint32_t *) (t.result + level1_offset))[i];
9526 if (offset == 0)
9527 fprintf (stream, " %5d", -1);
9528 else
9529 fprintf (stream, " %5zu",
9530 (offset - level2_offset) / sizeof (uint32_t));
9531 if (i+1 < t.level1_size)
9532 fprintf (stream, ",");
9533 }
9534 if (t.level1_size > 8)
9535 fprintf (stream, "\n ");
9536 fprintf (stream, " },\n");
9537 fprintf (stream, " {");
9538 if (t.level2_size << t.q > 8)
9539 fprintf (stream, "\n ");
9540 for (i = 0; i < t.level2_size << t.q; i++)
9541 {
9542 uint32_t offset;
9543 if (i > 0 && (i % 8) == 0)
9544 fprintf (stream, "\n ");
9545 offset = ((uint32_t *) (t.result + level2_offset))[i];
9546 if (offset == 0)
9547 fprintf (stream, " %5d", -1);
9548 else
9549 fprintf (stream, " %5zu",
9550 (offset - level3_offset) / sizeof (int32_t));
9551 if (i+1 < t.level2_size << t.q)
9552 fprintf (stream, ",");
9553 }
9554 if (t.level2_size << t.q > 8)
9555 fprintf (stream, "\n ");
9556 fprintf (stream, " },\n");
9557 fprintf (stream, " {");
9558 if (t.level3_size << t.p > 8)
9559 fprintf (stream, "\n ");
9560 for (i = 0; i < t.level3_size << t.p; i++)
9561 {
9562 if (i > 0 && (i % 8) == 0)
9563 fprintf (stream, "\n ");
9564 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
9565 if (i+1 < t.level3_size << t.p)
9566 fprintf (stream, ",");
9567 }
9568 if (t.level3_size << t.p > 8)
9569 fprintf (stream, "\n ");
9570 fprintf (stream, " }\n");
9571 fprintf (stream, "};\n");
9572
9573 if (ferror (stream) || fclose (stream))
9574 {
9575 fprintf (stderr, "error writing to '%s'\n", filename);
9576 exit (1);
9577 }
9578 }
9579
9580
9581
9582
9583
/* Context conditions of a special casing rule.
   fill_casing_rules maps the condition words "Final_Sigma",
   "After_Soft_Dotted", "More_Above", "Before_Dot" and "After_I" to these
   values; a "Not_" prefix is represented by storing the negated value.
   SCC_ALWAYS is used when a rule has no condition.  */
enum
{
  SCC_ALWAYS = 0,
  SCC_FINAL_SIGMA,
  SCC_AFTER_SOFT_DOTTED,
  SCC_MORE_ABOVE,
  SCC_BEFORE_DOT,
  SCC_AFTER_I
};
9593
9594
/* One special casing rule.
   Each mapping holds up to 3 code points; unused trailing slots are 0.  */
struct special_casing_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  /* Filled in by redistribute_casefolding_rules, not by the parser.  */
  unsigned int casefold_mapping[3];
  /* NULL, or a 2-character language code.  */
  const char *language;
  /* An SCC_* value, or the negation of one for a "Not_" condition.  */
  int context;
};
9605
9606
/* Growable array of pointers to all special casing rules.
   num_casing_rules is the number of entries in use,
   allocated_casing_rules the number of entries allocated.
   Entries are appended through add_casing_rule.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;
9610
9611 static void
9612 add_casing_rule (struct special_casing_rule *new_rule)
9613 {
9614 if (num_casing_rules == allocated_casing_rules)
9615 {
9616 allocated_casing_rules = 2 * allocated_casing_rules;
9617 if (allocated_casing_rules < 16)
9618 allocated_casing_rules = 16;
9619 casing_rules =
9620 (struct special_casing_rule **)
9621 realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *));
9622 }
9623 casing_rules[num_casing_rules++] = new_rule;
9624 }
9625
9626
9627
9628 static void
9629 fill_casing_rules (const char *specialcasing_filename)
9630 {
9631 FILE *stream;
9632
9633 stream = fopen (specialcasing_filename, "r");
9634 if (stream == NULL)
9635 {
9636 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
9637 exit (1);
9638 }
9639
9640 casing_rules = NULL;
9641 num_casing_rules = 0;
9642 allocated_casing_rules = 0;
9643
9644 for (;;)
9645 {
9646 char buf[200+1];
9647 char *scanptr;
9648 char *endptr;
9649 int i;
9650
9651 unsigned int code;
9652 unsigned int lower_mapping[3];
9653 unsigned int title_mapping[3];
9654 unsigned int upper_mapping[3];
9655 char *language;
9656 int context;
9657
9658 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9659 break;
9660
9661 if (buf[0] == '\0' || buf[0] == '#')
9662 continue;
9663
9664
9665 scanptr = buf;
9666 code = strtoul (scanptr, &endptr, 16);
9667 if (endptr == scanptr)
9668 {
9669 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9670 exit (1);
9671 }
9672 scanptr = endptr;
9673 if (*scanptr != ';')
9674 {
9675 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9676 exit (1);
9677 }
9678 scanptr++;
9679
9680
9681 for (i = 0; i < 3; i++)
9682 lower_mapping[i] = 0;
9683 for (i = 0; i < 3; i++)
9684 {
9685 while (*scanptr == ' ')
9686 scanptr++;
9687 if (*scanptr == ';')
9688 break;
9689 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
9690 if (endptr == scanptr)
9691 {
9692 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9693 exit (1);
9694 }
9695 scanptr = endptr;
9696 }
9697 if (*scanptr != ';')
9698 {
9699 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9700 exit (1);
9701 }
9702 scanptr++;
9703
9704
9705 for (i = 0; i < 3; i++)
9706 title_mapping[i] = 0;
9707 for (i = 0; i < 3; i++)
9708 {
9709 while (*scanptr == ' ')
9710 scanptr++;
9711 if (*scanptr == ';')
9712 break;
9713 title_mapping[i] = strtoul (scanptr, &endptr, 16);
9714 if (endptr == scanptr)
9715 {
9716 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9717 exit (1);
9718 }
9719 scanptr = endptr;
9720 }
9721 if (*scanptr != ';')
9722 {
9723 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9724 exit (1);
9725 }
9726 scanptr++;
9727
9728
9729 for (i = 0; i < 3; i++)
9730 upper_mapping[i] = 0;
9731 for (i = 0; i < 3; i++)
9732 {
9733 while (*scanptr == ' ')
9734 scanptr++;
9735 if (*scanptr == ';')
9736 break;
9737 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
9738 if (endptr == scanptr)
9739 {
9740 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9741 exit (1);
9742 }
9743 scanptr = endptr;
9744 }
9745 if (*scanptr != ';')
9746 {
9747 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9748 exit (1);
9749 }
9750 scanptr++;
9751
9752
9753 language = NULL;
9754 context = SCC_ALWAYS;
9755 while (*scanptr == ' ')
9756 scanptr++;
9757 if (*scanptr != '\0' && *scanptr != '#')
9758 {
9759 const char *word_begin = scanptr;
9760 const char *word_end;
9761
9762 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9763 scanptr++;
9764 word_end = scanptr;
9765
9766 while (*scanptr == ' ')
9767 scanptr++;
9768
9769 if (word_end - word_begin == 2)
9770 {
9771 language = (char *) malloc ((word_end - word_begin) + 1);
9772 memcpy (language, word_begin, 2);
9773 language[word_end - word_begin] = '\0';
9774 word_begin = word_end = NULL;
9775
9776 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9777 {
9778 word_begin = scanptr;
9779 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9780 scanptr++;
9781 word_end = scanptr;
9782 }
9783 }
9784
9785 if (word_end > word_begin)
9786 {
9787 bool negate = false;
9788
9789 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
9790 {
9791 word_begin += 4;
9792 negate = true;
9793 }
9794 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
9795 context = SCC_FINAL_SIGMA;
9796 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
9797 context = SCC_AFTER_SOFT_DOTTED;
9798 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
9799 context = SCC_MORE_ABOVE;
9800 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
9801 context = SCC_BEFORE_DOT;
9802 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
9803 context = SCC_AFTER_I;
9804 else
9805 {
9806 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
9807 exit (1);
9808 }
9809 if (negate)
9810 context = - context;
9811 }
9812
9813 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
9814 {
9815 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9816 exit (1);
9817 }
9818 }
9819
9820
9821 {
9822 struct special_casing_rule *new_rule =
9823 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9824 new_rule->code = code;
9825 new_rule->language = language;
9826 new_rule->context = context;
9827 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
9828 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
9829 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
9830
9831 add_casing_rule (new_rule);
9832 }
9833 }
9834
9835 if (ferror (stream) || fclose (stream))
9836 {
9837 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
9838 exit (1);
9839 }
9840 }
9841
9842
/* One case folding rule, as stored by fill_casefolding_rules.  */
struct casefold_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* Up to 3 code points; unused trailing slots are 0.  */
  unsigned int mapping[3];
  /* NULL, or a language code ("tr" or "az" for rules of type 'T').  */
  const char *language;
};
9849
9850
/* Growable array of pointers to all case folding rules.
   num_casefolding_rules is the number of entries in use,
   allocated_casefolding_rules the number of entries allocated.
   Filled by fill_casefolding_rules.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;
9854
9855
9856
9857 static void
9858 fill_casefolding_rules (const char *casefolding_filename)
9859 {
9860 FILE *stream;
9861
9862 stream = fopen (casefolding_filename, "r");
9863 if (stream == NULL)
9864 {
9865 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
9866 exit (1);
9867 }
9868
9869 casefolding_rules = NULL;
9870 num_casefolding_rules = 0;
9871 allocated_casefolding_rules = 0;
9872
9873 for (;;)
9874 {
9875 char buf[200+1];
9876 char *scanptr;
9877 char *endptr;
9878 int i;
9879
9880 unsigned int code;
9881 char type;
9882 unsigned int mapping[3];
9883
9884 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9885 break;
9886
9887 if (buf[0] == '\0' || buf[0] == '#')
9888 continue;
9889
9890
9891 scanptr = buf;
9892 code = strtoul (scanptr, &endptr, 16);
9893 if (endptr == scanptr)
9894 {
9895 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9896 exit (1);
9897 }
9898 scanptr = endptr;
9899 if (*scanptr != ';')
9900 {
9901 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9902 exit (1);
9903 }
9904 scanptr++;
9905
9906
9907 while (*scanptr == ' ')
9908 scanptr++;
9909
9910 switch (*scanptr)
9911 {
9912 case 'C': case 'F': case 'S': case 'T':
9913 type = *scanptr;
9914 break;
9915 default:
9916 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9917 exit (1);
9918 }
9919 scanptr++;
9920 if (*scanptr != ';')
9921 {
9922 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9923 exit (1);
9924 }
9925 scanptr++;
9926
9927
9928 for (i = 0; i < 3; i++)
9929 mapping[i] = 0;
9930 for (i = 0; i < 3; i++)
9931 {
9932 while (*scanptr == ' ')
9933 scanptr++;
9934 if (*scanptr == ';')
9935 break;
9936 mapping[i] = strtoul (scanptr, &endptr, 16);
9937 if (endptr == scanptr)
9938 {
9939 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9940 exit (1);
9941 }
9942 scanptr = endptr;
9943 }
9944 if (*scanptr != ';')
9945 {
9946 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
9947 exit (1);
9948 }
9949 scanptr++;
9950
9951
9952 if (type != 'S')
9953 {
9954 const char * const *languages;
9955 unsigned int languages_count;
9956
9957
9958
9959 if (type == 'T')
9960 {
9961 static const char * const turkish_languages[] = { "tr", "az" };
9962 languages = turkish_languages;
9963 languages_count = 2;
9964 }
9965 else
9966 {
9967 static const char * const all_languages[] = { NULL };
9968 languages = all_languages;
9969 languages_count = 1;
9970 }
9971
9972 for (i = 0; i < languages_count; i++)
9973 {
9974
9975 struct casefold_rule *new_rule =
9976 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
9977 new_rule->code = code;
9978 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
9979 new_rule->language = languages[i];
9980
9981 if (num_casefolding_rules == allocated_casefolding_rules)
9982 {
9983 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
9984 if (allocated_casefolding_rules < 16)
9985 allocated_casefolding_rules = 16;
9986 casefolding_rules =
9987 (struct casefold_rule **)
9988 realloc (casefolding_rules,
9989 allocated_casefolding_rules * sizeof (struct casefold_rule *));
9990 }
9991 casefolding_rules[num_casefolding_rules++] = new_rule;
9992 }
9993 }
9994 }
9995
9996 if (ferror (stream) || fclose (stream))
9997 {
9998 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
9999 exit (1);
10000 }
10001 }
10002
10003
/* Simple case folding table, indexed by code point.
   Filled by redistribute_casefolding_rules.  */
unsigned int unicode_casefold[0x110000];

/* Returns the entry of unicode_casefold[] for CH.
   CH must be less than 0x110000, the size of the table.  */
static unsigned int
to_casefold (unsigned int ch)
{
  /* Guard the table lookup, like the other range checks in this file.  */
  assert (ch < 0x110000);
  return unicode_casefold[ch];
}
10011
10012
10013
10014
10015
10016 static void
10017 redistribute_casefolding_rules (void)
10018 {
10019 unsigned int ch, i, j;
10020
10021
10022 for (ch = 0; ch < 0x110000; ch++)
10023 unicode_casefold[ch] = ch;
10024 for (i = 0; i < num_casefolding_rules; i++)
10025 {
10026 struct casefold_rule *cfrule = casefolding_rules[i];
10027
10028 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
10029 {
10030 ch = cfrule->code;
10031 assert (ch < 0x110000);
10032 unicode_casefold[ch] = cfrule->mapping[0];
10033 }
10034 }
10035
10036
10037
10038 for (j = 0; j < num_casing_rules; j++)
10039 {
10040 struct special_casing_rule *rule = casing_rules[j];
10041 unsigned int k;
10042
10043 rule->casefold_mapping[0] = to_casefold (rule->code);
10044 for (k = 1; k < 3; k++)
10045 rule->casefold_mapping[k] = 0;
10046 }
10047
10048
10049 for (i = 0; i < num_casefolding_rules; i++)
10050 {
10051 struct casefold_rule *cfrule = casefolding_rules[i];
10052
10053 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
10054 {
10055
10056
10057
10058 struct special_casing_rule *found_rule = NULL;
10059
10060 for (j = 0; j < num_casing_rules; j++)
10061 {
10062 struct special_casing_rule *rule = casing_rules[j];
10063
10064 if (rule->code == cfrule->code
10065 && (cfrule->language == NULL
10066 || (rule->language != NULL
10067 && strcmp (rule->language, cfrule->language) == 0)))
10068 {
10069 memcpy (rule->casefold_mapping, cfrule->mapping,
10070 sizeof (rule->casefold_mapping));
10071
10072 if ((cfrule->language == NULL
10073 ? rule->language == NULL
10074 : rule->language != NULL
10075 && strcmp (rule->language, cfrule->language) == 0)
10076 && rule->context == SCC_ALWAYS)
10077 {
10078
10079 found_rule = rule;
10080 }
10081 }
10082 }
10083
10084 if (found_rule == NULL)
10085 {
10086
10087 struct special_casing_rule *new_rule =
10088 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
10089
10090
10091
10092 for (j = 0; j < num_casing_rules; j++)
10093 {
10094 struct special_casing_rule *rule = casing_rules[j];
10095
10096 if (rule->code == cfrule->code
10097 && rule->context == SCC_ALWAYS
10098 && rule->language == NULL)
10099 {
10100
10101 found_rule = rule;
10102 break;
10103 }
10104 }
10105
10106 new_rule->code = cfrule->code;
10107 new_rule->language = cfrule->language;
10108 new_rule->context = SCC_ALWAYS;
10109 if (found_rule != NULL)
10110 {
10111 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
10112 sizeof (new_rule->lower_mapping));
10113 memcpy (new_rule->title_mapping, found_rule->title_mapping,
10114 sizeof (new_rule->title_mapping));
10115 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
10116 sizeof (new_rule->upper_mapping));
10117 }
10118 else
10119 {
10120 unsigned int k;
10121
10122 new_rule->lower_mapping[0] = to_lower (cfrule->code);
10123 for (k = 1; k < 3; k++)
10124 new_rule->lower_mapping[k] = 0;
10125 new_rule->title_mapping[0] = to_title (cfrule->code);
10126 for (k = 1; k < 3; k++)
10127 new_rule->title_mapping[k] = 0;
10128 new_rule->upper_mapping[0] = to_upper (cfrule->code);
10129 for (k = 1; k < 3; k++)
10130 new_rule->upper_mapping[k] = 0;
10131 }
10132 memcpy (new_rule->casefold_mapping, cfrule->mapping,
10133 sizeof (new_rule->casefold_mapping));
10134
10135 add_casing_rule (new_rule);
10136 }
10137 }
10138 }
10139 }
10140
10141 static int
10142 compare_casing_rules (const void *a, const void *b)
10143 {
10144 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
10145 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
10146 unsigned int a_code = a_rule->code;
10147 unsigned int b_code = b_rule->code;
10148
10149 if (a_code < b_code)
10150 return -1;
10151 if (a_code > b_code)
10152 return 1;
10153
10154
10155 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
10156 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
10157 }
10158
10159 static void
10160 sort_casing_rules (void)
10161 {
10162
10163 if (num_casing_rules > 1)
10164 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
10165 compare_casing_rules);
10166 }
10167
10168
10169 static void
10170 output_casing_rules (const char *filename, const char *version)
10171 {
10172 FILE *stream;
10173 unsigned int i, j;
10174 unsigned int minor;
10175
10176 stream = fopen (filename, "w");
10177 if (stream == NULL)
10178 {
10179 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10180 exit (1);
10181 }
10182
10183 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10184 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
10185 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10186 version);
10187 fprintf (stream, "\n");
10188
10189 fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
10190 fprintf (stream, "\n");
10191 output_library_license (stream, false);
10192 fprintf (stream, "\n");
10193
10194 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
10195 fprintf (stream, "%%struct-type\n");
10196 fprintf (stream, "%%language=ANSI-C\n");
10197 fprintf (stream, "%%define slot-name code\n");
10198 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
10199 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
10200 fprintf (stream, "%%compare-lengths\n");
10201 fprintf (stream, "%%compare-strncmp\n");
10202 fprintf (stream, "%%readonly-tables\n");
10203 fprintf (stream, "%%omit-struct-type\n");
10204 fprintf (stream, "%%%%\n");
10205
10206 minor = 0;
10207 for (i = 0; i < num_casing_rules; i++)
10208 {
10209 struct special_casing_rule *rule = casing_rules[i];
10210 int context;
10211
10212 if (i > 0 && rule->code == casing_rules[i - 1]->code)
10213 minor += 1;
10214 else
10215 minor = 0;
10216
10217 if (!(rule->code < 0x10000))
10218 {
10219 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
10220 exit (1);
10221 }
10222
10223 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
10224 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
10225
10226 fprintf (stream, "%d, ",
10227 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
10228
10229 context = rule->context;
10230 if (context < 0)
10231 {
10232 fprintf (stream, "-");
10233 context = - context;
10234 }
10235 else
10236 fprintf (stream, " ");
10237 switch (context)
10238 {
10239 case SCC_ALWAYS:
10240 fprintf (stream, "SCC_ALWAYS ");
10241 break;
10242 case SCC_FINAL_SIGMA:
10243 fprintf (stream, "SCC_FINAL_SIGMA ");
10244 break;
10245 case SCC_AFTER_SOFT_DOTTED:
10246 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
10247 break;
10248 case SCC_MORE_ABOVE:
10249 fprintf (stream, "SCC_MORE_ABOVE ");
10250 break;
10251 case SCC_BEFORE_DOT:
10252 fprintf (stream, "SCC_BEFORE_DOT ");
10253 break;
10254 case SCC_AFTER_I:
10255 fprintf (stream, "SCC_AFTER_I ");
10256 break;
10257 default:
10258 abort ();
10259 }
10260 fprintf (stream, ", ");
10261
10262 if (rule->language != NULL)
10263 {
10264 assert (strlen (rule->language) == 2);
10265 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
10266 }
10267 else
10268 fprintf (stream, "{ '\\0', '\\0' }, ");
10269
10270 fprintf (stream, "{ ");
10271 for (j = 0; j < 3; j++)
10272 {
10273 if (j > 0)
10274 fprintf (stream, ", ");
10275 if (!(rule->upper_mapping[j] < 0x10000))
10276 {
10277 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
10278 exit (1);
10279 }
10280 if (rule->upper_mapping[j] != 0)
10281 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
10282 else
10283 fprintf (stream, " 0");
10284 }
10285 fprintf (stream, " }, { ");
10286 for (j = 0; j < 3; j++)
10287 {
10288 if (j > 0)
10289 fprintf (stream, ", ");
10290 if (!(rule->lower_mapping[j] < 0x10000))
10291 {
10292 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
10293 exit (1);
10294 }
10295 if (rule->lower_mapping[j] != 0)
10296 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
10297 else
10298 fprintf (stream, " 0");
10299 }
10300 fprintf (stream, " }, { ");
10301 for (j = 0; j < 3; j++)
10302 {
10303 if (j > 0)
10304 fprintf (stream, ", ");
10305 if (!(rule->title_mapping[j] < 0x10000))
10306 {
10307 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
10308 exit (1);
10309 }
10310 if (rule->title_mapping[j] != 0)
10311 fprintf (stream, "0x%04X", rule->title_mapping[j]);
10312 else
10313 fprintf (stream, " 0");
10314 }
10315 fprintf (stream, " }, { ");
10316 for (j = 0; j < 3; j++)
10317 {
10318 if (j > 0)
10319 fprintf (stream, ", ");
10320 if (!(rule->casefold_mapping[j] < 0x10000))
10321 {
10322 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
10323 exit (1);
10324 }
10325 if (rule->casefold_mapping[j] != 0)
10326 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
10327 else
10328 fprintf (stream, " 0");
10329 }
10330 fprintf (stream, " }\n");
10331 }
10332
10333 if (ferror (stream) || fclose (stream))
10334 {
10335 fprintf (stderr, "error writing to '%s'\n", filename);
10336 exit (1);
10337 }
10338 }
10339
10340
10341
10342
10343
10344
10345
/* Returns true if CH has the lowercase property, the uppercase property,
   or the general category Lt; false otherwise.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  return is_category_Lt (ch);
}
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369 static bool
10370 is_case_ignorable (unsigned int ch)
10371 {
10372 return (unicode_org_wbp[ch] == WBP_MIDLETTER
10373 || unicode_org_wbp[ch] == WBP_MIDNUMLET
10374 || is_category_Mn (ch)
10375 || is_category_Me (ch)
10376 || is_category_Cf (ch)
10377 || is_category_Lm (ch)
10378 || is_category_Sk (ch))
10379 && !is_cased (ch);
10380 }
10381
10382
10383
10384
10385 static void
10386 output_casing_properties (const char *version)
10387 {
10388 #define PROPERTY(FN,P) \
10389 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
10390 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
10391 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
10392 PROPERTY(cased, cased)
10393 PROPERTY(ignorable, case_ignorable)
10394 #undef PROPERTY
10395 }
10396
10397
10398
10399 int
10400 main (int argc, char * argv[])
10401 {
10402 const char *unicodedata_filename;
10403 const char *proplist_filename;
10404 const char *derivedproplist_filename;
10405 const char *arabicshaping_filename;
10406 const char *scripts_filename;
10407 const char *blocks_filename;
10408 const char *proplist30_filename;
10409 const char *eastasianwidth_filename;
10410 const char *linebreak_filename;
10411 const char *wordbreakproperty_filename;
10412 const char *graphemebreakproperty_filename;
10413 const char *compositionexclusions_filename;
10414 const char *specialcasing_filename;
10415 const char *casefolding_filename;
10416 const char *version;
10417
10418 if (argc != 16)
10419 {
10420 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
10421 argv[0]);
10422 exit (1);
10423 }
10424
10425 unicodedata_filename = argv[1];
10426 proplist_filename = argv[2];
10427 derivedproplist_filename = argv[3];
10428 arabicshaping_filename = argv[4];
10429 scripts_filename = argv[5];
10430 blocks_filename = argv[6];
10431 proplist30_filename = argv[7];
10432 eastasianwidth_filename = argv[8];
10433 linebreak_filename = argv[9];
10434 wordbreakproperty_filename = argv[10];
10435 graphemebreakproperty_filename = argv[11];
10436 compositionexclusions_filename = argv[12];
10437 specialcasing_filename = argv[13];
10438 casefolding_filename = argv[14];
10439 version = argv[15];
10440
10441 fill_attributes (unicodedata_filename);
10442 clear_properties ();
10443 fill_properties (proplist_filename);
10444 fill_properties (derivedproplist_filename);
10445 fill_properties30 (proplist30_filename);
10446 fill_arabicshaping (arabicshaping_filename);
10447 fill_scripts (scripts_filename);
10448 fill_blocks (blocks_filename);
10449 fill_width (eastasianwidth_filename);
10450 fill_org_lbp (linebreak_filename);
10451 fill_org_wbp (wordbreakproperty_filename);
10452 fill_org_gbp (graphemebreakproperty_filename);
10453 fill_composition_exclusions (compositionexclusions_filename);
10454 fill_casing_rules (specialcasing_filename);
10455 fill_casefolding_rules (casefolding_filename);
10456 redistribute_casefolding_rules ();
10457 sort_casing_rules ();
10458
10459 output_categories (version);
10460 output_category ("unictype/categ_of.h", version);
10461 output_combclass ("unictype/combiningclass.h", version);
10462 output_bidi_category ("unictype/bidi_of.h", version);
10463 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
10464 output_decimal_digit ("unictype/decdigit.h", version);
10465 output_digit_test ("../tests/unictype/test-digit.h", version);
10466 output_digit ("unictype/digit.h", version);
10467 output_numeric_test ("../tests/unictype/test-numeric.h", version);
10468 output_numeric ("unictype/numeric.h", version);
10469 output_mirror ("unictype/mirror.h", version);
10470 output_properties (version);
10471 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
10472 output_joining_type ("unictype/joiningtype_of.h", version);
10473 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
10474 output_joining_group ("unictype/joininggroup_of.h", version);
10475
10476 output_scripts (version);
10477 output_scripts_byname (version);
10478 output_blocks (version);
10479 output_ident_properties (version);
10480 output_nonspacing_property ("uniwidth/width.c.part");
10481 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
10482 output_old_ctype (version);
10483
10484 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
10485 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
10486 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
10487
10488 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
10489 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
10490 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
10491
10492 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
10493 output_gbp_table ("unigbrk/gbrkprop.h", version);
10494
10495 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
10496 debug_output_composition_tables ("uninorm/composition.txt");
10497 output_composition_tables ("uninorm/composition-table.gperf", version);
10498
10499 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
10500 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
10501 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
10502 output_simple_mapping ("unicase/toupper.h", to_upper, version);
10503 output_simple_mapping ("unicase/tolower.h", to_lower, version);
10504 output_simple_mapping ("unicase/totitle.h", to_title, version);
10505 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
10506 output_casing_rules ("unicase/special-casing-table.gperf", version);
10507 output_casing_properties (version);
10508
10509 return 0;
10510 }
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536