root/maint/gnulib/lib/gen-uni-tables.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. fill_attribute
  2. getfield
  3. fill_attributes
  4. output_library_license
  5. output_tests_license
  6. is_category_L
  7. is_category_LC
  8. is_category_Lu
  9. is_category_Ll
  10. is_category_Lt
  11. is_category_Lm
  12. is_category_Lo
  13. is_category_M
  14. is_category_Mn
  15. is_category_Mc
  16. is_category_Me
  17. is_category_N
  18. is_category_Nd
  19. is_category_Nl
  20. is_category_No
  21. is_category_P
  22. is_category_Pc
  23. is_category_Pd
  24. is_category_Ps
  25. is_category_Pe
  26. is_category_Pi
  27. is_category_Pf
  28. is_category_Po
  29. is_category_S
  30. is_category_Sm
  31. is_category_Sc
  32. is_category_Sk
  33. is_category_So
  34. is_category_Z
  35. is_category_Zs
  36. is_category_Zl
  37. is_category_Zp
  38. is_category_C
  39. is_category_Cc
  40. is_category_Cf
  41. is_category_Cs
  42. is_category_Co
  43. is_category_Cn
  44. debug_output_predicate
  45. output_predicate_test
  46. output_predicate
  47. output_categories
  48. general_category_byname
  49. output_category
  50. output_combclass
  51. bidi_category_byname
  52. get_bidi_category
  53. output_bidi_category
  54. get_decdigit_value
  55. output_decimal_digit_test
  56. output_decimal_digit
  57. get_digit_value
  58. output_digit_test
  59. output_digit
  60. get_numeric_value
  61. output_numeric_test
  62. output_numeric
  63. get_mirror_value
  64. output_mirror
  65. is_WBP_MIDNUMLET
  66. is_WBP_MIDLETTER
  67. clear_properties
  68. fill_properties
  69. fill_property30
  70. fill_properties30
  71. is_property_white_space
  72. is_property_alphabetic
  73. is_property_other_alphabetic
  74. is_property_not_a_character
  75. is_property_default_ignorable_code_point
  76. is_property_other_default_ignorable_code_point
  77. is_property_deprecated
  78. is_property_logical_order_exception
  79. is_property_variation_selector
  80. is_property_private_use
  81. is_property_unassigned_code_value
  82. is_property_uppercase
  83. is_property_other_uppercase
  84. is_property_lowercase
  85. is_property_other_lowercase
  86. is_property_titlecase
  87. is_property_cased
  88. is_property_case_ignorable
  89. is_property_changes_when_lowercased
  90. is_property_changes_when_uppercased
  91. is_property_changes_when_titlecased
  92. is_property_changes_when_casefolded
  93. is_property_changes_when_casemapped
  94. is_property_soft_dotted
  95. is_property_id_start
  96. is_property_other_id_start
  97. is_property_id_continue
  98. is_property_other_id_continue
  99. is_property_xid_start
  100. is_property_xid_continue
  101. is_property_pattern_white_space
  102. is_property_pattern_syntax
  103. is_property_join_control
  104. is_property_grapheme_base
  105. is_property_grapheme_extend
  106. is_property_other_grapheme_extend
  107. is_property_grapheme_link
  108. is_property_bidi_control
  109. is_property_bidi_left_to_right
  110. is_property_bidi_hebrew_right_to_left
  111. is_property_bidi_arabic_right_to_left
  112. is_property_bidi_european_digit
  113. is_property_bidi_eur_num_separator
  114. is_property_bidi_eur_num_terminator
  115. is_property_bidi_arabic_digit
  116. is_property_bidi_common_separator
  117. is_property_bidi_block_separator
  118. is_property_bidi_segment_separator
  119. is_property_bidi_whitespace
  120. is_property_bidi_non_spacing_mark
  121. is_property_bidi_boundary_neutral
  122. is_property_bidi_pdf
  123. is_property_bidi_embedding_or_override
  124. is_property_bidi_other_neutral
  125. is_property_hex_digit
  126. is_property_ascii_hex_digit
  127. is_property_ideographic
  128. is_property_unified_ideograph
  129. is_property_radical
  130. is_property_ids_binary_operator
  131. is_property_ids_trinary_operator
  132. is_property_zero_width
  133. is_property_space
  134. is_property_non_break
  135. is_property_iso_control
  136. is_property_format_control
  137. is_property_dash
  138. is_property_hyphen
  139. is_property_punctuation
  140. is_property_line_separator
  141. is_property_paragraph_separator
  142. is_property_quotation_mark
  143. is_property_sentence_terminal
  144. is_property_terminal_punctuation
  145. is_property_currency_symbol
  146. is_property_math
  147. is_property_other_math
  148. is_property_paired_punctuation
  149. is_property_left_of_pair
  150. is_property_combining
  151. is_property_non_spacing
  152. is_property_composite
  153. is_property_decimal_digit
  154. is_property_numeric
  155. is_property_diacritic
  156. is_property_extender
  157. is_property_ignorable_control
  158. output_properties
  159. fill_arabicshaping
  160. joining_type_as_c_identifier
  161. output_joining_type_test
  162. output_joining_type
  163. joining_group_as_c_identifier
  164. output_joining_group_test
  165. output_joining_group
  166. fill_scripts
  167. output_scripts
  168. output_scripts_byname
  169. fill_blocks
  170. block_first_index
  171. block_last_index
  172. output_blocks
  173. is_c_whitespace
  174. c_ident_category
  175. is_java_whitespace
  176. java_ident_category
  177. output_ident_category
  178. output_ident_properties
  179. to_upper
  180. to_lower
  181. to_title
  182. is_upper
  183. is_lower
  184. is_alpha
  185. is_digit
  186. is_alnum
  187. is_blank
  188. is_space
  189. is_cntrl
  190. is_xdigit
  191. is_graph
  192. is_print
  193. is_punct
  194. output_old_ctype
  195. is_combining
  196. is_combining_level3
  197. ucs_symbol
  198. ucs_symbol_range
  199. output_charclass
  200. output_charmap
  201. output_widthmap
  202. output_tables
  203. fill_width
  204. is_nonspacing
  205. output_nonspacing_property
  206. symbolic_width
  207. output_width_property_test
  208. get_lbp
  209. debug_output_lbp
  210. debug_output_lbrk_tables
  211. fill_org_lbp
  212. debug_output_org_lbp
  213. debug_output_org_lbrk_tables
  214. output_lbp
  215. output_lbrk_tables
  216. get_wbp
  217. debug_output_wbp
  218. debug_output_wbrk_tables
  219. fill_org_wbp
  220. debug_output_org_wbp
  221. debug_output_org_wbrk_tables
  222. output_wbp
  223. output_wbrk_tables
  224. output_gbp_test
  225. output_gbp_table
  226. fill_org_gbp
  227. get_decomposition
  228. output_decomposition
  229. output_decomposition_tables
  230. fill_composition_exclusions
  231. debug_output_composition_tables
  232. output_composition_tables
  233. output_simple_mapping_test
  234. output_simple_mapping
  235. add_casing_rule
  236. fill_casing_rules
  237. fill_casefolding_rules
  238. to_casefold
  239. redistribute_casefolding_rules
  240. compare_casing_rules
  241. sort_casing_rules
  242. output_casing_rules
  243. is_cased
  244. is_case_ignorable
  245. output_casing_properties
  246. main

   1 /* Generate Unicode conforming character classification tables and
   2    line break properties tables and word break property tables and
   3    decomposition/composition and case mapping tables from a UnicodeData file.
   4    Copyright (C) 2000-2002, 2004, 2007-2021 Free Software Foundation, Inc.
   5    Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
   6 
   7    This program is free software: you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11 
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16 
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  19 
  20 /* Usage example:
  21      $ gen-uni-tables /usr/local/share/www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt \
  22                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/PropList.txt \
  23                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/DerivedCoreProperties.txt \
  24                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/ArabicShaping.txt \
  25                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/Scripts.txt \
  26                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/Blocks.txt \
  27                       /usr/local/share/www.unicode.org/Public/3.0-Update1/PropList-3.0.1.txt \
  28                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/EastAsianWidth.txt \
  29                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/LineBreak.txt \
  30                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakProperty.txt \
  31                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
  32                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/CompositionExclusions.txt \
  33                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/SpecialCasing.txt \
  34                       /usr/local/share/www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt \
  35                       9.0.0
  36  */
  37 
  38 #include <assert.h>
  39 #include <stdbool.h>
  40 #include <stdint.h>
  41 #include <stdio.h>
  42 #include <stdlib.h>
  43 #include <string.h>
  44 #include <time.h>
  45 
  46 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  47 
  48 /* ========================================================================= */
  49 
  50 /* Reading UnicodeData.txt.  */
  51 /* See UCD.html.  */
  52 
  53 /* This structure represents one line in the UnicodeData.txt file.  */
  54 struct unicode_attribute
  55 {
  56   const char *name;           /* Character name */
  57   const char *category;       /* General category */
  58   const char *combining;      /* Canonical combining class */
  59   const char *bidi;           /* Bidirectional category */
  60   const char *decomposition;  /* Character decomposition mapping */
  61   const char *decdigit;       /* Decimal digit value */
  62   const char *digit;          /* Digit value */
  63   const char *numeric;        /* Numeric value */
  64   bool mirrored;              /* mirrored */
  65   const char *oldname;        /* Old Unicode 1.0 name */
  66   const char *comment;        /* Comment */
  67   unsigned int upper;         /* Uppercase mapping */
  68   unsigned int lower;         /* Lowercase mapping */
  69   unsigned int title;         /* Titlecase mapping */
  70 };
  71 
  72 /* Missing fields are represented with "" for strings, and NONE for
  73    characters.  */
  74 #define NONE (~(unsigned int)0)
  75 
  76 /* The entire contents of the UnicodeData.txt file.  */
  77 struct unicode_attribute unicode_attributes [0x110000];
  78 
  79 /* Stores in unicode_attributes[i] the values from the given fields.  */
  80 static void
  81 fill_attribute (unsigned int i,
     /* [previous][next][first][last][top][bottom][index][help] */
  82                 const char *field1, const char *field2,
  83                 const char *field3, const char *field4,
  84                 const char *field5, const char *field6,
  85                 const char *field7, const char *field8,
  86                 const char *field9, const char *field10,
  87                 const char *field11, const char *field12,
  88                 const char *field13, const char *field14)
  89 {
  90   struct unicode_attribute * uni;
  91 
  92   if (i >= 0x110000)
  93     {
  94       fprintf (stderr, "index too large\n");
  95       exit (1);
  96     }
  97   if (strcmp (field2, "Cs") == 0)
  98     /* Surrogates are UTF-16 artifacts, not real characters. Ignore them.  */
  99     return;
 100   uni = &unicode_attributes[i];
 101   /* Copy the strings.  */
 102   uni->name          = strdup (field1);
 103   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
 104   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
 105   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
 106   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
 107   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
 108   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
 109   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
 110   uni->mirrored      = (field9[0] == 'Y');
 111   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
 112   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
 113   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
 114   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
 115   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
 116 }
 117 
 118 /* Maximum length of a field in the UnicodeData.txt file.  */
 119 #define FIELDLEN 160
 120 
 121 /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
 122    Reads up to (but excluding) DELIM.
 123    Returns 1 when a field was successfully read, otherwise 0.  */
 124 static int
 125 getfield (FILE *stream, char *buffer, int delim)
     /* [previous][next][first][last][top][bottom][index][help] */
 126 {
 127   int count = 0;
 128   int c;
 129 
 130   for (; (c = getc (stream)), (c != EOF && c != delim); )
 131     {
 132       /* The original unicode.org UnicodeData.txt file happens to have
 133          CR/LF line terminators.  Silently convert to LF.  */
 134       if (c == '\r')
 135         continue;
 136 
 137       /* Put c into the buffer.  */
 138       if (++count >= FIELDLEN - 1)
 139         {
 140           fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
 141           exit (1);
 142         }
 143       *buffer++ = c;
 144     }
 145 
 146   if (c == EOF)
 147     return 0;
 148 
 149   *buffer = '\0';
 150   return 1;
 151 }
 152 
 153 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
 154    file.  */
 155 static void
 156 fill_attributes (const char *unicodedata_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
 157 {
 158   unsigned int i, j;
 159   FILE *stream;
 160   char field0[FIELDLEN];
 161   char field1[FIELDLEN];
 162   char field2[FIELDLEN];
 163   char field3[FIELDLEN];
 164   char field4[FIELDLEN];
 165   char field5[FIELDLEN];
 166   char field6[FIELDLEN];
 167   char field7[FIELDLEN];
 168   char field8[FIELDLEN];
 169   char field9[FIELDLEN];
 170   char field10[FIELDLEN];
 171   char field11[FIELDLEN];
 172   char field12[FIELDLEN];
 173   char field13[FIELDLEN];
 174   char field14[FIELDLEN];
 175   int lineno = 0;
 176 
 177   for (i = 0; i < 0x110000; i++)
 178     unicode_attributes[i].name = NULL;
 179 
 180   stream = fopen (unicodedata_filename, "r");
 181   if (stream == NULL)
 182     {
 183       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
 184       exit (1);
 185     }
 186 
 187   for (;;)
 188     {
 189       int n;
 190 
 191       lineno++;
 192       n = getfield (stream, field0, ';');
 193       n += getfield (stream, field1, ';');
 194       n += getfield (stream, field2, ';');
 195       n += getfield (stream, field3, ';');
 196       n += getfield (stream, field4, ';');
 197       n += getfield (stream, field5, ';');
 198       n += getfield (stream, field6, ';');
 199       n += getfield (stream, field7, ';');
 200       n += getfield (stream, field8, ';');
 201       n += getfield (stream, field9, ';');
 202       n += getfield (stream, field10, ';');
 203       n += getfield (stream, field11, ';');
 204       n += getfield (stream, field12, ';');
 205       n += getfield (stream, field13, ';');
 206       n += getfield (stream, field14, '\n');
 207       if (n == 0)
 208         break;
 209       if (n != 15)
 210         {
 211           fprintf (stderr, "short line in '%s':%d\n",
 212                    unicodedata_filename, lineno);
 213           exit (1);
 214         }
 215       i = strtoul (field0, NULL, 16);
 216       if (field1[0] == '<'
 217           && strlen (field1) >= 9
 218           && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
 219         {
 220           /* Deal with a range. */
 221           lineno++;
 222           n = getfield (stream, field0, ';');
 223           n += getfield (stream, field1, ';');
 224           n += getfield (stream, field2, ';');
 225           n += getfield (stream, field3, ';');
 226           n += getfield (stream, field4, ';');
 227           n += getfield (stream, field5, ';');
 228           n += getfield (stream, field6, ';');
 229           n += getfield (stream, field7, ';');
 230           n += getfield (stream, field8, ';');
 231           n += getfield (stream, field9, ';');
 232           n += getfield (stream, field10, ';');
 233           n += getfield (stream, field11, ';');
 234           n += getfield (stream, field12, ';');
 235           n += getfield (stream, field13, ';');
 236           n += getfield (stream, field14, '\n');
 237           if (n != 15)
 238             {
 239               fprintf (stderr, "missing end range in '%s':%d\n",
 240                        unicodedata_filename, lineno);
 241               exit (1);
 242             }
 243           if (!(field1[0] == '<'
 244                 && strlen (field1) >= 8
 245                 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
 246             {
 247               fprintf (stderr, "missing end range in '%s':%d\n",
 248                        unicodedata_filename, lineno);
 249               exit (1);
 250             }
 251           field1[strlen (field1) - 7] = '\0';
 252           j = strtoul (field0, NULL, 16);
 253           for (; i <= j; i++)
 254             fill_attribute (i, field1+1, field2, field3, field4, field5,
 255                                field6, field7, field8, field9, field10,
 256                                field11, field12, field13, field14);
 257         }
 258       else
 259         {
 260           /* Single character line */
 261           fill_attribute (i, field1, field2, field3, field4, field5,
 262                              field6, field7, field8, field9, field10,
 263                              field11, field12, field13, field14);
 264         }
 265     }
 266 
 267   if (ferror (stream) || fclose (stream))
 268     {
 269       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
 270       exit (1);
 271     }
 272 }
 273 
 274 /* ========================================================================= */
 275 
 276 /* Output the license notice for a library file.
 277    This closes an open C syntax comment.  */
 278 static void
 279 output_library_license (FILE *stream, bool lgplv2plus)
     /* [previous][next][first][last][top][bottom][index][help] */
 280 {
 281   if (lgplv2plus)
 282     {
 283       /* These Gnulib modules are under the LGPLv2+ license.  */
 284       fprintf (stream, "   This file is free software: you can redistribute it and/or modify\n");
 285       fprintf (stream, "   it under the terms of the GNU Lesser General Public License as\n");
 286       fprintf (stream, "   published by the Free Software Foundation; either version 2.1 of the\n");
 287       fprintf (stream, "   License, or (at your option) any later version.\n");
 288       fprintf (stream, "\n");
 289       fprintf (stream, "   This file is distributed in the hope that it will be useful,\n");
 290       fprintf (stream, "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
 291       fprintf (stream, "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n");
 292       fprintf (stream, "   GNU Lesser General Public License for more details.\n");
 293       fprintf (stream, "\n");
 294       fprintf (stream, "   You should have received a copy of the GNU Lesser General Public License\n");
 295       fprintf (stream, "   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */\n");
 296     }
 297   else
 298     {
 299       /* These Gnulib modules are under the 'LGPLv3+ or GPLv2+' license.  */
 300       fprintf (stream, "   This program is free software.\n");
 301       fprintf (stream, "   It is dual-licensed under \"the GNU LGPLv3+ or the GNU GPLv2+\".\n");
 302       fprintf (stream, "   You can redistribute it and/or modify it under either\n");
 303       fprintf (stream, "     - the terms of the GNU Lesser General Public License as published\n");
 304       fprintf (stream, "       by the Free Software Foundation; either version 3, or (at your\n");
 305       fprintf (stream, "       option) any later version, or\n");
 306       fprintf (stream, "     - the terms of the GNU General Public License as published by the\n");
 307       fprintf (stream, "       Free Software Foundation; either version 2, or (at your option)\n");
 308       fprintf (stream, "       any later version, or\n");
 309       fprintf (stream, "     - the same dual license \"the GNU LGPLv3+ or the GNU GPLv2+\".\n");
 310       fprintf (stream, "\n");
 311       fprintf (stream, "   This program is distributed in the hope that it will be useful,\n");
 312       fprintf (stream, "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
 313       fprintf (stream, "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n");
 314       fprintf (stream, "   Lesser General Public License and the GNU General Public License\n");
 315       fprintf (stream, "   for more details.\n");
 316       fprintf (stream, "\n");
 317       fprintf (stream, "   You should have received a copy of the GNU Lesser General Public\n");
 318       fprintf (stream, "   License and of the GNU General Public License along with this\n");
 319       fprintf (stream, "   program; if not, see <https://www.gnu.org/licenses/>.  */\n");
 320     }
 321 }
 322 
 323 /* Output the license notice for a tests file.
 324    This closes an open C syntax comment.  */
 325 static void
 326 output_tests_license (FILE *stream)
     /* [previous][next][first][last][top][bottom][index][help] */
 327 {
 328   /* Gnulib tests modules are under the GPLv3+ license.  */
 329   fprintf (stream, "   This program is free software: you can redistribute it and/or modify\n");
 330   fprintf (stream, "   it under the terms of the GNU General Public License as published by\n");
 331   fprintf (stream, "   the Free Software Foundation; either version 3 of the License, or\n");
 332   fprintf (stream, "   (at your option) any later version.\n");
 333   fprintf (stream, "\n");
 334   fprintf (stream, "   This program is distributed in the hope that it will be useful,\n");
 335   fprintf (stream, "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
 336   fprintf (stream, "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n");
 337   fprintf (stream, "   GNU General Public License for more details.\n");
 338   fprintf (stream, "\n");
 339   fprintf (stream, "   You should have received a copy of the GNU General Public License\n");
 340   fprintf (stream, "   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */\n");
 341 }
 342 
 343 /* ========================================================================= */
 344 
 345 /* General category.  */
 346 /* See Unicode 3.0 book, section 4.5,
 347        UCD.html.  */
 348 
 349 static bool
 350 is_category_L (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 351 {
 352   return (unicode_attributes[ch].name != NULL
 353           && unicode_attributes[ch].category[0] == 'L');
 354 }
 355 
 356 static bool
 357 is_category_LC (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 358 {
 359   /* See PropertyValueAliases.txt.  */
 360   return (unicode_attributes[ch].name != NULL
 361           && unicode_attributes[ch].category[0] == 'L'
 362           && (unicode_attributes[ch].category[1] == 'u'
 363               || unicode_attributes[ch].category[1] == 'l'
 364               || unicode_attributes[ch].category[1] == 't'));
 365 }
 366 
 367 static bool
 368 is_category_Lu (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 369 {
 370   return (unicode_attributes[ch].name != NULL
 371           && unicode_attributes[ch].category[0] == 'L'
 372           && unicode_attributes[ch].category[1] == 'u');
 373 }
 374 
 375 static bool
 376 is_category_Ll (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 377 {
 378   return (unicode_attributes[ch].name != NULL
 379           && unicode_attributes[ch].category[0] == 'L'
 380           && unicode_attributes[ch].category[1] == 'l');
 381 }
 382 
 383 static bool
 384 is_category_Lt (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 385 {
 386   return (unicode_attributes[ch].name != NULL
 387           && unicode_attributes[ch].category[0] == 'L'
 388           && unicode_attributes[ch].category[1] == 't');
 389 }
 390 
 391 static bool
 392 is_category_Lm (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 393 {
 394   return (unicode_attributes[ch].name != NULL
 395           && unicode_attributes[ch].category[0] == 'L'
 396           && unicode_attributes[ch].category[1] == 'm');
 397 }
 398 
 399 static bool
 400 is_category_Lo (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 401 {
 402   return (unicode_attributes[ch].name != NULL
 403           && unicode_attributes[ch].category[0] == 'L'
 404           && unicode_attributes[ch].category[1] == 'o');
 405 }
 406 
 407 static bool
 408 is_category_M (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 409 {
 410   return (unicode_attributes[ch].name != NULL
 411           && unicode_attributes[ch].category[0] == 'M');
 412 }
 413 
 414 static bool
 415 is_category_Mn (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 416 {
 417   return (unicode_attributes[ch].name != NULL
 418           && unicode_attributes[ch].category[0] == 'M'
 419           && unicode_attributes[ch].category[1] == 'n');
 420 }
 421 
 422 static bool
 423 is_category_Mc (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 424 {
 425   return (unicode_attributes[ch].name != NULL
 426           && unicode_attributes[ch].category[0] == 'M'
 427           && unicode_attributes[ch].category[1] == 'c');
 428 }
 429 
 430 static bool
 431 is_category_Me (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 432 {
 433   return (unicode_attributes[ch].name != NULL
 434           && unicode_attributes[ch].category[0] == 'M'
 435           && unicode_attributes[ch].category[1] == 'e');
 436 }
 437 
 438 static bool
 439 is_category_N (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 440 {
 441   return (unicode_attributes[ch].name != NULL
 442           && unicode_attributes[ch].category[0] == 'N');
 443 }
 444 
 445 static bool
 446 is_category_Nd (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 447 {
 448   return (unicode_attributes[ch].name != NULL
 449           && unicode_attributes[ch].category[0] == 'N'
 450           && unicode_attributes[ch].category[1] == 'd');
 451 }
 452 
 453 static bool
 454 is_category_Nl (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 455 {
 456   return (unicode_attributes[ch].name != NULL
 457           && unicode_attributes[ch].category[0] == 'N'
 458           && unicode_attributes[ch].category[1] == 'l');
 459 }
 460 
 461 static bool
 462 is_category_No (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 463 {
 464   return (unicode_attributes[ch].name != NULL
 465           && unicode_attributes[ch].category[0] == 'N'
 466           && unicode_attributes[ch].category[1] == 'o');
 467 }
 468 
 469 static bool
 470 is_category_P (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 471 {
 472   return (unicode_attributes[ch].name != NULL
 473           && unicode_attributes[ch].category[0] == 'P');
 474 }
 475 
 476 static bool
 477 is_category_Pc (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 478 {
 479   return (unicode_attributes[ch].name != NULL
 480           && unicode_attributes[ch].category[0] == 'P'
 481           && unicode_attributes[ch].category[1] == 'c');
 482 }
 483 
 484 static bool
 485 is_category_Pd (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 486 {
 487   return (unicode_attributes[ch].name != NULL
 488           && unicode_attributes[ch].category[0] == 'P'
 489           && unicode_attributes[ch].category[1] == 'd');
 490 }
 491 
 492 static bool
 493 is_category_Ps (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 494 {
 495   return (unicode_attributes[ch].name != NULL
 496           && unicode_attributes[ch].category[0] == 'P'
 497           && unicode_attributes[ch].category[1] == 's');
 498 }
 499 
 500 static bool
 501 is_category_Pe (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 502 {
 503   return (unicode_attributes[ch].name != NULL
 504           && unicode_attributes[ch].category[0] == 'P'
 505           && unicode_attributes[ch].category[1] == 'e');
 506 }
 507 
 508 static bool
 509 is_category_Pi (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 510 {
 511   return (unicode_attributes[ch].name != NULL
 512           && unicode_attributes[ch].category[0] == 'P'
 513           && unicode_attributes[ch].category[1] == 'i');
 514 }
 515 
 516 static bool
 517 is_category_Pf (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 518 {
 519   return (unicode_attributes[ch].name != NULL
 520           && unicode_attributes[ch].category[0] == 'P'
 521           && unicode_attributes[ch].category[1] == 'f');
 522 }
 523 
 524 static bool
 525 is_category_Po (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 526 {
 527   return (unicode_attributes[ch].name != NULL
 528           && unicode_attributes[ch].category[0] == 'P'
 529           && unicode_attributes[ch].category[1] == 'o');
 530 }
 531 
 532 static bool
 533 is_category_S (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 534 {
 535   return (unicode_attributes[ch].name != NULL
 536           && unicode_attributes[ch].category[0] == 'S');
 537 }
 538 
 539 static bool
 540 is_category_Sm (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 541 {
 542   return (unicode_attributes[ch].name != NULL
 543           && unicode_attributes[ch].category[0] == 'S'
 544           && unicode_attributes[ch].category[1] == 'm');
 545 }
 546 
 547 static bool
 548 is_category_Sc (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 549 {
 550   return (unicode_attributes[ch].name != NULL
 551           && unicode_attributes[ch].category[0] == 'S'
 552           && unicode_attributes[ch].category[1] == 'c');
 553 }
 554 
 555 static bool
 556 is_category_Sk (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 557 {
 558   return (unicode_attributes[ch].name != NULL
 559           && unicode_attributes[ch].category[0] == 'S'
 560           && unicode_attributes[ch].category[1] == 'k');
 561 }
 562 
 563 static bool
 564 is_category_So (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 565 {
 566   return (unicode_attributes[ch].name != NULL
 567           && unicode_attributes[ch].category[0] == 'S'
 568           && unicode_attributes[ch].category[1] == 'o');
 569 }
 570 
 571 static bool
 572 is_category_Z (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 573 {
 574   return (unicode_attributes[ch].name != NULL
 575           && unicode_attributes[ch].category[0] == 'Z');
 576 }
 577 
 578 static bool
 579 is_category_Zs (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 580 {
 581   return (unicode_attributes[ch].name != NULL
 582           && unicode_attributes[ch].category[0] == 'Z'
 583           && unicode_attributes[ch].category[1] == 's');
 584 }
 585 
 586 static bool
 587 is_category_Zl (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 588 {
 589   return (unicode_attributes[ch].name != NULL
 590           && unicode_attributes[ch].category[0] == 'Z'
 591           && unicode_attributes[ch].category[1] == 'l');
 592 }
 593 
 594 static bool
 595 is_category_Zp (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 596 {
 597   return (unicode_attributes[ch].name != NULL
 598           && unicode_attributes[ch].category[0] == 'Z'
 599           && unicode_attributes[ch].category[1] == 'p');
 600 }
 601 
 602 static bool
 603 is_category_C (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 604 {
 605   return (unicode_attributes[ch].name == NULL
 606           || unicode_attributes[ch].category[0] == 'C');
 607 }
 608 
 609 static bool
 610 is_category_Cc (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 611 {
 612   return (unicode_attributes[ch].name != NULL
 613           && unicode_attributes[ch].category[0] == 'C'
 614           && unicode_attributes[ch].category[1] == 'c');
 615 }
 616 
 617 static bool
 618 is_category_Cf (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 619 {
 620   return (unicode_attributes[ch].name != NULL
 621           && unicode_attributes[ch].category[0] == 'C'
 622           && unicode_attributes[ch].category[1] == 'f');
 623 }
 624 
 625 static bool
 626 is_category_Cs (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 627 {
 628   return (ch >= 0xd800 && ch < 0xe000);
 629 }
 630 
 631 static bool
 632 is_category_Co (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 633 {
 634   return (unicode_attributes[ch].name != NULL
 635           && unicode_attributes[ch].category[0] == 'C'
 636           && unicode_attributes[ch].category[1] == 'o');
 637 }
 638 
 639 static bool
 640 is_category_Cn (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
 641 {
 642   return (unicode_attributes[ch].name == NULL
 643           && !(ch >= 0xd800 && ch < 0xe000));
 644 }
 645 
 646 /* Output a boolean property in a human readable format.  */
 647 static void
 648 debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
     /* [previous][next][first][last][top][bottom][index][help] */
 649 {
 650   FILE *stream;
 651   unsigned int ch;
 652 
 653   stream = fopen (filename, "w");
 654   if (stream == NULL)
 655     {
 656       fprintf (stderr, "cannot open '%s' for writing\n", filename);
 657       exit (1);
 658     }
 659 
 660 #if 0 /* This yields huge text output.  */
 661   for (ch = 0; ch < 0x110000; ch++)
 662     if (predicate (ch))
 663       {
 664         fprintf (stream, "0x%04X\n", ch);
 665       }
 666 #else
 667   for (ch = 0; ch < 0x110000; ch++)
 668     if (predicate (ch))
 669       {
 670         unsigned int first = ch;
 671         unsigned int last;
 672 
 673         while (ch + 1 < 0x110000 && predicate (ch + 1))
 674           ch++;
 675         last = ch;
 676         if (first < last)
 677           fprintf (stream, "0x%04X..0x%04X\n", first, last);
 678         else
 679           fprintf (stream, "0x%04X\n", ch);
 680       }
 681 #endif
 682 
 683   if (ferror (stream) || fclose (stream))
 684     {
 685       fprintf (stderr, "error writing to '%s'\n", filename);
 686       exit (1);
 687     }
 688 }
 689 
 690 /* Output the unit test for a boolean property.  */
 691 static void
 692 output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
     /* [previous][next][first][last][top][bottom][index][help] */
 693 {
 694   FILE *stream;
 695   bool need_comma;
 696   unsigned int ch;
 697 
 698   stream = fopen (filename, "w");
 699   if (stream == NULL)
 700     {
 701       fprintf (stderr, "cannot open '%s' for writing\n", filename);
 702       exit (1);
 703     }
 704 
 705   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
 706   fprintf (stream, "/* Test the Unicode character type functions.\n");
 707   fprintf (stream, "   Copyright (C) 2007 Free Software Foundation, Inc.\n");
 708   fprintf (stream, "\n");
 709   output_tests_license (stream);
 710   fprintf (stream, "\n");
 711   fprintf (stream, "#include \"test-predicate-part1.h\"\n");
 712   fprintf (stream, "\n");
 713 
 714   need_comma = false;
 715   for (ch = 0; ch < 0x110000; ch++)
 716     if (predicate (ch))
 717       {
 718         unsigned int first = ch;
 719         unsigned int last;
 720 
 721         while (ch + 1 < 0x110000 && predicate (ch + 1))
 722           ch++;
 723         last = ch;
 724         if (need_comma)
 725           fprintf (stream, ",\n");
 726         fprintf (stream, "    { 0x%04X, 0x%04X }", first, last);
 727         need_comma = true;
 728       }
 729   if (need_comma)
 730     fprintf (stream, "\n");
 731 
 732   fprintf (stream, "\n");
 733   fprintf (stream, "#define PREDICATE(c) %s\n", expression);
 734   fprintf (stream, "#include \"test-predicate-part2.h\"\n");
 735 
 736   if (ferror (stream) || fclose (stream))
 737     {
 738       fprintf (stderr, "error writing to '%s'\n", filename);
 739       exit (1);
 740     }
 741 }
 742 
/* Construction of sparse 3-level tables.
   The macros below are set up before including the template header.
   output_predicate relies on this instantiation for struct predicate_table
   and for predicate_table_init, predicate_table_add and
   predicate_table_finalize (presumably all derived from TABLE by
   "3levelbit.h" -- confirm against that header).
   xmalloc and xrealloc are mapped to plain malloc and realloc.  */
#define TABLE predicate_table
#define xmalloc malloc
#define xrealloc realloc
#include "3levelbit.h"
 748 
 749 /* Output a boolean property in a three-level bitmap.  */
 750 static void
 751 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
 752 {
 753   FILE *stream;
 754   unsigned int ch, i;
 755   struct predicate_table t;
 756   unsigned int level1_offset, level2_offset, level3_offset;
 757 
 758   stream = fopen (filename, "w");
 759   if (stream == NULL)
 760     {
 761       fprintf (stderr, "cannot open '%s' for writing\n", filename);
 762       exit (1);
 763     }
 764 
 765   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
 766   fprintf (stream, "/* %s of Unicode characters.  */\n", comment);
 767   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
 768            version);
 769   fprintf (stream, "\n");
 770 
 771   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
 772   fprintf (stream, "\n");
 773   output_library_license (stream, strcmp (filename, "unictype/categ_M.h") == 0);
 774   fprintf (stream, "\n");
 775 
 776   t.p = 4; /* or: 5 */
 777   t.q = 7; /* or: 6 */
 778   predicate_table_init (&t);
 779 
 780   for (ch = 0; ch < 0x110000; ch++)
 781     if (predicate (ch))
 782       predicate_table_add (&t, ch);
 783 
 784   predicate_table_finalize (&t);
 785 
 786   /* Offsets in t.result, in memory of this process.  */
 787   level1_offset =
 788     5 * sizeof (uint32_t);
 789   level2_offset =
 790     5 * sizeof (uint32_t)
 791     + t.level1_size * sizeof (uint32_t);
 792   level3_offset =
 793     5 * sizeof (uint32_t)
 794     + t.level1_size * sizeof (uint32_t)
 795     + (t.level2_size << t.q) * sizeof (uint32_t);
 796 
 797   for (i = 0; i < 5; i++)
 798     if (i != 1)
 799       fprintf (stream, "#define header_%d %d\n", i,
 800                ((uint32_t *) t.result)[i]);
 801 
 802   fprintf (stream, "static const\n");
 803   fprintf (stream, "struct\n");
 804   fprintf (stream, "  {\n");
 805   fprintf (stream, "    int header[1];\n");
 806   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
 807   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
 808   fprintf (stream, "    unsigned int level3[%zu << %d];\n", t.level3_size, t.p);
 809   fprintf (stream, "  }\n");
 810   fprintf (stream, "%s =\n", name);
 811   fprintf (stream, "{\n");
 812   fprintf (stream, "  { %d },\n", ((uint32_t *) t.result)[1]);
 813   fprintf (stream, "  {");
 814   if (t.level1_size > 1)
 815     fprintf (stream, "\n   ");
 816   for (i = 0; i < t.level1_size; i++)
 817     {
 818       uint32_t offset;
 819       if (i > 0 && (i % 1) == 0)
 820         fprintf (stream, "\n   ");
 821       offset = ((uint32_t *) (t.result + level1_offset))[i];
 822       if (offset == 0)
 823         fprintf (stream, " %5d", -1);
 824       else
 825         fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
 826                  1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
 827       if (i+1 < t.level1_size)
 828         fprintf (stream, ",");
 829     }
 830   if (t.level1_size > 1)
 831     fprintf (stream, "\n ");
 832   fprintf (stream, " },\n");
 833   fprintf (stream, "  {");
 834   if (t.level2_size << t.q > 1)
 835     fprintf (stream, "\n   ");
 836   for (i = 0; i < t.level2_size << t.q; i++)
 837     {
 838       uint32_t offset;
 839       if (i > 0 && (i % 1) == 0)
 840         fprintf (stream, "\n   ");
 841       offset = ((uint32_t *) (t.result + level2_offset))[i];
 842       if (offset == 0)
 843         fprintf (stream, " %5d", -1);
 844       else
 845         fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
 846                  1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
 847       if (i+1 < t.level2_size << t.q)
 848         fprintf (stream, ",");
 849     }
 850   if (t.level2_size << t.q > 1)
 851     fprintf (stream, "\n ");
 852   fprintf (stream, " },\n");
 853   fprintf (stream, "  {");
 854   if (t.level3_size << t.p > 4)
 855     fprintf (stream, "\n   ");
 856   for (i = 0; i < t.level3_size << t.p; i++)
 857     {
 858       if (i > 0 && (i % 4) == 0)
 859         fprintf (stream, "\n   ");
 860       fprintf (stream, " 0x%08XU",
 861                ((uint32_t *) (t.result + level3_offset))[i]);
 862       if (i+1 < t.level3_size << t.p)
 863         fprintf (stream, ",");
 864     }
 865   if (t.level3_size << t.p > 4)
 866     fprintf (stream, "\n ");
 867   fprintf (stream, " }\n");
 868   fprintf (stream, "};\n");
 869 
 870   if (ferror (stream) || fclose (stream))
 871     {
 872       fprintf (stderr, "error writing to '%s'\n", filename);
 873       exit (1);
 874     }
 875 }
 876 
 877 /* Output all categories.  */
 878 static void
 879 output_categories (const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
 880 {
 881 #define CATEGORY(C) \
 882   debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
 883   output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
 884   output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
 885   CATEGORY (L)
 886   CATEGORY (LC)
 887   CATEGORY (Lu)
 888   CATEGORY (Ll)
 889   CATEGORY (Lt)
 890   CATEGORY (Lm)
 891   CATEGORY (Lo)
 892   CATEGORY (M)
 893   CATEGORY (Mn)
 894   CATEGORY (Mc)
 895   CATEGORY (Me)
 896   CATEGORY (N)
 897   CATEGORY (Nd)
 898   CATEGORY (Nl)
 899   CATEGORY (No)
 900   CATEGORY (P)
 901   CATEGORY (Pc)
 902   CATEGORY (Pd)
 903   CATEGORY (Ps)
 904   CATEGORY (Pe)
 905   CATEGORY (Pi)
 906   CATEGORY (Pf)
 907   CATEGORY (Po)
 908   CATEGORY (S)
 909   CATEGORY (Sm)
 910   CATEGORY (Sc)
 911   CATEGORY (Sk)
 912   CATEGORY (So)
 913   CATEGORY (Z)
 914   CATEGORY (Zs)
 915   CATEGORY (Zl)
 916   CATEGORY (Zp)
 917   CATEGORY (C)
 918   CATEGORY (Cc)
 919   CATEGORY (Cf)
 920   CATEGORY (Cs)
 921   CATEGORY (Co)
 922   CATEGORY (Cn)
 923 #undef CATEGORY
 924 }
 925 
 926 enum
 927 {
 928   UC_CATEGORY_MASK_L  = 0x0000001f,
 929   UC_CATEGORY_MASK_LC = 0x00000007,
 930   UC_CATEGORY_MASK_Lu = 0x00000001,
 931   UC_CATEGORY_MASK_Ll = 0x00000002,
 932   UC_CATEGORY_MASK_Lt = 0x00000004,
 933   UC_CATEGORY_MASK_Lm = 0x00000008,
 934   UC_CATEGORY_MASK_Lo = 0x00000010,
 935   UC_CATEGORY_MASK_M  = 0x000000e0,
 936   UC_CATEGORY_MASK_Mn = 0x00000020,
 937   UC_CATEGORY_MASK_Mc = 0x00000040,
 938   UC_CATEGORY_MASK_Me = 0x00000080,
 939   UC_CATEGORY_MASK_N  = 0x00000700,
 940   UC_CATEGORY_MASK_Nd = 0x00000100,
 941   UC_CATEGORY_MASK_Nl = 0x00000200,
 942   UC_CATEGORY_MASK_No = 0x00000400,
 943   UC_CATEGORY_MASK_P  = 0x0003f800,
 944   UC_CATEGORY_MASK_Pc = 0x00000800,
 945   UC_CATEGORY_MASK_Pd = 0x00001000,
 946   UC_CATEGORY_MASK_Ps = 0x00002000,
 947   UC_CATEGORY_MASK_Pe = 0x00004000,
 948   UC_CATEGORY_MASK_Pi = 0x00008000,
 949   UC_CATEGORY_MASK_Pf = 0x00010000,
 950   UC_CATEGORY_MASK_Po = 0x00020000,
 951   UC_CATEGORY_MASK_S  = 0x003c0000,
 952   UC_CATEGORY_MASK_Sm = 0x00040000,
 953   UC_CATEGORY_MASK_Sc = 0x00080000,
 954   UC_CATEGORY_MASK_Sk = 0x00100000,
 955   UC_CATEGORY_MASK_So = 0x00200000,
 956   UC_CATEGORY_MASK_Z  = 0x01c00000,
 957   UC_CATEGORY_MASK_Zs = 0x00400000,
 958   UC_CATEGORY_MASK_Zl = 0x00800000,
 959   UC_CATEGORY_MASK_Zp = 0x01000000,
 960   UC_CATEGORY_MASK_C  = 0x3e000000,
 961   UC_CATEGORY_MASK_Cc = 0x02000000,
 962   UC_CATEGORY_MASK_Cf = 0x04000000,
 963   UC_CATEGORY_MASK_Cs = 0x08000000,
 964   UC_CATEGORY_MASK_Co = 0x10000000,
 965   UC_CATEGORY_MASK_Cn = 0x20000000
 966 };
 967 
 968 static int
 969 general_category_byname (const char *category_name)
     /* [previous][next][first][last][top][bottom][index][help] */
 970 {
 971   if (category_name[0] != '\0'
 972       && (category_name[1] == '\0' || category_name[2] == '\0'))
 973     switch (category_name[0])
 974       {
 975       case 'L':
 976         switch (category_name[1])
 977           {
 978           case '\0': return UC_CATEGORY_MASK_L;
 979           case 'C': return UC_CATEGORY_MASK_LC;
 980           case 'u': return UC_CATEGORY_MASK_Lu;
 981           case 'l': return UC_CATEGORY_MASK_Ll;
 982           case 't': return UC_CATEGORY_MASK_Lt;
 983           case 'm': return UC_CATEGORY_MASK_Lm;
 984           case 'o': return UC_CATEGORY_MASK_Lo;
 985           }
 986         break;
 987       case 'M':
 988         switch (category_name[1])
 989           {
 990           case '\0': return UC_CATEGORY_MASK_M;
 991           case 'n': return UC_CATEGORY_MASK_Mn;
 992           case 'c': return UC_CATEGORY_MASK_Mc;
 993           case 'e': return UC_CATEGORY_MASK_Me;
 994           }
 995         break;
 996       case 'N':
 997         switch (category_name[1])
 998           {
 999           case '\0': return UC_CATEGORY_MASK_N;
1000           case 'd': return UC_CATEGORY_MASK_Nd;
1001           case 'l': return UC_CATEGORY_MASK_Nl;
1002           case 'o': return UC_CATEGORY_MASK_No;
1003           }
1004         break;
1005       case 'P':
1006         switch (category_name[1])
1007           {
1008           case '\0': return UC_CATEGORY_MASK_P;
1009           case 'c': return UC_CATEGORY_MASK_Pc;
1010           case 'd': return UC_CATEGORY_MASK_Pd;
1011           case 's': return UC_CATEGORY_MASK_Ps;
1012           case 'e': return UC_CATEGORY_MASK_Pe;
1013           case 'i': return UC_CATEGORY_MASK_Pi;
1014           case 'f': return UC_CATEGORY_MASK_Pf;
1015           case 'o': return UC_CATEGORY_MASK_Po;
1016           }
1017         break;
1018       case 'S':
1019         switch (category_name[1])
1020           {
1021           case '\0': return UC_CATEGORY_MASK_S;
1022           case 'm': return UC_CATEGORY_MASK_Sm;
1023           case 'c': return UC_CATEGORY_MASK_Sc;
1024           case 'k': return UC_CATEGORY_MASK_Sk;
1025           case 'o': return UC_CATEGORY_MASK_So;
1026           }
1027         break;
1028       case 'Z':
1029         switch (category_name[1])
1030           {
1031           case '\0': return UC_CATEGORY_MASK_Z;
1032           case 's': return UC_CATEGORY_MASK_Zs;
1033           case 'l': return UC_CATEGORY_MASK_Zl;
1034           case 'p': return UC_CATEGORY_MASK_Zp;
1035           }
1036         break;
1037       case 'C':
1038         switch (category_name[1])
1039           {
1040           case '\0': return UC_CATEGORY_MASK_C;
1041           case 'c': return UC_CATEGORY_MASK_Cc;
1042           case 'f': return UC_CATEGORY_MASK_Cf;
1043           case 's': return UC_CATEGORY_MASK_Cs;
1044           case 'o': return UC_CATEGORY_MASK_Co;
1045           case 'n': return UC_CATEGORY_MASK_Cn;
1046           }
1047         break;
1048       }
1049   /* Invalid category name.  */
1050   abort ();
1051 }
1052 
/* Construction of sparse 3-level tables.
   The macros below are set up before including the template header.
   output_category relies on this instantiation for struct category_table
   and for category_table_init, category_table_add and
   category_table_finalize (presumably all derived from TABLE by "3level.h"
   -- confirm against that header).
   ELEMENT is the type of the stored values; output_category stores the bit
   index (log2) of a UC_CATEGORY_MASK_* value.  DEFAULT is presumably the
   value reported for code points that were never added -- confirm in
   "3level.h".  xmalloc and xrealloc are mapped to plain malloc and
   realloc.  */
#define TABLE category_table
#define ELEMENT uint8_t
#define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
1060 
1061 /* Output the per-character category table.  */
1062 static void
1063 output_category (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
1064 {
1065   FILE *stream;
1066   unsigned int ch, i;
1067   struct category_table t;
1068   unsigned int level1_offset, level2_offset, level3_offset;
1069   uint16_t *level3_packed;
1070 
1071   stream = fopen (filename, "w");
1072   if (stream == NULL)
1073     {
1074       fprintf (stderr, "cannot open '%s' for writing\n", filename);
1075       exit (1);
1076     }
1077 
1078   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1079   fprintf (stream, "/* Categories of Unicode characters.  */\n");
1080   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
1081            version);
1082   fprintf (stream, "\n");
1083 
1084   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
1085   fprintf (stream, "\n");
1086   output_library_license (stream, true);
1087   fprintf (stream, "\n");
1088 
1089   t.p = 7;
1090   t.q = 9;
1091   category_table_init (&t);
1092 
1093   for (ch = 0; ch < 0x110000; ch++)
1094     {
1095       int value;
1096       unsigned int log2_value;
1097 
1098       if (is_category_Cs (ch))
1099         value = UC_CATEGORY_MASK_Cs;
1100       else if (unicode_attributes[ch].name != NULL)
1101         value = general_category_byname (unicode_attributes[ch].category);
1102       else
1103         continue;
1104 
1105       /* Now value should contain exactly one bit.  */
1106       assert (value != 0 && (value & (value - 1)) == 0);
1107 
1108       for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1109 
1110       assert (log2_value <= 0x1f);
1111 
1112       category_table_add (&t, ch, log2_value);
1113     }
1114 
1115   category_table_finalize (&t);
1116 
1117   /* Offsets in t.result, in memory of this process.  */
1118   level1_offset =
1119     5 * sizeof (uint32_t);
1120   level2_offset =
1121     5 * sizeof (uint32_t)
1122     + t.level1_size * sizeof (uint32_t);
1123   level3_offset =
1124     5 * sizeof (uint32_t)
1125     + t.level1_size * sizeof (uint32_t)
1126     + (t.level2_size << t.q) * sizeof (uint32_t);
1127 
1128   for (i = 0; i < 5; i++)
1129     fprintf (stream, "#define category_header_%d %d\n", i,
1130              ((uint32_t *) t.result)[i]);
1131   fprintf (stream, "static const\n");
1132   fprintf (stream, "struct\n");
1133   fprintf (stream, "  {\n");
1134   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
1135   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
1136   fprintf (stream, "    unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1137            (1 << t.p) * 5 / 16);
1138   fprintf (stream, "  }\n");
1139   fprintf (stream, "u_category =\n");
1140   fprintf (stream, "{\n");
1141   fprintf (stream, "  {");
1142   if (t.level1_size > 8)
1143     fprintf (stream, "\n   ");
1144   for (i = 0; i < t.level1_size; i++)
1145     {
1146       uint32_t offset;
1147       if (i > 0 && (i % 8) == 0)
1148         fprintf (stream, "\n   ");
1149       offset = ((uint32_t *) (t.result + level1_offset))[i];
1150       if (offset == 0)
1151         fprintf (stream, " %5d", -1);
1152       else
1153         fprintf (stream, " %5zu",
1154                  (offset - level2_offset) / sizeof (uint32_t));
1155       if (i+1 < t.level1_size)
1156         fprintf (stream, ",");
1157     }
1158   if (t.level1_size > 8)
1159     fprintf (stream, "\n ");
1160   fprintf (stream, " },\n");
1161   fprintf (stream, "  {");
1162   if (t.level2_size << t.q > 8)
1163     fprintf (stream, "\n   ");
1164   for (i = 0; i < t.level2_size << t.q; i++)
1165     {
1166       uint32_t offset;
1167       if (i > 0 && (i % 8) == 0)
1168         fprintf (stream, "\n   ");
1169       offset = ((uint32_t *) (t.result + level2_offset))[i];
1170       if (offset == 0)
1171         fprintf (stream, " %5d", -1);
1172       else
1173         fprintf (stream, " %5zu",
1174                  (offset - level3_offset) / sizeof (uint8_t));
1175       if (i+1 < t.level2_size << t.q)
1176         fprintf (stream, ",");
1177     }
1178   if (t.level2_size << t.q > 8)
1179     fprintf (stream, "\n ");
1180   fprintf (stream, " },\n");
1181   /* Pack the level3 array.  Each entry needs 5 bits only.  Use 16-bit units,
1182      not 32-bit units, in order to make the lookup function easier.  */
1183   level3_packed =
1184     (uint16_t *)
1185     calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1186   for (i = 0; i < t.level3_size << t.p; i++)
1187     {
1188       unsigned int j = (i * 5) / 16;
1189       unsigned int k = (i * 5) % 16;
1190       uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1191       value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1192       level3_packed[j] = value & 0xffff;
1193       level3_packed[j+1] = value >> 16;
1194     }
1195   fprintf (stream, "  {");
1196   if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1197     fprintf (stream, "\n   ");
1198   for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1199     {
1200       if (i > 0 && (i % 8) == 0)
1201         fprintf (stream, "\n   ");
1202       fprintf (stream, " 0x%04x", level3_packed[i]);
1203       if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1204         fprintf (stream, ",");
1205     }
1206   if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1207     fprintf (stream, "\n ");
1208   fprintf (stream, " }\n");
1209   free (level3_packed);
1210   fprintf (stream, "};\n");
1211 
1212   if (ferror (stream) || fclose (stream))
1213     {
1214       fprintf (stderr, "error writing to '%s'\n", filename);
1215       exit (1);
1216     }
1217 }
1218 
1219 /* ========================================================================= */
1220 
1221 /* Canonical combining class.  */
1222 /* See Unicode 3.0 book, section 4.2,
1223        UCD.html.  */
1224 
/* Construction of sparse 3-level tables.
   The macros below are set up before including the template header.
   output_combclass relies on this instantiation for struct combclass_table
   and for combclass_table_init, combclass_table_add and
   combclass_table_finalize (presumably all derived from TABLE by "3level.h"
   -- confirm against that header).
   ELEMENT is the type of the stored values; output_combclass stores
   combining class values in the range 0..255.  DEFAULT is presumably the
   value reported for code points that were never added -- confirm in
   "3level.h".  xmalloc and xrealloc are mapped to plain malloc and
   realloc.  */
#define TABLE combclass_table
#define ELEMENT uint8_t
#define DEFAULT 0
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"
1232 
1233 /* Output the per-character combining class table.  */
1234 static void
1235 output_combclass (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
1236 {
1237   FILE *stream;
1238   unsigned int ch, i;
1239   struct combclass_table t;
1240   unsigned int level1_offset, level2_offset, level3_offset;
1241 
1242   stream = fopen (filename, "w");
1243   if (stream == NULL)
1244     {
1245       fprintf (stderr, "cannot open '%s' for writing\n", filename);
1246       exit (1);
1247     }
1248 
1249   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1250   fprintf (stream, "/* Combining class of Unicode characters.  */\n");
1251   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
1252            version);
1253   fprintf (stream, "\n");
1254 
1255   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
1256   fprintf (stream, "\n");
1257   output_library_license (stream, true);
1258   fprintf (stream, "\n");
1259 
1260   t.p = 7;
1261   t.q = 9;
1262   combclass_table_init (&t);
1263 
1264   for (ch = 0; ch < 0x110000; ch++)
1265     if (unicode_attributes[ch].name != NULL)
1266       {
1267         int value = atoi (unicode_attributes[ch].combining);
1268         assert (value >= 0 && value <= 255);
1269         combclass_table_add (&t, ch, value);
1270       }
1271 
1272   combclass_table_finalize (&t);
1273 
1274   /* Offsets in t.result, in memory of this process.  */
1275   level1_offset =
1276     5 * sizeof (uint32_t);
1277   level2_offset =
1278     5 * sizeof (uint32_t)
1279     + t.level1_size * sizeof (uint32_t);
1280   level3_offset =
1281     5 * sizeof (uint32_t)
1282     + t.level1_size * sizeof (uint32_t)
1283     + (t.level2_size << t.q) * sizeof (uint32_t);
1284 
1285   for (i = 0; i < 5; i++)
1286     fprintf (stream, "#define combclass_header_%d %d\n", i,
1287              ((uint32_t *) t.result)[i]);
1288   fprintf (stream, "static const\n");
1289   fprintf (stream, "struct\n");
1290   fprintf (stream, "  {\n");
1291   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
1292   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
1293   fprintf (stream, "    unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1294   fprintf (stream, "  }\n");
1295   fprintf (stream, "u_combclass =\n");
1296   fprintf (stream, "{\n");
1297   fprintf (stream, "  {");
1298   if (t.level1_size > 8)
1299     fprintf (stream, "\n   ");
1300   for (i = 0; i < t.level1_size; i++)
1301     {
1302       uint32_t offset;
1303       if (i > 0 && (i % 8) == 0)
1304         fprintf (stream, "\n   ");
1305       offset = ((uint32_t *) (t.result + level1_offset))[i];
1306       if (offset == 0)
1307         fprintf (stream, " %5d", -1);
1308       else
1309         fprintf (stream, " %5zu",
1310                  (offset - level2_offset) / sizeof (uint32_t));
1311       if (i+1 < t.level1_size)
1312         fprintf (stream, ",");
1313     }
1314   if (t.level1_size > 8)
1315     fprintf (stream, "\n ");
1316   fprintf (stream, " },\n");
1317   fprintf (stream, "  {");
1318   if (t.level2_size << t.q > 8)
1319     fprintf (stream, "\n   ");
1320   for (i = 0; i < t.level2_size << t.q; i++)
1321     {
1322       uint32_t offset;
1323       if (i > 0 && (i % 8) == 0)
1324         fprintf (stream, "\n   ");
1325       offset = ((uint32_t *) (t.result + level2_offset))[i];
1326       if (offset == 0)
1327         fprintf (stream, " %5d", -1);
1328       else
1329         fprintf (stream, " %5zu",
1330                  (offset - level3_offset) / sizeof (uint8_t));
1331       if (i+1 < t.level2_size << t.q)
1332         fprintf (stream, ",");
1333     }
1334   if (t.level2_size << t.q > 8)
1335     fprintf (stream, "\n ");
1336   fprintf (stream, " },\n");
1337   fprintf (stream, "  {");
1338   if (t.level3_size << t.p > 8)
1339     fprintf (stream, "\n   ");
1340   for (i = 0; i < t.level3_size << t.p; i++)
1341     {
1342       if (i > 0 && (i % 8) == 0)
1343         fprintf (stream, "\n   ");
1344       fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1345       if (i+1 < t.level3_size << t.p)
1346         fprintf (stream, ",");
1347     }
1348   if (t.level3_size << t.p > 8)
1349     fprintf (stream, "\n ");
1350   fprintf (stream, " }\n");
1351   fprintf (stream, "};\n");
1352 
1353   if (ferror (stream) || fclose (stream))
1354     {
1355       fprintf (stderr, "error writing to '%s'\n", filename);
1356       exit (1);
1357     }
1358 }
1359 
1360 /* ========================================================================= */
1361 
1362 /* Bidirectional category.  */
1363 /* See Unicode 3.0 book, section 4.3,
1364        UCD.html.  */
1365 
/* Numeric codes for the bidirectional categories.  The values are
   consecutive, starting at 0.  */
enum
{
  UC_BIDI_L   = 0,  /* Left-to-Right */
  UC_BIDI_LRE = 1,  /* Left-to-Right Embedding */
  UC_BIDI_LRO = 2,  /* Left-to-Right Override */
  UC_BIDI_R   = 3,  /* Right-to-Left */
  UC_BIDI_AL  = 4,  /* Right-to-Left Arabic */
  UC_BIDI_RLE = 5,  /* Right-to-Left Embedding */
  UC_BIDI_RLO = 6,  /* Right-to-Left Override */
  UC_BIDI_PDF = 7,  /* Pop Directional Format */
  UC_BIDI_EN  = 8,  /* European Number */
  UC_BIDI_ES  = 9,  /* European Number Separator */
  UC_BIDI_ET  = 10, /* European Number Terminator */
  UC_BIDI_AN  = 11, /* Arabic Number */
  UC_BIDI_CS  = 12, /* Common Number Separator */
  UC_BIDI_NSM = 13, /* Non-Spacing Mark */
  UC_BIDI_BN  = 14, /* Boundary Neutral */
  UC_BIDI_B   = 15, /* Paragraph Separator */
  UC_BIDI_S   = 16, /* Segment Separator */
  UC_BIDI_WS  = 17, /* Whitespace */
  UC_BIDI_ON  = 18, /* Other Neutral */
  UC_BIDI_LRI = 19, /* Left-to-Right Isolate */
  UC_BIDI_RLI = 20, /* Right-to-Left Isolate */
  UC_BIDI_FSI = 21, /* First Strong Isolate */
  UC_BIDI_PDI = 22  /* Pop Directional Isolate */
};
1392 
1393 static int
1394 bidi_category_byname (const char *category_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1395 {
1396   switch (category_name[0])
1397     {
1398     case 'A':
1399       switch (category_name[1])
1400         {
1401         case 'L':
1402           if (category_name[2] == '\0')
1403             return UC_BIDI_AL;
1404           break;
1405         case 'N':
1406           if (category_name[2] == '\0')
1407             return UC_BIDI_AN;
1408           break;
1409         }
1410       break;
1411     case 'B':
1412       switch (category_name[1])
1413         {
1414         case '\0':
1415           return UC_BIDI_B;
1416         case 'N':
1417           if (category_name[2] == '\0')
1418             return UC_BIDI_BN;
1419           break;
1420         }
1421       break;
1422     case 'C':
1423       switch (category_name[1])
1424         {
1425         case 'S':
1426           if (category_name[2] == '\0')
1427             return UC_BIDI_CS;
1428           break;
1429         }
1430       break;
1431     case 'E':
1432       switch (category_name[1])
1433         {
1434         case 'N':
1435           if (category_name[2] == '\0')
1436             return UC_BIDI_EN;
1437           break;
1438         case 'S':
1439           if (category_name[2] == '\0')
1440             return UC_BIDI_ES;
1441           break;
1442         case 'T':
1443           if (category_name[2] == '\0')
1444             return UC_BIDI_ET;
1445           break;
1446         }
1447       break;
1448     case 'F':
1449       switch (category_name[1])
1450         {
1451         case 'S':
1452           switch (category_name[2])
1453             {
1454             case 'I':
1455               if (category_name[3] == '\0')
1456                 return UC_BIDI_FSI;
1457               break;
1458             }
1459         }
1460       break;
1461    case 'L':
1462       switch (category_name[1])
1463         {
1464         case '\0':
1465           return UC_BIDI_L;
1466         case 'R':
1467           switch (category_name[2])
1468             {
1469             case 'E':
1470               if (category_name[3] == '\0')
1471                 return UC_BIDI_LRE;
1472               break;
1473             case 'O':
1474               if (category_name[3] == '\0')
1475                 return UC_BIDI_LRO;
1476               break;
1477             case 'I':
1478               if (category_name[3] == '\0')
1479                 return UC_BIDI_LRI;
1480               break;
1481            }
1482           break;
1483         }
1484       break;
1485     case 'N':
1486       switch (category_name[1])
1487         {
1488         case 'S':
1489           switch (category_name[2])
1490             {
1491             case 'M':
1492               if (category_name[3] == '\0')
1493                 return UC_BIDI_NSM;
1494               break;
1495             }
1496           break;
1497         }
1498       break;
1499     case 'O':
1500       switch (category_name[1])
1501         {
1502         case 'N':
1503           if (category_name[2] == '\0')
1504             return UC_BIDI_ON;
1505           break;
1506         }
1507       break;
1508     case 'P':
1509       switch (category_name[1])
1510         {
1511         case 'D':
1512           switch (category_name[2])
1513             {
1514             case 'F':
1515               if (category_name[3] == '\0')
1516                 return UC_BIDI_PDF;
1517               break;
1518             case 'I':
1519               if (category_name[3] == '\0')
1520                 return UC_BIDI_PDI;
1521               break;
1522             }
1523           break;
1524         }
1525       break;
1526     case 'R':
1527       switch (category_name[1])
1528         {
1529         case '\0':
1530           return UC_BIDI_R;
1531         case 'L':
1532           switch (category_name[2])
1533             {
1534             case 'E':
1535               if (category_name[3] == '\0')
1536                 return UC_BIDI_RLE;
1537               break;
1538             case 'O':
1539               if (category_name[3] == '\0')
1540                 return UC_BIDI_RLO;
1541               break;
1542             case 'I':
1543               if (category_name[3] == '\0')
1544                 return UC_BIDI_RLI;
1545               break;
1546            }
1547           break;
1548         }
1549       break;
1550     case 'S':
1551       if (category_name[1] == '\0')
1552         return UC_BIDI_S;
1553       break;
1554     case 'W':
1555       switch (category_name[1])
1556         {
1557         case 'S':
1558           if (category_name[2] == '\0')
1559             return UC_BIDI_WS;
1560           break;
1561         }
1562       break;
1563     }
1564   /* Invalid bidi category name.  */
1565   abort ();
1566 }
1567 
1568 static int
1569 get_bidi_category (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
1570 {
1571   if (unicode_attributes[ch].name != NULL)
1572     return bidi_category_byname (unicode_attributes[ch].bidi);
1573   else
1574     {
1575       /* The bidi category of unassigned characters depends on the range.
1576          See UTR #9 and DerivedBidiClass.txt.  */
1577       if ((ch >= 0x0590 && ch <= 0x05FF)
1578           || (ch >= 0x07FB && ch <= 0x08FF)
1579           || (ch >= 0xFB37 && ch <= 0xFB45)
1580           || (ch >= 0x10800 && ch <= 0x10FFF))
1581         return UC_BIDI_R;
1582       else if ((ch >= 0x0600 && ch <= 0x07BF)
1583                || (ch >= 0x2064 && ch <= 0x2069)
1584                || (ch >= 0xFBB2 && ch <= 0xFDCF)
1585                || (ch >= 0xFDFE && ch <= 0xFEFE))
1586         return UC_BIDI_AL;
1587       else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1588                || (ch >= 0xFFF0 && ch <= 0xFFFF)
1589                || (ch & 0xFFFF) == 0xFFFE
1590                || (ch & 0xFFFF) == 0xFFFF
1591                || (ch >= 0xE0000 && ch <= 0xE0FFF))
1592         return UC_BIDI_BN;
1593       else
1594         return UC_BIDI_L;
1595     }
1596 }
1597 
1598 /* Construction of sparse 3-level tables.
        Sets the parameters read by "3level.h" and includes it: the table is
        named bidi_category_table, its elements are uint8_t, and DEFAULT is
        UC_BIDI_L (presumably the value of entries that were never added;
        confirm against 3level.h).  xmalloc and xrealloc are mapped to plain
        malloc and realloc for the included code.  The function below relies on
        the resulting struct bidi_category_table and on
        bidi_category_table_init, _add and _finalize.  */
1599 #define TABLE bidi_category_table
1600 #define ELEMENT uint8_t
1601 #define DEFAULT UC_BIDI_L
1602 #define xmalloc malloc
1603 #define xrealloc realloc
1604 #include "3level.h"
1605 
1606 /* Output the per-character bidi category table.  */
1607 static void
1608 output_bidi_category (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
1609 {
1610   FILE *stream;
1611   unsigned int ch, i;
1612   struct bidi_category_table t;
1613   unsigned int level1_offset, level2_offset, level3_offset;
1614   uint16_t *level3_packed;
1615 
1616   stream = fopen (filename, "w");
1617   if (stream == NULL)
1618     {
1619       fprintf (stderr, "cannot open '%s' for writing\n", filename);
1620       exit (1);
1621     }
1622 
1623   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1624   fprintf (stream, "/* Bidi categories of Unicode characters.  */\n");
1625   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
1626            version);
1627   fprintf (stream, "\n");
1628 
1629   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
1630   fprintf (stream, "\n");
1631   output_library_license (stream, true);
1632   fprintf (stream, "\n");
1633 
1634   t.p = 7;
1635   t.q = 9;
1636   bidi_category_table_init (&t);
1637 
1638   for (ch = 0; ch < 0x110000; ch++)
1639     {
1640       int value = get_bidi_category (ch);
1641 
1642       assert (value <= 0x1f);
1643 
1644       bidi_category_table_add (&t, ch, value);
1645     }
1646 
1647   bidi_category_table_finalize (&t);
1648 
1649   /* Offsets in t.result, in memory of this process.  */
1650   level1_offset =
1651     5 * sizeof (uint32_t);
1652   level2_offset =
1653     5 * sizeof (uint32_t)
1654     + t.level1_size * sizeof (uint32_t);
1655   level3_offset =
1656     5 * sizeof (uint32_t)
1657     + t.level1_size * sizeof (uint32_t)
1658     + (t.level2_size << t.q) * sizeof (uint32_t);
1659 
1660   for (i = 0; i < 5; i++)
1661     fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1662              ((uint32_t *) t.result)[i]);
1663   fprintf (stream, "static const\n");
1664   fprintf (stream, "struct\n");
1665   fprintf (stream, "  {\n");
1666   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
1667   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
1668   fprintf (stream, "    unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1669            (1 << t.p) * 5 / 16);
1670   fprintf (stream, "  }\n");
1671   fprintf (stream, "u_bidi_category =\n");
1672   fprintf (stream, "{\n");
1673   fprintf (stream, "  {");
1674   if (t.level1_size > 8)
1675     fprintf (stream, "\n   ");
1676   for (i = 0; i < t.level1_size; i++)
1677     {
1678       uint32_t offset;
1679       if (i > 0 && (i % 8) == 0)
1680         fprintf (stream, "\n   ");
1681       offset = ((uint32_t *) (t.result + level1_offset))[i];
1682       if (offset == 0)
1683         fprintf (stream, " %5d", -1);
1684       else
1685         fprintf (stream, " %5zu",
1686                  (offset - level2_offset) / sizeof (uint32_t));
1687       if (i+1 < t.level1_size)
1688         fprintf (stream, ",");
1689     }
1690   if (t.level1_size > 8)
1691     fprintf (stream, "\n ");
1692   fprintf (stream, " },\n");
1693   fprintf (stream, "  {");
1694   if (t.level2_size << t.q > 8)
1695     fprintf (stream, "\n   ");
1696   for (i = 0; i < t.level2_size << t.q; i++)
1697     {
1698       uint32_t offset;
1699       if (i > 0 && (i % 8) == 0)
1700         fprintf (stream, "\n   ");
1701       offset = ((uint32_t *) (t.result + level2_offset))[i];
1702       if (offset == 0)
1703         fprintf (stream, " %5d", -1);
1704       else
1705         fprintf (stream, " %5zu",
1706                  (offset - level3_offset) / sizeof (uint8_t));
1707       if (i+1 < t.level2_size << t.q)
1708         fprintf (stream, ",");
1709     }
1710   if (t.level2_size << t.q > 8)
1711     fprintf (stream, "\n ");
1712   fprintf (stream, " },\n");
1713   /* Pack the level3 array.  Each entry needs 5 bits only.  Use 16-bit units,
1714      not 32-bit units, in order to make the lookup function easier.  */
1715   level3_packed =
1716     (uint16_t *)
1717     calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1718   for (i = 0; i < t.level3_size << t.p; i++)
1719     {
1720       unsigned int j = (i * 5) / 16;
1721       unsigned int k = (i * 5) % 16;
1722       uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1723       value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1724       level3_packed[j] = value & 0xffff;
1725       level3_packed[j+1] = value >> 16;
1726     }
1727   fprintf (stream, "  {");
1728   if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1729     fprintf (stream, "\n   ");
1730   for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1731     {
1732       if (i > 0 && (i % 8) == 0)
1733         fprintf (stream, "\n   ");
1734       fprintf (stream, " 0x%04x", level3_packed[i]);
1735       if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1736         fprintf (stream, ",");
1737     }
1738   if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1739     fprintf (stream, "\n ");
1740   fprintf (stream, " }\n");
1741   free (level3_packed);
1742   fprintf (stream, "};\n");
1743 
1744   if (ferror (stream) || fclose (stream))
1745     {
1746       fprintf (stderr, "error writing to '%s'\n", filename);
1747       exit (1);
1748     }
1749 }
1750 
1751 /* ========================================================================= */
1752 
1753 /* Decimal digit value.  */
1754 /* See Unicode 3.0 book, section 4.6.  */
1755 
1756 static int
1757 get_decdigit_value (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
1758 {
1759   if (unicode_attributes[ch].name != NULL
1760       && unicode_attributes[ch].decdigit[0] != '\0')
1761     return atoi (unicode_attributes[ch].decdigit);
1762   return -1;
1763 }
1764 
1765 /* Construction of sparse 3-level tables.
        Sets the parameters read by "3level.h" and includes it: the table is
        named decdigit_table, its elements are uint8_t, and DEFAULT is 0
        (presumably the value of entries that were never added; confirm against
        3level.h).  xmalloc and xrealloc are mapped to plain malloc and realloc
        for the included code.  Both output_decimal_digit and output_digit use
        the resulting struct decdigit_table and decdigit_table_init, _add and
        _finalize.  */
1766 #define TABLE decdigit_table
1767 #define ELEMENT uint8_t
1768 #define DEFAULT 0
1769 #define xmalloc malloc
1770 #define xrealloc realloc
1771 #include "3level.h"
1772 
/* Output the unit test for the per-character decimal digit value table.
   Creates FILENAME and writes a generated-file banner that mentions VERSION,
   the tests license, and then one "{ code point, value }" initializer line
   for every code point that has a decimal digit value, separated by commas.
   Terminates the program with status 1 if the file cannot be opened or
   written.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  FILE *out;
  unsigned int emitted = 0;   /* number of entries written so far */
  unsigned int ch;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Decimal digit values of Unicode characters.  */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
           version);
  fputs ("\n", out);

  fputs ("/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_tests_license (out);
  fputs ("\n", out);

  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_decdigit_value (ch);

      assert (value >= -1 && value < 10);

      /* -1 means "no decimal digit value": nothing to emit.  */
      if (value < 0)
        continue;

      /* The separator goes before every entry except the first, so that the
         last entry is not followed by a comma.  */
      if (emitted > 0)
        fprintf (out, ",\n");
      fprintf (out, "    { 0x%04X, %d }", ch, value);
      emitted++;
    }
  if (emitted > 0)
    fprintf (out, "\n");

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1823 
1824 /* Output the per-character decimal digit value table.  */
1825 static void
1826 output_decimal_digit (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
1827 {
1828   FILE *stream;
1829   unsigned int ch, i;
1830   struct decdigit_table t;
1831   unsigned int level1_offset, level2_offset, level3_offset;
1832 
1833   stream = fopen (filename, "w");
1834   if (stream == NULL)
1835     {
1836       fprintf (stderr, "cannot open '%s' for writing\n", filename);
1837       exit (1);
1838     }
1839 
1840   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1841   fprintf (stream, "/* Decimal digit values of Unicode characters.  */\n");
1842   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
1843            version);
1844   fprintf (stream, "\n");
1845 
1846   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
1847   fprintf (stream, "\n");
1848   output_library_license (stream, false);
1849   fprintf (stream, "\n");
1850 
1851   t.p = 7;
1852   t.q = 9;
1853   decdigit_table_init (&t);
1854 
1855   for (ch = 0; ch < 0x110000; ch++)
1856     {
1857       int value = 1 + get_decdigit_value (ch);
1858 
1859       assert (value >= 0 && value <= 10);
1860 
1861       decdigit_table_add (&t, ch, value);
1862     }
1863 
1864   decdigit_table_finalize (&t);
1865 
1866   /* Offsets in t.result, in memory of this process.  */
1867   level1_offset =
1868     5 * sizeof (uint32_t);
1869   level2_offset =
1870     5 * sizeof (uint32_t)
1871     + t.level1_size * sizeof (uint32_t);
1872   level3_offset =
1873     5 * sizeof (uint32_t)
1874     + t.level1_size * sizeof (uint32_t)
1875     + (t.level2_size << t.q) * sizeof (uint32_t);
1876 
1877   for (i = 0; i < 5; i++)
1878     fprintf (stream, "#define decdigit_header_%d %d\n", i,
1879              ((uint32_t *) t.result)[i]);
1880   fprintf (stream, "static const\n");
1881   fprintf (stream, "struct\n");
1882   fprintf (stream, "  {\n");
1883   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
1884   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
1885   fprintf (stream, "    unsigned char level3[%zu << %d];\n", t.level3_size,
1886            t.p - 1);
1887   fprintf (stream, "  }\n");
1888   fprintf (stream, "u_decdigit =\n");
1889   fprintf (stream, "{\n");
1890   fprintf (stream, "  {");
1891   if (t.level1_size > 8)
1892     fprintf (stream, "\n   ");
1893   for (i = 0; i < t.level1_size; i++)
1894     {
1895       uint32_t offset;
1896       if (i > 0 && (i % 8) == 0)
1897         fprintf (stream, "\n   ");
1898       offset = ((uint32_t *) (t.result + level1_offset))[i];
1899       if (offset == 0)
1900         fprintf (stream, " %5d", -1);
1901       else
1902         fprintf (stream, " %5zu",
1903                  (offset - level2_offset) / sizeof (uint32_t));
1904       if (i+1 < t.level1_size)
1905         fprintf (stream, ",");
1906     }
1907   if (t.level1_size > 8)
1908     fprintf (stream, "\n ");
1909   fprintf (stream, " },\n");
1910   fprintf (stream, "  {");
1911   if (t.level2_size << t.q > 8)
1912     fprintf (stream, "\n   ");
1913   for (i = 0; i < t.level2_size << t.q; i++)
1914     {
1915       uint32_t offset;
1916       if (i > 0 && (i % 8) == 0)
1917         fprintf (stream, "\n   ");
1918       offset = ((uint32_t *) (t.result + level2_offset))[i];
1919       if (offset == 0)
1920         fprintf (stream, " %5d", -1);
1921       else
1922         fprintf (stream, " %5zu",
1923                  (offset - level3_offset) / sizeof (uint8_t));
1924       if (i+1 < t.level2_size << t.q)
1925         fprintf (stream, ",");
1926     }
1927   if (t.level2_size << t.q > 8)
1928     fprintf (stream, "\n ");
1929   fprintf (stream, " },\n");
1930   /* Pack the level3 array.  Each entry needs 4 bits only.  */
1931   fprintf (stream, "  {");
1932   if (t.level3_size << (t.p - 1) > 8)
1933     fprintf (stream, "\n   ");
1934   for (i = 0; i < t.level3_size << (t.p - 1); i++)
1935     {
1936       if (i > 0 && (i % 8) == 0)
1937         fprintf (stream, "\n   ");
1938       fprintf (stream, " 0x%02x",
1939                ((uint8_t *) (t.result + level3_offset))[2*i]
1940                + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1941       if (i+1 < t.level3_size << (t.p - 1))
1942         fprintf (stream, ",");
1943     }
1944   if (t.level3_size << (t.p - 1) > 8)
1945     fprintf (stream, "\n ");
1946   fprintf (stream, " }\n");
1947   fprintf (stream, "};\n");
1948 
1949   if (ferror (stream) || fclose (stream))
1950     {
1951       fprintf (stderr, "error writing to '%s'\n", filename);
1952       exit (1);
1953     }
1954 }
1955 
1956 /* ========================================================================= */
1957 
1958 /* Digit value.  */
1959 /* See Unicode 3.0 book, section 4.6.  */
1960 
1961 static int
1962 get_digit_value (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
1963 {
1964   if (unicode_attributes[ch].name != NULL
1965       && unicode_attributes[ch].digit[0] != '\0')
1966     return atoi (unicode_attributes[ch].digit);
1967   return -1;
1968 }
1969 
/* Output the unit test for the per-character digit value table.
   Creates FILENAME and writes a generated-file banner that mentions VERSION,
   the tests license, and then one "{ code point, value }" initializer line
   for every code point that has a digit value, separated by commas.
   Terminates the program with status 1 if the file cannot be opened or
   written.  */
static void
output_digit_test (const char *filename, const char *version)
{
  FILE *out;
  unsigned int emitted = 0;   /* number of entries written so far */
  unsigned int ch;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Digit values of Unicode characters.  */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
           version);
  fputs ("\n", out);

  fputs ("/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_tests_license (out);
  fputs ("\n", out);

  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_digit_value (ch);

      assert (value >= -1 && value < 10);

      /* -1 means "no digit value": nothing to emit.  */
      if (value < 0)
        continue;

      /* The separator goes before every entry except the first, so that the
         last entry is not followed by a comma.  */
      if (emitted > 0)
        fprintf (out, ",\n");
      fprintf (out, "    { 0x%04X, %d }", ch, value);
      emitted++;
    }
  if (emitted > 0)
    fprintf (out, "\n");

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
2020 
2021 /* Output the per-character digit value table.  */
2022 static void
2023 output_digit (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
2024 {
2025   FILE *stream;
2026   unsigned int ch, i;
2027   struct decdigit_table t;
2028   unsigned int level1_offset, level2_offset, level3_offset;
2029 
2030   stream = fopen (filename, "w");
2031   if (stream == NULL)
2032     {
2033       fprintf (stderr, "cannot open '%s' for writing\n", filename);
2034       exit (1);
2035     }
2036 
2037   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2038   fprintf (stream, "/* Digit values of Unicode characters.  */\n");
2039   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
2040            version);
2041   fprintf (stream, "\n");
2042 
2043   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
2044   fprintf (stream, "\n");
2045   output_library_license (stream, false);
2046   fprintf (stream, "\n");
2047 
2048   t.p = 7;
2049   t.q = 9;
2050   decdigit_table_init (&t);
2051 
2052   for (ch = 0; ch < 0x110000; ch++)
2053     {
2054       int value = 1 + get_digit_value (ch);
2055 
2056       assert (value >= 0 && value <= 10);
2057 
2058       decdigit_table_add (&t, ch, value);
2059     }
2060 
2061   decdigit_table_finalize (&t);
2062 
2063   /* Offsets in t.result, in memory of this process.  */
2064   level1_offset =
2065     5 * sizeof (uint32_t);
2066   level2_offset =
2067     5 * sizeof (uint32_t)
2068     + t.level1_size * sizeof (uint32_t);
2069   level3_offset =
2070     5 * sizeof (uint32_t)
2071     + t.level1_size * sizeof (uint32_t)
2072     + (t.level2_size << t.q) * sizeof (uint32_t);
2073 
2074   for (i = 0; i < 5; i++)
2075     fprintf (stream, "#define digit_header_%d %d\n", i,
2076              ((uint32_t *) t.result)[i]);
2077   fprintf (stream, "static const\n");
2078   fprintf (stream, "struct\n");
2079   fprintf (stream, "  {\n");
2080   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
2081   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
2082   fprintf (stream, "    unsigned char level3[%zu << %d];\n", t.level3_size,
2083            t.p - 1);
2084   fprintf (stream, "  }\n");
2085   fprintf (stream, "u_digit =\n");
2086   fprintf (stream, "{\n");
2087   fprintf (stream, "  {");
2088   if (t.level1_size > 8)
2089     fprintf (stream, "\n   ");
2090   for (i = 0; i < t.level1_size; i++)
2091     {
2092       uint32_t offset;
2093       if (i > 0 && (i % 8) == 0)
2094         fprintf (stream, "\n   ");
2095       offset = ((uint32_t *) (t.result + level1_offset))[i];
2096       if (offset == 0)
2097         fprintf (stream, " %5d", -1);
2098       else
2099         fprintf (stream, " %5zu",
2100                  (offset - level2_offset) / sizeof (uint32_t));
2101       if (i+1 < t.level1_size)
2102         fprintf (stream, ",");
2103     }
2104   if (t.level1_size > 8)
2105     fprintf (stream, "\n ");
2106   fprintf (stream, " },\n");
2107   fprintf (stream, "  {");
2108   if (t.level2_size << t.q > 8)
2109     fprintf (stream, "\n   ");
2110   for (i = 0; i < t.level2_size << t.q; i++)
2111     {
2112       uint32_t offset;
2113       if (i > 0 && (i % 8) == 0)
2114         fprintf (stream, "\n   ");
2115       offset = ((uint32_t *) (t.result + level2_offset))[i];
2116       if (offset == 0)
2117         fprintf (stream, " %5d", -1);
2118       else
2119         fprintf (stream, " %5zu",
2120                  (offset - level3_offset) / sizeof (uint8_t));
2121       if (i+1 < t.level2_size << t.q)
2122         fprintf (stream, ",");
2123     }
2124   if (t.level2_size << t.q > 8)
2125     fprintf (stream, "\n ");
2126   fprintf (stream, " },\n");
2127   /* Pack the level3 array.  Each entry needs 4 bits only.  */
2128   fprintf (stream, "  {");
2129   if (t.level3_size << (t.p - 1) > 8)
2130     fprintf (stream, "\n   ");
2131   for (i = 0; i < t.level3_size << (t.p - 1); i++)
2132     {
2133       if (i > 0 && (i % 8) == 0)
2134         fprintf (stream, "\n   ");
2135       fprintf (stream, " 0x%02x",
2136                ((uint8_t *) (t.result + level3_offset))[2*i]
2137                + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
2138       if (i+1 < t.level3_size << (t.p - 1))
2139         fprintf (stream, ",");
2140     }
2141   if (t.level3_size << (t.p - 1) > 8)
2142     fprintf (stream, "\n ");
2143   fprintf (stream, " }\n");
2144   fprintf (stream, "};\n");
2145 
2146   if (ferror (stream) || fclose (stream))
2147     {
2148       fprintf (stderr, "error writing to '%s'\n", filename);
2149       exit (1);
2150     }
2151 }
2152 
2153 /* ========================================================================= */
2154 
2155 /* Numeric value.  */
2156 /* See Unicode 3.0 book, section 4.6.  */
2157 
2158 typedef struct { int numerator; int denominator; } uc_fraction_t;
2159 
2160 static uc_fraction_t
2161 get_numeric_value (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
2162 {
2163   uc_fraction_t value;
2164 
2165   if (unicode_attributes[ch].name != NULL
2166       && unicode_attributes[ch].numeric[0] != '\0')
2167     {
2168       const char *str = unicode_attributes[ch].numeric;
2169       /* str is of the form "integer" or "integer/posinteger".  */
2170       value.numerator = atoi (str);
2171       if (strchr (str, '/') != NULL)
2172         value.denominator = atoi (strchr (str, '/') + 1);
2173       else
2174         value.denominator = 1;
2175     }
2176   else
2177     {
2178       value.numerator = 0;
2179       value.denominator = 0;
2180     }
2181   return value;
2182 }
2183 
2184 /* Output the unit test for the per-character numeric value table.  */
2185 static void
2186 output_numeric_test (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
2187 {
2188   FILE *stream;
2189   bool need_comma;
2190   unsigned int ch;
2191 
2192   stream = fopen (filename, "w");
2193   if (stream == NULL)
2194     {
2195       fprintf (stderr, "cannot open '%s' for writing\n", filename);
2196       exit (1);
2197     }
2198 
2199   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2200   fprintf (stream, "/* Numeric values of Unicode characters.  */\n");
2201   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
2202            version);
2203   fprintf (stream, "\n");
2204 
2205   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
2206   fprintf (stream, "\n");
2207   output_tests_license (stream);
2208   fprintf (stream, "\n");
2209 
2210   need_comma = false;
2211   for (ch = 0; ch < 0x110000; ch++)
2212     {
2213       uc_fraction_t value = get_numeric_value (ch);
2214 
2215       if (value.numerator != 0 || value.denominator != 0)
2216         {
2217           if (need_comma)
2218             fprintf (stream, ",\n");
2219           fprintf (stream, "    { 0x%04X, %d, %d }",
2220                    ch, value.numerator, value.denominator);
2221           need_comma = true;
2222         }
2223     }
2224   if (need_comma)
2225     fprintf (stream, "\n");
2226 
2227   if (ferror (stream) || fclose (stream))
2228     {
2229       fprintf (stderr, "error writing to '%s'\n", filename);
2230       exit (1);
2231     }
2232 }
2233 
2234 /* Construction of sparse 3-level tables.
        Sets the parameters read by "3level.h" and includes it: the table is
        named numeric_table, its elements are uint8_t, and DEFAULT is 0
        (presumably the value of entries that were never added; confirm against
        3level.h).  xmalloc and xrealloc are mapped to plain malloc and realloc
        for the included code.  The function below relies on the resulting
        struct numeric_table and on numeric_table_init, _add and _finalize.  */
2235 #define TABLE numeric_table
2236 #define ELEMENT uint8_t
2237 #define DEFAULT 0
2238 #define xmalloc malloc
2239 #define xrealloc realloc
2240 #include "3level.h"
2241 
2242 /* Output the per-character numeric value table.  */
2243 static void
2244 output_numeric (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
2245 {
2246   FILE *stream;
2247   uc_fraction_t fractions[160];
2248   unsigned int nfractions;
2249   unsigned int ch, i, j;
2250   struct numeric_table t;
2251   unsigned int level1_offset, level2_offset, level3_offset;
2252   uint16_t *level3_packed;
2253 
2254   stream = fopen (filename, "w");
2255   if (stream == NULL)
2256     {
2257       fprintf (stderr, "cannot open '%s' for writing\n", filename);
2258       exit (1);
2259     }
2260 
2261   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2262   fprintf (stream, "/* Numeric values of Unicode characters.  */\n");
2263   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
2264            version);
2265   fprintf (stream, "\n");
2266 
2267   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
2268   fprintf (stream, "\n");
2269   output_library_license (stream, false);
2270   fprintf (stream, "\n");
2271 
2272   /* Create table of occurring fractions.  */
2273   nfractions = 0;
2274   for (ch = 0; ch < 0x110000; ch++)
2275     {
2276       uc_fraction_t value = get_numeric_value (ch);
2277 
2278       for (i = 0; i < nfractions; i++)
2279         if (value.numerator == fractions[i].numerator
2280             && value.denominator == fractions[i].denominator)
2281           break;
2282       if (i == nfractions)
2283         {
2284           assert (nfractions != SIZEOF (fractions));
2285           for (i = 0; i < nfractions; i++)
2286             if (value.denominator < fractions[i].denominator
2287                 || (value.denominator == fractions[i].denominator
2288                     && value.numerator < fractions[i].numerator))
2289               break;
2290           for (j = nfractions; j > i; j--)
2291             fractions[j] = fractions[j - 1];
2292           fractions[i] = value;
2293           nfractions++;
2294         }
2295     }
2296 
2297   fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2298            nfractions);
2299   fprintf (stream, "{\n");
2300   for (i = 0; i < nfractions; i++)
2301     {
2302       fprintf (stream, "  { %d, %d }", fractions[i].numerator,
2303                fractions[i].denominator);
2304       if (i+1 < nfractions)
2305         fprintf (stream, ",");
2306       fprintf (stream, "\n");
2307     }
2308   fprintf (stream, "};\n");
2309 
2310   t.p = 7;
2311   t.q = 9;
2312   numeric_table_init (&t);
2313 
2314   for (ch = 0; ch < 0x110000; ch++)
2315     {
2316       uc_fraction_t value = get_numeric_value (ch);
2317 
2318       for (i = 0; i < nfractions; i++)
2319         if (value.numerator == fractions[i].numerator
2320             && value.denominator == fractions[i].denominator)
2321           break;
2322       assert (i != nfractions);
2323 
2324       numeric_table_add (&t, ch, i);
2325     }
2326 
2327   numeric_table_finalize (&t);
2328 
2329   /* Offsets in t.result, in memory of this process.  */
2330   level1_offset =
2331     5 * sizeof (uint32_t);
2332   level2_offset =
2333     5 * sizeof (uint32_t)
2334     + t.level1_size * sizeof (uint32_t);
2335   level3_offset =
2336     5 * sizeof (uint32_t)
2337     + t.level1_size * sizeof (uint32_t)
2338     + (t.level2_size << t.q) * sizeof (uint32_t);
2339 
2340   for (i = 0; i < 5; i++)
2341     fprintf (stream, "#define numeric_header_%d %d\n", i,
2342              ((uint32_t *) t.result)[i]);
2343   fprintf (stream, "static const\n");
2344   fprintf (stream, "struct\n");
2345   fprintf (stream, "  {\n");
2346   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
2347   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
2348   fprintf (stream, "    unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2349            (1 << t.p) * 8 / 16);
2350   fprintf (stream, "  }\n");
2351   fprintf (stream, "u_numeric =\n");
2352   fprintf (stream, "{\n");
2353   fprintf (stream, "  {");
2354   if (t.level1_size > 8)
2355     fprintf (stream, "\n   ");
2356   for (i = 0; i < t.level1_size; i++)
2357     {
2358       uint32_t offset;
2359       if (i > 0 && (i % 8) == 0)
2360         fprintf (stream, "\n   ");
2361       offset = ((uint32_t *) (t.result + level1_offset))[i];
2362       if (offset == 0)
2363         fprintf (stream, " %5d", -1);
2364       else
2365         fprintf (stream, " %5zu",
2366                  (offset - level2_offset) / sizeof (uint32_t));
2367       if (i+1 < t.level1_size)
2368         fprintf (stream, ",");
2369     }
2370   if (t.level1_size > 8)
2371     fprintf (stream, "\n ");
2372   fprintf (stream, " },\n");
2373   fprintf (stream, "  {");
2374   if (t.level2_size << t.q > 8)
2375     fprintf (stream, "\n   ");
2376   for (i = 0; i < t.level2_size << t.q; i++)
2377     {
2378       uint32_t offset;
2379       if (i > 0 && (i % 8) == 0)
2380         fprintf (stream, "\n   ");
2381       offset = ((uint32_t *) (t.result + level2_offset))[i];
2382       if (offset == 0)
2383         fprintf (stream, " %5d", -1);
2384       else
2385         fprintf (stream, " %5zu",
2386                  (offset - level3_offset) / sizeof (uint8_t));
2387       if (i+1 < t.level2_size << t.q)
2388         fprintf (stream, ",");
2389     }
2390   if (t.level2_size << t.q > 8)
2391     fprintf (stream, "\n ");
2392   fprintf (stream, " },\n");
2393   /* Pack the level3 array.  Each entry needs 8 bits only.  Use 16-bit units,
2394      not 32-bit units, in order to make the lookup function easier.  */
2395   level3_packed =
2396     (uint16_t *)
2397     calloc ((t.level3_size << t.p) * 8 / 16 + 1, sizeof (uint16_t));
2398   for (i = 0; i < t.level3_size << t.p; i++)
2399     {
2400       unsigned int j = (i * 8) / 16;
2401       unsigned int k = (i * 8) % 16;
2402       uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2403       value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2404       level3_packed[j] = value & 0xffff;
2405       level3_packed[j+1] = value >> 16;
2406     }
2407   fprintf (stream, "  {");
2408   if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2409     fprintf (stream, "\n   ");
2410   for (i = 0; i < (t.level3_size << t.p) * 8 / 16 + 1; i++)
2411     {
2412       if (i > 0 && (i % 8) == 0)
2413         fprintf (stream, "\n   ");
2414       fprintf (stream, " 0x%04x", level3_packed[i]);
2415       if (i+1 < (t.level3_size << t.p) * 8 / 16 + 1)
2416         fprintf (stream, ",");
2417     }
2418   if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2419     fprintf (stream, "\n ");
2420   fprintf (stream, " }\n");
2421   free (level3_packed);
2422   fprintf (stream, "};\n");
2423 
2424   if (ferror (stream) || fclose (stream))
2425     {
2426       fprintf (stderr, "error writing to '%s'\n", filename);
2427       exit (1);
2428     }
2429 }
2430 
2431 /* ========================================================================= */
2432 
2433 /* Mirrored.  */
2434 /* See Unicode 3.0 book, section 4.7,
2435        UAX #9.  */
2436 
/* List of mirrored character pairs.  This is a subset of the characters
   having the BidiMirrored property.
   Each row holds two code points that are mirror images of each other;
   get_mirror_value() searches both columns and yields the opposite one.
   The table is only ever read, hence it is declared const.  */
static const unsigned int mirror_pairs[][2] =
{
  { 0x0028, 0x0029 },
  { 0x003C, 0x003E },
  { 0x005B, 0x005D },
  { 0x007B, 0x007D },
  { 0x00AB, 0x00BB },
  { 0x2039, 0x203A },
  { 0x2045, 0x2046 },
  { 0x207D, 0x207E },
  { 0x208D, 0x208E },
  { 0x2208, 0x220B },
  { 0x220A, 0x220D },
  { 0x223C, 0x223D },
  { 0x2243, 0x22CD },
  { 0x2252, 0x2253 },
  { 0x2254, 0x2255 },
  { 0x2264, 0x2265 },
  { 0x2266, 0x2267 },
  { 0x226A, 0x226B },
  { 0x2276, 0x2277 },
  { 0x2278, 0x2279 },
  { 0x227A, 0x227B },
  { 0x227C, 0x227D },
  { 0x2282, 0x2283 },
  { 0x2286, 0x2287 },
  { 0x228F, 0x2290 },
  { 0x2291, 0x2292 },
  { 0x22A2, 0x22A3 },
  { 0x22B0, 0x22B1 },
  { 0x22B2, 0x22B3 },
  { 0x22B4, 0x22B5 },
  { 0x22B6, 0x22B7 },
  { 0x22C9, 0x22CA },
  { 0x22CB, 0x22CC },
  { 0x22D0, 0x22D1 },
  { 0x22D6, 0x22D7 },
  { 0x22D8, 0x22D9 },
  { 0x22DA, 0x22DB },
  { 0x22DC, 0x22DD },
  { 0x22DE, 0x22DF },
  { 0x22F0, 0x22F1 },
  { 0x2308, 0x2309 },
  { 0x230A, 0x230B },
  { 0x2329, 0x232A },
  { 0x3008, 0x3009 },
  { 0x300A, 0x300B },
  { 0x300C, 0x300D },
  { 0x300E, 0x300F },
  { 0x3010, 0x3011 },
  { 0x3014, 0x3015 },
  { 0x3016, 0x3017 },
  { 0x3018, 0x3019 },
  { 0x301A, 0x301B }
};
2494 
2495 static int
2496 get_mirror_value (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
2497 {
2498   bool mirrored;
2499   unsigned int mirror_char;
2500   unsigned int i;
2501 
2502   mirrored = (unicode_attributes[ch].name != NULL
2503               && unicode_attributes[ch].mirrored);
2504   mirror_char = 0xfffd;
2505   for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2506     if (ch == mirror_pairs[i][0])
2507       {
2508         mirror_char = mirror_pairs[i][1];
2509         break;
2510       }
2511     else if (ch == mirror_pairs[i][1])
2512       {
2513         mirror_char = mirror_pairs[i][0];
2514         break;
2515       }
2516   if (mirrored)
2517     return (int) mirror_char - (int) ch;
2518   else
2519     {
2520       assert (mirror_char == 0xfffd);
2521       return 0;
2522     }
2523 }
2524 
2525 /* Construction of sparse 3-level tables.
   The definitions below are parameters consumed by "3level.h", which is
   included right after them: TABLE is the name stem of what the header
   generates (this file later uses 'struct mirror_table' and the functions
   mirror_table_init, mirror_table_add and mirror_table_finalize), and
   ELEMENT is the type of the stored values.  DEFAULT is presumably the
   value assumed for entries that are never added -- confirm in 3level.h.
   xmalloc and xrealloc are mapped to plain malloc and realloc.  */
2526 #define TABLE mirror_table
2527 #define ELEMENT int32_t
2528 #define DEFAULT 0
2529 #define xmalloc malloc
2530 #define xrealloc realloc
2531 #include "3level.h"
2532 
2533 /* Output the per-character mirror table.  */
2534 static void
2535 output_mirror (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
2536 {
2537   FILE *stream;
2538   unsigned int ch, i;
2539   struct mirror_table t;
2540   unsigned int level1_offset, level2_offset, level3_offset;
2541 
2542   stream = fopen (filename, "w");
2543   if (stream == NULL)
2544     {
2545       fprintf (stderr, "cannot open '%s' for writing\n", filename);
2546       exit (1);
2547     }
2548 
2549   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2550   fprintf (stream, "/* Mirrored Unicode characters.  */\n");
2551   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
2552            version);
2553   fprintf (stream, "\n");
2554 
2555   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
2556   fprintf (stream, "\n");
2557   output_library_license (stream, false);
2558   fprintf (stream, "\n");
2559 
2560   t.p = 7;
2561   t.q = 9;
2562   mirror_table_init (&t);
2563 
2564   for (ch = 0; ch < 0x110000; ch++)
2565     {
2566       int value = get_mirror_value (ch);
2567 
2568       mirror_table_add (&t, ch, value);
2569     }
2570 
2571   mirror_table_finalize (&t);
2572 
2573   /* Offsets in t.result, in memory of this process.  */
2574   level1_offset =
2575     5 * sizeof (uint32_t);
2576   level2_offset =
2577     5 * sizeof (uint32_t)
2578     + t.level1_size * sizeof (uint32_t);
2579   level3_offset =
2580     5 * sizeof (uint32_t)
2581     + t.level1_size * sizeof (uint32_t)
2582     + (t.level2_size << t.q) * sizeof (uint32_t);
2583 
2584   for (i = 0; i < 5; i++)
2585     fprintf (stream, "#define mirror_header_%d %d\n", i,
2586              ((uint32_t *) t.result)[i]);
2587   fprintf (stream, "static const\n");
2588   fprintf (stream, "struct\n");
2589   fprintf (stream, "  {\n");
2590   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
2591   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
2592   fprintf (stream, "    int level3[%zu << %d];\n", t.level3_size, t.p);
2593   fprintf (stream, "  }\n");
2594   fprintf (stream, "u_mirror =\n");
2595   fprintf (stream, "{\n");
2596   fprintf (stream, "  {");
2597   if (t.level1_size > 8)
2598     fprintf (stream, "\n   ");
2599   for (i = 0; i < t.level1_size; i++)
2600     {
2601       uint32_t offset;
2602       if (i > 0 && (i % 8) == 0)
2603         fprintf (stream, "\n   ");
2604       offset = ((uint32_t *) (t.result + level1_offset))[i];
2605       if (offset == 0)
2606         fprintf (stream, " %5d", -1);
2607       else
2608         fprintf (stream, " %5zu",
2609                  (offset - level2_offset) / sizeof (uint32_t));
2610       if (i+1 < t.level1_size)
2611         fprintf (stream, ",");
2612     }
2613   if (t.level1_size > 8)
2614     fprintf (stream, "\n ");
2615   fprintf (stream, " },\n");
2616   fprintf (stream, "  {");
2617   if (t.level2_size << t.q > 8)
2618     fprintf (stream, "\n   ");
2619   for (i = 0; i < t.level2_size << t.q; i++)
2620     {
2621       uint32_t offset;
2622       if (i > 0 && (i % 8) == 0)
2623         fprintf (stream, "\n   ");
2624       offset = ((uint32_t *) (t.result + level2_offset))[i];
2625       if (offset == 0)
2626         fprintf (stream, " %5d", -1);
2627       else
2628         fprintf (stream, " %5zu",
2629                  (offset - level3_offset) / sizeof (int32_t));
2630       if (i+1 < t.level2_size << t.q)
2631         fprintf (stream, ",");
2632     }
2633   if (t.level2_size << t.q > 8)
2634     fprintf (stream, "\n ");
2635   fprintf (stream, " },\n");
2636   fprintf (stream, "  {");
2637   if (t.level3_size << t.p > 8)
2638     fprintf (stream, "\n   ");
2639   for (i = 0; i < t.level3_size << t.p; i++)
2640     {
2641       if (i > 0 && (i % 8) == 0)
2642         fprintf (stream, "\n   ");
2643       fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2644       if (i+1 < t.level3_size << t.p)
2645         fprintf (stream, ",");
2646     }
2647   if (t.level3_size << t.p > 8)
2648     fprintf (stream, "\n ");
2649   fprintf (stream, " }\n");
2650   fprintf (stream, "};\n");
2651 
2652   if (ferror (stream) || fclose (stream))
2653     {
2654       fprintf (stderr, "error writing to '%s'\n", filename);
2655       exit (1);
2656     }
2657 }
2658 
2659 /* ========================================================================= */
2660 
2661 /* Particular values of the word break property.  */
2662 
/* Returns true if CH is one of the code points hard-coded here for the
   word break property value MIDNUMLET.  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  switch (ch)
    {
    case 0x002E:
    case 0x2018:
    case 0x2019:
    case 0x2024:
    case 0xFE52:
    case 0xFF07:
    case 0xFF0E:
      return true;
    default:
      return false;
    }
}
2669 
/* Returns true if CH is one of the code points hard-coded here for the
   word break property value MIDLETTER.  */
static bool
is_WBP_MIDLETTER (unsigned int ch)
{
  switch (ch)
    {
    case 0x003A:
    case 0x00B7:
    case 0x02D7:
    case 0x0387:
    case 0x05F4:
    case 0x2027:
    case 0xFE13:
    case 0xFE55:
    case 0xFF1A:
      return true;
    default:
      return false;
    }
}
2677 
2678 /* ========================================================================= */
2679 
2680 /* Properties.  */
2681 
2682 /* Reading PropList.txt and DerivedCoreProperties.txt.  */
/* Each enumerator below is used as a bit index: fill_properties() ORs
   '1ULL << value' into the unicode_properties[] entry of every code point
   that a data file lists for the property of that name.  Because the bits
   are kept in an 'unsigned long long', the number of enumerators must not
   exceed the bit width of that type.  The enumerators are grouped by the
   data file that defines the property.  */
2683 enum
2684 {
2685   /* PropList.txt */
2686   PROP_WHITE_SPACE,
2687   PROP_BIDI_CONTROL,
2688   PROP_JOIN_CONTROL,
2689   PROP_DASH,
2690   PROP_HYPHEN,
2691   PROP_QUOTATION_MARK,
2692   PROP_TERMINAL_PUNCTUATION,
2693   PROP_OTHER_MATH,
2694   PROP_HEX_DIGIT,
2695   PROP_ASCII_HEX_DIGIT,
2696   PROP_OTHER_ALPHABETIC,
2697   PROP_IDEOGRAPHIC,
2698   PROP_DIACRITIC,
2699   PROP_EXTENDER,
2700   PROP_OTHER_LOWERCASE,
2701   PROP_OTHER_UPPERCASE,
2702   PROP_NONCHARACTER_CODE_POINT,
2703   PROP_OTHER_GRAPHEME_EXTEND,
2704   PROP_IDS_BINARY_OPERATOR,
2705   PROP_IDS_TRINARY_OPERATOR,
2706   PROP_RADICAL,
2707   PROP_UNIFIED_IDEOGRAPH,
2708   PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2709   PROP_DEPRECATED,
2710   PROP_SOFT_DOTTED,
2711   PROP_LOGICAL_ORDER_EXCEPTION,
2712   PROP_OTHER_ID_START,
2713   PROP_OTHER_ID_CONTINUE,
2714   PROP_STERM,
2715   PROP_VARIATION_SELECTOR,
2716   PROP_PATTERN_WHITE_SPACE,
2717   PROP_PATTERN_SYNTAX,
2718   PROP_PREPENDED_CONCATENATION_MARK,
2719   /* DerivedCoreProperties.txt */
2720   PROP_MATH,
2721   PROP_ALPHABETIC,
2722   PROP_LOWERCASE,
2723   PROP_UPPERCASE,
2724   PROP_CASED,
2725   PROP_CASE_IGNORABLE,
2726   PROP_CHANGES_WHEN_LOWERCASED,
2727   PROP_CHANGES_WHEN_UPPERCASED,
2728   PROP_CHANGES_WHEN_TITLECASED,
2729   PROP_CHANGES_WHEN_CASEFOLDED,
2730   PROP_CHANGES_WHEN_CASEMAPPED,
2731   PROP_ID_START,
2732   PROP_ID_CONTINUE,
2733   PROP_XID_START,
2734   PROP_XID_CONTINUE,
2735   PROP_DEFAULT_IGNORABLE_CODE_POINT,
2736   PROP_GRAPHEME_EXTEND,
2737   PROP_GRAPHEME_BASE,
2738   PROP_GRAPHEME_LINK
2739 };
/* Property bit set of every code point, indexed by code point.  Bit N of an
   entry is set when the code point has the property whose enumerator above
   has the value N.  */
2740 unsigned long long unicode_properties[0x110000];
2741 
2742 static void
2743 clear_properties (void)
     /* [previous][next][first][last][top][bottom][index][help] */
2744 {
2745   unsigned int i;
2746 
2747   for (i = 0; i < 0x110000; i++)
2748     unicode_properties[i] = 0;
2749 }
2750 
2751 /* Stores in unicode_properties[] the properties from the
2752    PropList.txt or DerivedCoreProperties.txt file.  */
2753 static void
2754 fill_properties (const char *proplist_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
2755 {
2756   unsigned int i;
2757   FILE *stream;
2758 
2759   stream = fopen (proplist_filename, "r");
2760   if (stream == NULL)
2761     {
2762       fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2763       exit (1);
2764     }
2765 
2766   for (;;)
2767     {
2768       char buf[200+1];
2769       unsigned int i1, i2;
2770       char padding[200+1];
2771       char propname[200+1];
2772       unsigned int propvalue;
2773 
2774       if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2775         break;
2776 
2777       if (buf[0] == '\0' || buf[0] == '#')
2778         continue;
2779 
2780       if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2781         {
2782           if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2783             {
2784               fprintf (stderr, "parse error in '%s'\n", proplist_filename);
2785               exit (1);
2786             }
2787           i2 = i1;
2788         }
2789 #define PROP(name,value) \
2790       if (strcmp (propname, name) == 0) propvalue = value; else
2791       /* PropList.txt */
2792       PROP ("White_Space", PROP_WHITE_SPACE)
2793       PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2794       PROP ("Join_Control", PROP_JOIN_CONTROL)
2795       PROP ("Dash", PROP_DASH)
2796       PROP ("Hyphen", PROP_HYPHEN)
2797       PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2798       PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2799       PROP ("Other_Math", PROP_OTHER_MATH)
2800       PROP ("Hex_Digit", PROP_HEX_DIGIT)
2801       PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2802       PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2803       PROP ("Ideographic", PROP_IDEOGRAPHIC)
2804       PROP ("Diacritic", PROP_DIACRITIC)
2805       PROP ("Extender", PROP_EXTENDER)
2806       PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2807       PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2808       PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2809       PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2810       PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2811       PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2812       PROP ("Radical", PROP_RADICAL)
2813       PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2814       PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2815       PROP ("Deprecated", PROP_DEPRECATED)
2816       PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2817       PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2818       PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2819       PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2820       PROP ("Sentence_Terminal", PROP_STERM)
2821       PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2822       PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2823       PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2824       PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK)
2825       /* DerivedCoreProperties.txt */
2826       PROP ("Math", PROP_MATH)
2827       PROP ("Alphabetic", PROP_ALPHABETIC)
2828       PROP ("Lowercase", PROP_LOWERCASE)
2829       PROP ("Uppercase", PROP_UPPERCASE)
2830       PROP ("Cased", PROP_CASED)
2831       PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2832       PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2833       PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2834       PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2835       PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2836       PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2837       PROP ("ID_Start", PROP_ID_START)
2838       PROP ("ID_Continue", PROP_ID_CONTINUE)
2839       PROP ("XID_Start", PROP_XID_START)
2840       PROP ("XID_Continue", PROP_XID_CONTINUE)
2841       PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2842       PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2843       PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2844       PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2845 #undef PROP
2846         {
2847           fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
2848                    proplist_filename);
2849           exit (1);
2850         }
2851       assert (i1 <= i2 && i2 < 0x110000);
2852 
2853       for (i = i1; i <= i2; i++)
2854         unicode_properties[i] |= 1ULL << propvalue;
2855     }
2856 
2857   if (ferror (stream) || fclose (stream))
2858     {
2859       fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2860       exit (1);
2861     }
2862 }
2863 
/* Stores in array the given property from the Unicode 3.0 PropList.txt
   file.
   ARRAY is first cleared.  Then PROPLIST_FILENAME is scanned for the first
   line that contains PROPERTY_NAME.  Each line after it must start with
   either "XXXX..YYYY" or "XXXX" (four hexadecimal digits each); ARRAY[i] is
   set to 1 for every code point i it designates.  Processing stops at a line
   that starts with '*' or at the end of the file.
   Exits with status 1 if the file cannot be opened or read, if no line
   contains PROPERTY_NAME, or if a line cannot be parsed.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  FILE *stream;
  char buf[100+1];

  memset (array, 0, 0x110000);

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line.  */
  for (;;)
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
      if (strstr (buf, property_name) != NULL)
        break;
    }

  /* Process the code point lines of this property.  */
  while (fscanf (stream, "%100[^\n]\n", buf) >= 1 && buf[0] != '*')
    {
      const size_t len = strlen (buf);
      unsigned int first, last;
      unsigned int cp;
      bool parsed;

      if (len >= 10 && buf[4] == '.' && buf[5] == '.')
        /* A range "XXXX..YYYY".  */
        parsed = (sscanf (buf, "%4X..%4X", &first, &last) >= 2);
      else if (len >= 4)
        {
          /* A single code point "XXXX".  */
          parsed = (sscanf (buf, "%4X", &first) >= 1);
          if (parsed)
            last = first;
        }
      else
        parsed = false;

      if (!parsed)
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      assert (first <= last && last < 0x110000);
      for (cp = first; cp <= last; cp++)
        array[cp] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
2938 
2939 /* Properties from Unicode 3.0 PropList.txt file.  */
2940 
/* The paired punctuation property from the PropList.txt file.  */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file.  */
char unicode_leftofpair[0x110000];

/* Fills unicode_pairedpunctuation[] and unicode_leftofpair[] from the
   Unicode 3.0 style property list PROPLIST30_FILENAME.  The file is read
   once per property.  */
static void
fill_properties30 (const char *proplist30_filename)
{
  /* Texts that identify the section of each property in the file.  */
  static const char paired_punctuation_marker[] = "(Paired Punctuation)";
  static const char left_of_pair_marker[] = "(Left of Pair)";

  fill_property30 (unicode_pairedpunctuation, proplist30_filename,
                   paired_punctuation_marker);
  fill_property30 (unicode_leftofpair, proplist30_filename,
                   left_of_pair_marker);
}
2953 
2954 /* ------------------------------------------------------------------------- */
2955 
2956 /* See PropList.txt, UCD.html.  */
2957 static bool
2958 is_property_white_space (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
2959 {
2960   return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2961 }
2962 
2963 /* See Unicode 3.0 book, section 4.10,
2964        PropList.txt, UCD.html,
2965        DerivedCoreProperties.txt, UCD.html.  */
2966 static bool
2967 is_property_alphabetic (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
2968 {
2969   bool result1 =
2970     is_category_L (ch)
2971     || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2972     /* For some reason, the following are listed as having property
2973        Alphabetic but not as having property Other_Alphabetic.  */
2974     || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2975     || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2976     || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2977     || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2978     || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2979     || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2980     || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2981     || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
2982     || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2983     || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2984     || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2985     || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2986     || (ch >= 0x12400 && ch <= 0x1246E); /* CUNEIFORM NUMERIC SIGNS */
2987   bool result2 =
2988     ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2989 
2990   assert (result1 == result2);
2991   return result1;
2992 }
2993 
2994 /* See PropList.txt, UCD.html.  */
2995 static bool
2996 is_property_other_alphabetic (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
2997 {
2998   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2999 }
3000 
3001 /* See PropList.txt, UCD.html.  */
3002 static bool
3003 is_property_not_a_character (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3004 {
3005   return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
3006 }
3007 
3008 /* See PropList.txt, UCD.html,
3009        DerivedCoreProperties.txt, UCD.html.  */
3010 static bool
3011 is_property_default_ignorable_code_point (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3012 {
3013   bool result1 =
3014     (is_category_Cf (ch)
3015      && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
3016      && !((ch >= 0x0600 && ch <= 0x0605) || ch == 0x06DD || ch == 0x070F)
3017      /* For some reason, the following are not listed as having property
3018         Default_Ignorable_Code_Point.  */
3019      && !(ch == 0x110BD)
3020      && !(ch == 0x8E2))
3021     || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
3022     || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
3023   bool result2 =
3024     ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
3025 
3026   assert (result1 == result2);
3027   return result1;
3028 }
3029 
3030 /* See PropList.txt, UCD.html.  */
3031 static bool
3032 is_property_other_default_ignorable_code_point (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3033 {
3034   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
3035 }
3036 
3037 /* See PropList.txt, UCD.html.  */
3038 static bool
3039 is_property_deprecated (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3040 {
3041   return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
3042 }
3043 
3044 /* See PropList.txt, UCD.html.  */
3045 static bool
3046 is_property_logical_order_exception (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3047 {
3048   return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
3049 }
3050 
3051 /* See PropList.txt, UCD.html.  */
3052 static bool
3053 is_property_variation_selector (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3054 {
3055   return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
3056 }
3057 
/* See PropList-3.0.1.txt.
   Returns true if CH lies in one of the three hard-coded private use
   ranges.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
3067 
/* See PropList-3.0.1.txt.
   Returns true if CH has general category Cn and is not a noncharacter
   code point.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;

  return !is_property_not_a_character (ch);
}
3074 
3075 /* See PropList.txt, UCD.html,
3076        DerivedCoreProperties.txt, UCD.html.  */
3077 static bool
3078 is_property_uppercase (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3079 {
3080   bool result1 =
3081     is_category_Lu (ch)
3082     || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
3083   bool result2 =
3084     ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
3085 
3086   assert (result1 == result2);
3087   return result1;
3088 }
3089 
3090 /* See PropList.txt, UCD.html.  */
3091 static bool
3092 is_property_other_uppercase (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3093 {
3094   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
3095 }
3096 
3097 /* See PropList.txt, UCD.html,
3098        DerivedCoreProperties.txt, UCD.html.  */
3099 static bool
3100 is_property_lowercase (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3101 {
3102   bool result1 =
3103     is_category_Ll (ch)
3104     || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
3105   bool result2 =
3106     ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
3107 
3108   assert (result1 == result2);
3109   return result1;
3110 }
3111 
3112 /* See PropList.txt, UCD.html.  */
3113 static bool
3114 is_property_other_lowercase (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3115 {
3116   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
3117 }
3118 
/* See PropList-3.0.1.txt.
   Returns true if CH is titlecase, which here is the same as having
   general category Lt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  const bool titlecase = is_category_Lt (ch);

  return titlecase;
}
3125 
3126 /* See DerivedCoreProperties.txt.  */
3127 static bool
3128 is_property_cased (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3129 {
3130   bool result1 = (is_property_lowercase (ch)
3131                   || is_property_uppercase (ch)
3132                   || is_category_Lt (ch));
3133   bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
3134 
3135   assert (result1 == result2);
3136   return result1;
3137 }
3138 
3139 /* See DerivedCoreProperties.txt.  */
3140 static bool
3141 is_property_case_ignorable (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3142 {
3143   bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
3144                   || ch == 0x0027
3145                   || is_category_Mn (ch)
3146                   || is_category_Me (ch)
3147                   || is_category_Cf (ch)
3148                   || is_category_Lm (ch)
3149                   || is_category_Sk (ch));
3150   bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
3151 
3152   assert (result1 == result2);
3153   return result1;
3154 }
3155 
3156 /* See DerivedCoreProperties.txt.  */
3157 static bool
3158 is_property_changes_when_lowercased (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3159 {
3160   bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
3161   bool result2 = (unicode_attributes[ch].name != NULL
3162                   && unicode_attributes[ch].lower != NONE
3163                   && unicode_attributes[ch].lower != ch);
3164 
3165   assert (result1 == result2);
3166   return result1;
3167 }
3168 
3169 /* See DerivedCoreProperties.txt.  */
3170 static bool
3171 is_property_changes_when_uppercased (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3172 {
3173   return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3174 }
3175 
3176 /* See DerivedCoreProperties.txt.  */
3177 static bool
3178 is_property_changes_when_titlecased (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3179 {
3180   return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3181 }
3182 
3183 /* See DerivedCoreProperties.txt.  */
3184 static bool
3185 is_property_changes_when_casefolded (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3186 {
3187   return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3188 }
3189 
3190 /* See DerivedCoreProperties.txt.  */
3191 static bool
3192 is_property_changes_when_casemapped (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3193 {
3194   return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3195 }
3196 
3197 /* See PropList.txt, UCD.html.  */
3198 static bool
3199 is_property_soft_dotted (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3200 {
3201   return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3202 }
3203 
3204 /* See DerivedCoreProperties.txt, UCD.html.  */
3205 static bool
3206 is_property_id_start (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3207 {
3208   return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3209 }
3210 
3211 /* See PropList.txt, UCD.html.  */
3212 static bool
3213 is_property_other_id_start (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3214 {
3215   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3216 }
3217 
3218 /* See DerivedCoreProperties.txt, UCD.html.  */
3219 static bool
3220 is_property_id_continue (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3221 {
3222   return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3223 }
3224 
3225 /* See PropList.txt, UCD.html.  */
3226 static bool
3227 is_property_other_id_continue (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3228 {
3229   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3230 }
3231 
3232 /* See DerivedCoreProperties.txt, UCD.html.  */
3233 static bool
3234 is_property_xid_start (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3235 {
3236   return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3237 }
3238 
3239 /* See DerivedCoreProperties.txt, UCD.html.  */
3240 static bool
3241 is_property_xid_continue (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3242 {
3243   return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3244 }
3245 
3246 /* See PropList.txt, UCD.html.  */
3247 static bool
3248 is_property_pattern_white_space (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3249 {
3250   return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3251 }
3252 
3253 /* See PropList.txt, UCD.html.  */
3254 static bool
3255 is_property_pattern_syntax (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3256 {
3257   return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3258 }
3259 
3260 /* See PropList.txt, UCD.html.  */
3261 static bool
3262 is_property_join_control (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3263 {
3264   return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3265 }
3266 
3267 /* See DerivedCoreProperties.txt, UCD.html.  */
3268 static bool
3269 is_property_grapheme_base (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3270 {
3271   return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3272 }
3273 
3274 /* See DerivedCoreProperties.txt, UCD.html.  */
3275 static bool
3276 is_property_grapheme_extend (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3277 {
3278   return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3279 }
3280 
3281 /* See PropList.txt, UCD.html.  */
3282 static bool
3283 is_property_other_grapheme_extend (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3284 {
3285   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3286 }
3287 
3288 /* See DerivedCoreProperties.txt, UCD.html.  */
3289 static bool
3290 is_property_grapheme_link (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3291 {
3292   return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3293 }
3294 
3295 /* See PropList.txt, UCD.html.  */
3296 static bool
3297 is_property_bidi_control (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3298 {
3299   return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3300 }
3301 
3302 /* See PropList-3.0.1.txt.  */
3303 static bool
3304 is_property_bidi_left_to_right (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3305 {
3306   return (get_bidi_category (ch) == UC_BIDI_L);
3307 }
3308 
3309 /* See PropList-3.0.1.txt.  */
3310 static bool
3311 is_property_bidi_hebrew_right_to_left (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3312 {
3313   return (get_bidi_category (ch) == UC_BIDI_R);
3314 }
3315 
3316 /* See PropList-3.0.1.txt.  */
3317 static bool
3318 is_property_bidi_arabic_right_to_left (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3319 {
3320   return (get_bidi_category (ch) == UC_BIDI_AL);
3321 }
3322 
3323 /* See PropList-3.0.1.txt.  */
3324 static bool
3325 is_property_bidi_european_digit (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3326 {
3327   return (get_bidi_category (ch) == UC_BIDI_EN);
3328 }
3329 
3330 /* See PropList-3.0.1.txt.  */
3331 static bool
3332 is_property_bidi_eur_num_separator (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3333 {
3334   return (get_bidi_category (ch) == UC_BIDI_ES);
3335 }
3336 
3337 /* See PropList-3.0.1.txt.  */
3338 static bool
3339 is_property_bidi_eur_num_terminator (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3340 {
3341   return (get_bidi_category (ch) == UC_BIDI_ET);
3342 }
3343 
3344 /* See PropList-3.0.1.txt.  */
3345 static bool
3346 is_property_bidi_arabic_digit (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3347 {
3348   return (get_bidi_category (ch) == UC_BIDI_AN);
3349 }
3350 
3351 /* See PropList-3.0.1.txt.  */
3352 static bool
3353 is_property_bidi_common_separator (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3354 {
3355   return (get_bidi_category (ch) == UC_BIDI_CS);
3356 }
3357 
3358 /* See PropList-3.0.1.txt.  */
3359 static bool
3360 is_property_bidi_block_separator (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3361 {
3362   return (get_bidi_category (ch) == UC_BIDI_B);
3363 }
3364 
3365 /* See PropList-3.0.1.txt.  */
3366 static bool
3367 is_property_bidi_segment_separator (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3368 {
3369   return (get_bidi_category (ch) == UC_BIDI_S);
3370 }
3371 
3372 /* See PropList-3.0.1.txt.  */
3373 static bool
3374 is_property_bidi_whitespace (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3375 {
3376   return (get_bidi_category (ch) == UC_BIDI_WS);
3377 }
3378 
3379 /* See PropList-3.0.1.txt.  */
3380 static bool
3381 is_property_bidi_non_spacing_mark (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3382 {
3383   return (get_bidi_category (ch) == UC_BIDI_NSM);
3384 }
3385 
3386 /* See PropList-3.0.1.txt.  */
3387 static bool
3388 is_property_bidi_boundary_neutral (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3389 {
3390   return (get_bidi_category (ch) == UC_BIDI_BN);
3391 }
3392 
3393 /* See PropList-3.0.1.txt.  */
3394 static bool
3395 is_property_bidi_pdf (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3396 {
3397   return (get_bidi_category (ch) == UC_BIDI_PDF);
3398 }
3399 
3400 /* See PropList-3.0.1.txt.  */
3401 static bool
3402 is_property_bidi_embedding_or_override (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3403 {
3404   int category = get_bidi_category (ch);
3405   return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3406           || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3407 }
3408 
3409 /* See PropList-3.0.1.txt.  */
3410 static bool
3411 is_property_bidi_other_neutral (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3412 {
3413   return (get_bidi_category (ch) == UC_BIDI_ON);
3414 }
3415 
3416 /* See PropList.txt, UCD.html.  */
3417 static bool
3418 is_property_hex_digit (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3419 {
3420   return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3421 }
3422 
3423 /* See PropList.txt, UCD.html.  */
3424 static bool
3425 is_property_ascii_hex_digit (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3426 {
3427   return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3428 }
3429 
3430 /* See Unicode 3.0 book, section 4.10,
3431        PropList.txt, UCD.html.  */
3432 static bool
3433 is_property_ideographic (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3434 {
3435   return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3436 }
3437 
3438 /* See PropList.txt, UCD.html.  */
3439 static bool
3440 is_property_unified_ideograph (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3441 {
3442   return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3443 }
3444 
3445 /* See PropList.txt, UCD.html.  */
3446 static bool
3447 is_property_radical (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3448 {
3449   return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3450 }
3451 
3452 /* See PropList.txt, UCD.html.  */
3453 static bool
3454 is_property_ids_binary_operator (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3455 {
3456   return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3457 }
3458 
3459 /* See PropList.txt, UCD.html.  */
3460 static bool
3461 is_property_ids_trinary_operator (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3462 {
3463   return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3464 }
3465 
3466 /* See PropList-3.0.1.txt.  */
3467 static bool
3468 is_property_zero_width (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3469 {
3470   return is_category_Cf (ch)
3471          || (unicode_attributes[ch].name != NULL
3472              && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
3473 }
3474 
/* Same answer as is_category_Zs (ch).  See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  const bool in_category_zs = is_category_Zs (ch);

  return in_category_zs;
}
3481 
/* True for a fixed list of code points.  See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  /* This is exactly the set of characters having line breaking
     property GL.  */
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3505 
3506 /* See PropList-3.0.1.txt.  */
3507 static bool
3508 is_property_iso_control (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3509 {
3510   bool result1 =
3511     (unicode_attributes[ch].name != NULL
3512      && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3513   bool result2 =
3514     is_category_Cc (ch);
3515 
3516   assert (result1 == result2);
3517   return result1;
3518 }
3519 
3520 /* See PropList-3.0.1.txt.  */
3521 static bool
3522 is_property_format_control (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3523 {
3524   return (is_category_Cf (ch)
3525           && get_bidi_category (ch) == UC_BIDI_BN
3526           && !is_property_join_control (ch)
3527           && ch != 0xFEFF);
3528 }
3529 
3530 /* See PropList.txt, UCD.html.  */
3531 static bool
3532 is_property_dash (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3533 {
3534   return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3535 }
3536 
3537 /* See PropList.txt, UCD.html.  */
3538 static bool
3539 is_property_hyphen (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3540 {
3541   return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
3542 }
3543 
/* Same answer as is_category_P (ch).  See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  const bool in_category_p = is_category_P (ch);

  return in_category_p;
}
3550 
/* Same answer as is_category_Zl (ch).  See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  const bool in_category_zl = is_category_Zl (ch);

  return in_category_zl;
}
3557 
/* Same answer as is_category_Zp (ch).  See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  const bool in_category_zp = is_category_Zp (ch);

  return in_category_zp;
}
3564 
3565 /* See PropList.txt, UCD.html.  */
3566 static bool
3567 is_property_quotation_mark (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3568 {
3569   return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3570 }
3571 
3572 /* See PropList.txt, UCD.html.  */
3573 static bool
3574 is_property_sentence_terminal (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3575 {
3576   return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3577 }
3578 
3579 /* See PropList.txt, UCD.html.  */
3580 static bool
3581 is_property_terminal_punctuation (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3582 {
3583   return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
3584 }
3585 
/* Same answer as is_category_Sc (ch).  See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  const bool in_category_sc = is_category_Sc (ch);

  return in_category_sc;
}
3592 
3593 /* See Unicode 3.0 book, section 4.9,
3594        PropList.txt, UCD.html,
3595        DerivedCoreProperties.txt, UCD.html.  */
3596 static bool
3597 is_property_math (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3598 {
3599   bool result1 =
3600     is_category_Sm (ch)
3601     || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3602   bool result2 =
3603     ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3604 
3605   assert (result1 == result2);
3606   return result1;
3607 }
3608 
3609 /* See PropList.txt, UCD.html.  */
3610 static bool
3611 is_property_other_math (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3612 {
3613   return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3614 }
3615 
3616 /* See PropList-3.0.1.txt.  */
3617 static bool
3618 is_property_paired_punctuation (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3619 {
3620   return unicode_pairedpunctuation[ch];
3621 }
3622 
3623 /* See PropList-3.0.1.txt.  */
3624 static bool
3625 is_property_left_of_pair (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3626 {
3627   return unicode_leftofpair[ch];
3628 }
3629 
3630 /* See PropList-3.0.1.txt.  */
3631 static bool
3632 is_property_combining (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3633 {
3634   return (unicode_attributes[ch].name != NULL
3635           && (strcmp (unicode_attributes[ch].combining, "0") != 0
3636               || is_category_Mc (ch)
3637               || is_category_Me (ch)
3638               || is_category_Mn (ch)));
3639 }
3640 
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* True when CH has a name and its bidi category is UC_BIDI_NSM.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  if (unicode_attributes[ch].name == NULL)
    return false;
  return get_bidi_category (ch) == UC_BIDI_NSM;
}
#endif
3650 
3651 /* See PropList-3.0.1.txt.  */
3652 static bool
3653 is_property_composite (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3654 {
3655   /* This definition differs from the one in PropList-3.0.1.txt, but is more
3656      logical in some sense.  */
3657   if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3658     return true;
3659   if (unicode_attributes[ch].name != NULL
3660       && unicode_attributes[ch].decomposition != NULL)
3661     {
3662       /* Test whether the decomposition contains more than one character,
3663          and the first is not a space.  */
3664       const char *decomp = unicode_attributes[ch].decomposition;
3665       if (decomp[0] == '<')
3666         {
3667           decomp = strchr (decomp, '>') + 1;
3668           if (decomp[0] == ' ')
3669             decomp++;
3670         }
3671       return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3672     }
3673   return false;
3674 }
3675 
/* Same answer as is_category_Nd (ch).  See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  const bool in_category_nd = is_category_Nd (ch);

  return in_category_nd;
}
3682 
3683 /* See PropList-3.0.1.txt.  */
3684 static bool
3685 is_property_numeric (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3686 {
3687   return ((get_numeric_value (ch)).denominator > 0)
3688          || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3689          || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3690 }
3691 
3692 /* See PropList.txt, UCD.html.  */
3693 static bool
3694 is_property_diacritic (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3695 {
3696   return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3697 }
3698 
3699 /* See PropList.txt, UCD.html.  */
3700 static bool
3701 is_property_extender (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3702 {
3703   return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3704 }
3705 
3706 /* See PropList-3.0.1.txt.  */
3707 static bool
3708 is_property_ignorable_control (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
3709 {
3710   return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3711           || is_category_Cf (ch))
3712          && ch != 0x0000;
3713 }
3714 
3715 /* ------------------------------------------------------------------------- */
3716 
3717 /* Output all properties.  */
3718 static void
3719 output_properties (const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
3720 {
3721 #define PROPERTY(P) \
3722   debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3723   output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3724   output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3725   PROPERTY(white_space)
3726   PROPERTY(alphabetic)
3727   PROPERTY(other_alphabetic)
3728   PROPERTY(not_a_character)
3729   PROPERTY(default_ignorable_code_point)
3730   PROPERTY(other_default_ignorable_code_point)
3731   PROPERTY(deprecated)
3732   PROPERTY(logical_order_exception)
3733   PROPERTY(variation_selector)
3734   PROPERTY(private_use)
3735   PROPERTY(unassigned_code_value)
3736   PROPERTY(uppercase)
3737   PROPERTY(other_uppercase)
3738   PROPERTY(lowercase)
3739   PROPERTY(other_lowercase)
3740   PROPERTY(titlecase)
3741   PROPERTY(cased)
3742   PROPERTY(case_ignorable)
3743   PROPERTY(changes_when_lowercased)
3744   PROPERTY(changes_when_uppercased)
3745   PROPERTY(changes_when_titlecased)
3746   PROPERTY(changes_when_casefolded)
3747   PROPERTY(changes_when_casemapped)
3748   PROPERTY(soft_dotted)
3749   PROPERTY(id_start)
3750   PROPERTY(other_id_start)
3751   PROPERTY(id_continue)
3752   PROPERTY(other_id_continue)
3753   PROPERTY(xid_start)
3754   PROPERTY(xid_continue)
3755   PROPERTY(pattern_white_space)
3756   PROPERTY(pattern_syntax)
3757   PROPERTY(join_control)
3758   PROPERTY(grapheme_base)
3759   PROPERTY(grapheme_extend)
3760   PROPERTY(other_grapheme_extend)
3761   PROPERTY(grapheme_link)
3762   PROPERTY(bidi_control)
3763   PROPERTY(bidi_left_to_right)
3764   PROPERTY(bidi_hebrew_right_to_left)
3765   PROPERTY(bidi_arabic_right_to_left)
3766   PROPERTY(bidi_european_digit)
3767   PROPERTY(bidi_eur_num_separator)
3768   PROPERTY(bidi_eur_num_terminator)
3769   PROPERTY(bidi_arabic_digit)
3770   PROPERTY(bidi_common_separator)
3771   PROPERTY(bidi_block_separator)
3772   PROPERTY(bidi_segment_separator)
3773   PROPERTY(bidi_whitespace)
3774   PROPERTY(bidi_non_spacing_mark)
3775   PROPERTY(bidi_boundary_neutral)
3776   PROPERTY(bidi_pdf)
3777   PROPERTY(bidi_embedding_or_override)
3778   PROPERTY(bidi_other_neutral)
3779   PROPERTY(hex_digit)
3780   PROPERTY(ascii_hex_digit)
3781   PROPERTY(ideographic)
3782   PROPERTY(unified_ideograph)
3783   PROPERTY(radical)
3784   PROPERTY(ids_binary_operator)
3785   PROPERTY(ids_trinary_operator)
3786   PROPERTY(zero_width)
3787   PROPERTY(space)
3788   PROPERTY(non_break)
3789   PROPERTY(iso_control)
3790   PROPERTY(format_control)
3791   PROPERTY(dash)
3792   PROPERTY(hyphen)
3793   PROPERTY(punctuation)
3794   PROPERTY(line_separator)
3795   PROPERTY(paragraph_separator)
3796   PROPERTY(quotation_mark)
3797   PROPERTY(sentence_terminal)
3798   PROPERTY(terminal_punctuation)
3799   PROPERTY(currency_symbol)
3800   PROPERTY(math)
3801   PROPERTY(other_math)
3802   PROPERTY(paired_punctuation)
3803   PROPERTY(left_of_pair)
3804   PROPERTY(combining)
3805   PROPERTY(composite)
3806   PROPERTY(decimal_digit)
3807   PROPERTY(numeric)
3808   PROPERTY(diacritic)
3809   PROPERTY(extender)
3810   PROPERTY(ignorable_control)
3811 #undef PROPERTY
3812 }
3813 
3814 /* ========================================================================= */
3815 
3816 /* Arabic Shaping.  */
3817 
/* Joining type codes, numbered consecutively from 0.  */
enum
{
  UC_JOINING_TYPE_U = 0, /* Non_Joining */
  UC_JOINING_TYPE_T,     /* Transparent */
  UC_JOINING_TYPE_C,     /* Join_Causing */
  UC_JOINING_TYPE_L,     /* Left_Joining */
  UC_JOINING_TYPE_R,     /* Right_Joining */
  UC_JOINING_TYPE_D      /* Dual_Joining */
};

/* Table with one uint8_t entry for each index below 0x110000.  */
static uint8_t unicode_joining_type[0x110000];
3829 
/* Joining group codes, numbered consecutively from 0.  The comment on
   each enumerator gives the corresponding property value name.  New
   values are appended at the end, so the list is not fully alphabetical.  */
enum
{
  UC_JOINING_GROUP_NONE,                  /* No_Joining_Group */
  UC_JOINING_GROUP_AIN,                   /* Ain */
  UC_JOINING_GROUP_ALAPH,                 /* Alaph */
  UC_JOINING_GROUP_ALEF,                  /* Alef */
  UC_JOINING_GROUP_BEH,                   /* Beh */
  UC_JOINING_GROUP_BETH,                  /* Beth */
  UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
  UC_JOINING_GROUP_DAL,                   /* Dal */
  UC_JOINING_GROUP_DALATH_RISH,           /* Dalath_Rish */
  UC_JOINING_GROUP_E,                     /* E */
  UC_JOINING_GROUP_FARSI_YEH,             /* Farsi_Yeh */
  UC_JOINING_GROUP_FE,                    /* Fe */
  UC_JOINING_GROUP_FEH,                   /* Feh */
  UC_JOINING_GROUP_FINAL_SEMKATH,         /* Final_Semkath */
  UC_JOINING_GROUP_GAF,                   /* Gaf */
  UC_JOINING_GROUP_GAMAL,                 /* Gamal */
  UC_JOINING_GROUP_HAH,                   /* Hah */
  UC_JOINING_GROUP_HE,                    /* He */
  UC_JOINING_GROUP_HEH,                   /* Heh */
  UC_JOINING_GROUP_HEH_GOAL,              /* Heh_Goal */
  UC_JOINING_GROUP_HETH,                  /* Heth */
  UC_JOINING_GROUP_KAF,                   /* Kaf */
  UC_JOINING_GROUP_KAPH,                  /* Kaph */
  UC_JOINING_GROUP_KHAPH,                 /* Khaph */
  UC_JOINING_GROUP_KNOTTED_HEH,           /* Knotted_Heh */
  UC_JOINING_GROUP_LAM,                   /* Lam */
  UC_JOINING_GROUP_LAMADH,                /* Lamadh */
  UC_JOINING_GROUP_MEEM,                  /* Meem */
  UC_JOINING_GROUP_MIM,                   /* Mim */
  UC_JOINING_GROUP_NOON,                  /* Noon */
  UC_JOINING_GROUP_NUN,                   /* Nun */
  UC_JOINING_GROUP_NYA,                   /* Nya */
  UC_JOINING_GROUP_PE,                    /* Pe */
  UC_JOINING_GROUP_QAF,                   /* Qaf */
  UC_JOINING_GROUP_QAPH,                  /* Qaph */
  UC_JOINING_GROUP_REH,                   /* Reh */
  UC_JOINING_GROUP_REVERSED_PE,           /* Reversed_Pe */
  UC_JOINING_GROUP_SAD,                   /* Sad */
  UC_JOINING_GROUP_SADHE,                 /* Sadhe */
  UC_JOINING_GROUP_SEEN,                  /* Seen */
  UC_JOINING_GROUP_SEMKATH,               /* Semkath */
  UC_JOINING_GROUP_SHIN,                  /* Shin */
  UC_JOINING_GROUP_SWASH_KAF,             /* Swash_Kaf */
  UC_JOINING_GROUP_SYRIAC_WAW,            /* Syriac_Waw */
  UC_JOINING_GROUP_TAH,                   /* Tah */
  UC_JOINING_GROUP_TAW,                   /* Taw */
  UC_JOINING_GROUP_TEH_MARBUTA,           /* Teh_Marbuta */
  UC_JOINING_GROUP_TEH_MARBUTA_GOAL,      /* Teh_Marbuta_Goal */
  UC_JOINING_GROUP_TETH,                  /* Teth */
  UC_JOINING_GROUP_WAW,                   /* Waw */
  UC_JOINING_GROUP_YEH,                   /* Yeh */
  UC_JOINING_GROUP_YEH_BARREE,            /* Yeh_Barree */
  UC_JOINING_GROUP_YEH_WITH_TAIL,         /* Yeh_With_Tail */
  UC_JOINING_GROUP_YUDH,                  /* Yudh */
  UC_JOINING_GROUP_YUDH_HE,               /* Yudh_He */
  UC_JOINING_GROUP_ZAIN,                  /* Zain */
  UC_JOINING_GROUP_ZHAIN,                 /* Zhain */
  UC_JOINING_GROUP_ROHINGYA_YEH,          /* Rohingya_Yeh */
  UC_JOINING_GROUP_STRAIGHT_WAW,          /* Straight_Waw */
  UC_JOINING_GROUP_MANICHAEAN_ALEPH,      /* Manichaean_Aleph */
  UC_JOINING_GROUP_MANICHAEAN_BETH,       /* Manichaean_Beth */
  UC_JOINING_GROUP_MANICHAEAN_GIMEL,      /* Manichaean_Gimel */
  UC_JOINING_GROUP_MANICHAEAN_DALETH,     /* Manichaean_Daleth */
  UC_JOINING_GROUP_MANICHAEAN_WAW,        /* Manichaean_Waw */
  UC_JOINING_GROUP_MANICHAEAN_ZAYIN,      /* Manichaean_Zayin */
  UC_JOINING_GROUP_MANICHAEAN_HETH,       /* Manichaean_Heth */
  UC_JOINING_GROUP_MANICHAEAN_TETH,       /* Manichaean_Teth */
  UC_JOINING_GROUP_MANICHAEAN_YODH,       /* Manichaean_Yodh */
  UC_JOINING_GROUP_MANICHAEAN_KAPH,       /* Manichaean_Kaph */
  UC_JOINING_GROUP_MANICHAEAN_LAMEDH,     /* Manichaean_Lamedh */
  UC_JOINING_GROUP_MANICHAEAN_DHAMEDH,    /* Manichaean_Dhamedh */
  UC_JOINING_GROUP_MANICHAEAN_THAMEDH,    /* Manichaean_Thamedh */
  UC_JOINING_GROUP_MANICHAEAN_MEM,        /* Manichaean_Mem */
  UC_JOINING_GROUP_MANICHAEAN_NUN,        /* Manichaean_Nun */
  UC_JOINING_GROUP_MANICHAEAN_SAMEKH,     /* Manichaean_Samekh */
  UC_JOINING_GROUP_MANICHAEAN_AYIN,       /* Manichaean_Ayin */
  UC_JOINING_GROUP_MANICHAEAN_PE,         /* Manichaean_Pe */
  UC_JOINING_GROUP_MANICHAEAN_SADHE,      /* Manichaean_Sadhe */
  UC_JOINING_GROUP_MANICHAEAN_QOPH,       /* Manichaean_Qoph */
  UC_JOINING_GROUP_MANICHAEAN_RESH,       /* Manichaean_Resh */
  UC_JOINING_GROUP_MANICHAEAN_TAW,        /* Manichaean_Taw */
  UC_JOINING_GROUP_MANICHAEAN_ONE,        /* Manichaean_One */
  UC_JOINING_GROUP_MANICHAEAN_FIVE,       /* Manichaean_Five */
  UC_JOINING_GROUP_MANICHAEAN_TEN,        /* Manichaean_Ten */
  UC_JOINING_GROUP_MANICHAEAN_TWENTY,     /* Manichaean_Twenty */
  UC_JOINING_GROUP_MANICHAEAN_HUNDRED,    /* Manichaean_Hundred */
  UC_JOINING_GROUP_AFRICAN_FEH,           /* African_Feh */
  UC_JOINING_GROUP_AFRICAN_QAF,           /* African_Qaf */
  UC_JOINING_GROUP_AFRICAN_NOON           /* African_Noon */
};

/* Table with one uint8_t entry for each index below 0x110000.  */
static uint8_t unicode_joining_group[0x110000];
3924 
3925 static void
3926 fill_arabicshaping (const char *arabicshaping_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
3927 {
3928   FILE *stream;
3929   unsigned int i;
3930   int lineno;
3931 
3932   stream = fopen (arabicshaping_filename, "r");
3933   if (stream == NULL)
3934     {
3935       fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
3936       exit (1);
3937     }
3938 
3939   for (i = 0; i < 0x110000; i++)
3940     {
3941       unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
3942       unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
3943     }
3944 
3945   lineno = 0;
3946   for (;;)
3947     {
3948       char buf[200+1];
3949       char separator1[200+1];
3950       char schematic_name[200+1];
3951       char separator2[200+1];
3952       char joining_type_name[200+1];
3953       char separator3[200+1];
3954       char joining_group_name[200+1];
3955       int joining_type;
3956       int joining_group;
3957 
3958       lineno++;
3959       if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3960         break;
3961 
3962       if (buf[0] == '\0' || buf[0] == '#')
3963         continue;
3964 
3965       if (sscanf (buf, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
3966                   &i, separator1, schematic_name, separator2, joining_type_name,
3967                   separator3, joining_group_name) != 7)
3968         {
3969           fprintf (stderr, "parse error in '%s':%d\n",
3970                    arabicshaping_filename, lineno);
3971           exit (1);
3972         }
3973       assert (i < 0x110000);
3974 
3975 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
3976       if (false) {}
3977       TRY(UC_JOINING_TYPE_U)
3978       TRY(UC_JOINING_TYPE_T)
3979       TRY(UC_JOINING_TYPE_C)
3980       TRY(UC_JOINING_TYPE_L)
3981       TRY(UC_JOINING_TYPE_R)
3982       TRY(UC_JOINING_TYPE_D)
3983 #undef TRY
3984       else
3985         {
3986           fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
3987                    joining_type_name, arabicshaping_filename, lineno);
3988           exit (1);
3989         }
3990 
3991       /* Remove trailing spaces.  */
3992       while (joining_group_name[0] != '\0'
3993              && joining_group_name[strlen (joining_group_name) - 1] == ' ')
3994         joining_group_name[strlen (joining_group_name) - 1] = '\0';
3995 
3996 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
3997       if (false) {}
3998       TRY(UC_JOINING_GROUP_NONE,                  "No_Joining_Group")
3999       TRY(UC_JOINING_GROUP_AIN,                   "AIN")
4000       TRY(UC_JOINING_GROUP_ALAPH,                 "ALAPH")
4001       TRY(UC_JOINING_GROUP_ALEF,                  "ALEF")
4002       TRY(UC_JOINING_GROUP_BEH,                   "BEH")
4003       TRY(UC_JOINING_GROUP_BETH,                  "BETH")
4004       TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
4005       TRY(UC_JOINING_GROUP_DAL,                   "DAL")
4006       TRY(UC_JOINING_GROUP_DALATH_RISH,           "DALATH RISH")
4007       TRY(UC_JOINING_GROUP_E,                     "E")
4008       TRY(UC_JOINING_GROUP_FARSI_YEH,             "FARSI YEH")
4009       TRY(UC_JOINING_GROUP_FE,                    "FE")
4010       TRY(UC_JOINING_GROUP_FEH,                   "FEH")
4011       TRY(UC_JOINING_GROUP_FINAL_SEMKATH,         "FINAL SEMKATH")
4012       TRY(UC_JOINING_GROUP_GAF,                   "GAF")
4013       TRY(UC_JOINING_GROUP_GAMAL,                 "GAMAL")
4014       TRY(UC_JOINING_GROUP_HAH,                   "HAH")
4015       TRY(UC_JOINING_GROUP_HE,                    "HE")
4016       TRY(UC_JOINING_GROUP_HEH,                   "HEH")
4017       TRY(UC_JOINING_GROUP_HEH_GOAL,              "HEH GOAL")
4018       TRY(UC_JOINING_GROUP_HETH,                  "HETH")
4019       TRY(UC_JOINING_GROUP_KAF,                   "KAF")
4020       TRY(UC_JOINING_GROUP_KAPH,                  "KAPH")
4021       TRY(UC_JOINING_GROUP_KHAPH,                 "KHAPH")
4022       TRY(UC_JOINING_GROUP_KNOTTED_HEH,           "KNOTTED HEH")
4023       TRY(UC_JOINING_GROUP_LAM,                   "LAM")
4024       TRY(UC_JOINING_GROUP_LAMADH,                "LAMADH")
4025       TRY(UC_JOINING_GROUP_MEEM,                  "MEEM")
4026       TRY(UC_JOINING_GROUP_MIM,                   "MIM")
4027       TRY(UC_JOINING_GROUP_NOON,                  "NOON")
4028       TRY(UC_JOINING_GROUP_NUN,                   "NUN")
4029       TRY(UC_JOINING_GROUP_NYA,                   "NYA")
4030       TRY(UC_JOINING_GROUP_PE,                    "PE")
4031       TRY(UC_JOINING_GROUP_QAF,                   "QAF")
4032       TRY(UC_JOINING_GROUP_QAPH,                  "QAPH")
4033       TRY(UC_JOINING_GROUP_REH,                   "REH")
4034       TRY(UC_JOINING_GROUP_REVERSED_PE,           "REVERSED PE")
4035       TRY(UC_JOINING_GROUP_SAD,                   "SAD")
4036       TRY(UC_JOINING_GROUP_SADHE,                 "SADHE")
4037       TRY(UC_JOINING_GROUP_SEEN,                  "SEEN")
4038       TRY(UC_JOINING_GROUP_SEMKATH,               "SEMKATH")
4039       TRY(UC_JOINING_GROUP_SHIN,                  "SHIN")
4040       TRY(UC_JOINING_GROUP_SWASH_KAF,             "SWASH KAF")
4041       TRY(UC_JOINING_GROUP_SYRIAC_WAW,            "SYRIAC WAW")
4042       TRY(UC_JOINING_GROUP_TAH,                   "TAH")
4043       TRY(UC_JOINING_GROUP_TAW,                   "TAW")
4044       TRY(UC_JOINING_GROUP_TEH_MARBUTA,           "TEH MARBUTA")
4045       TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL,      "TEH MARBUTA GOAL")
4046       TRY(UC_JOINING_GROUP_TETH,                  "TETH")
4047       TRY(UC_JOINING_GROUP_WAW,                   "WAW")
4048       TRY(UC_JOINING_GROUP_YEH,                   "YEH")
4049       TRY(UC_JOINING_GROUP_YEH_BARREE,            "YEH BARREE")
4050       TRY(UC_JOINING_GROUP_YEH_WITH_TAIL,         "YEH WITH TAIL")
4051       TRY(UC_JOINING_GROUP_YUDH,                  "YUDH")
4052       TRY(UC_JOINING_GROUP_YUDH_HE,               "YUDH HE")
4053       TRY(UC_JOINING_GROUP_ZAIN,                  "ZAIN")
4054       TRY(UC_JOINING_GROUP_ZHAIN,                 "ZHAIN")
4055       TRY(UC_JOINING_GROUP_ROHINGYA_YEH,          "ROHINGYA YEH")
4056       TRY(UC_JOINING_GROUP_STRAIGHT_WAW,          "STRAIGHT WAW")
4057       TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH,      "MANICHAEAN ALEPH")
4058       TRY(UC_JOINING_GROUP_MANICHAEAN_BETH,       "MANICHAEAN BETH")
4059       TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL,      "MANICHAEAN GIMEL")
4060       TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH,     "MANICHAEAN DALETH")
4061       TRY(UC_JOINING_GROUP_MANICHAEAN_WAW,        "MANICHAEAN WAW")
4062       TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN,      "MANICHAEAN ZAYIN")
4063       TRY(UC_JOINING_GROUP_MANICHAEAN_HETH,       "MANICHAEAN HETH")
4064       TRY(UC_JOINING_GROUP_MANICHAEAN_TETH,       "MANICHAEAN TETH")
4065       TRY(UC_JOINING_GROUP_MANICHAEAN_YODH,       "MANICHAEAN YODH")
4066       TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH,       "MANICHAEAN KAPH")
4067       TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH,     "MANICHAEAN LAMEDH")
4068       TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH,    "MANICHAEAN DHAMEDH")
4069       TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH,    "MANICHAEAN THAMEDH")
4070       TRY(UC_JOINING_GROUP_MANICHAEAN_MEM,        "MANICHAEAN MEM")
4071       TRY(UC_JOINING_GROUP_MANICHAEAN_NUN,        "MANICHAEAN NUN")
4072       TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH,     "MANICHAEAN SAMEKH")
4073       TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN,       "MANICHAEAN AYIN")
4074       TRY(UC_JOINING_GROUP_MANICHAEAN_PE,         "MANICHAEAN PE")
4075       TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE,      "MANICHAEAN SADHE")
4076       TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH,       "MANICHAEAN QOPH")
4077       TRY(UC_JOINING_GROUP_MANICHAEAN_RESH,       "MANICHAEAN RESH")
4078       TRY(UC_JOINING_GROUP_MANICHAEAN_TAW,        "MANICHAEAN TAW")
4079       TRY(UC_JOINING_GROUP_MANICHAEAN_ONE,        "MANICHAEAN ONE")
4080       TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE,       "MANICHAEAN FIVE")
4081       TRY(UC_JOINING_GROUP_MANICHAEAN_TEN,        "MANICHAEAN TEN")
4082       TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY,     "MANICHAEAN TWENTY")
4083       TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED,    "MANICHAEAN HUNDRED")
4084       TRY(UC_JOINING_GROUP_AFRICAN_FEH,           "AFRICAN FEH")
4085       TRY(UC_JOINING_GROUP_AFRICAN_QAF,           "AFRICAN QAF")
4086       TRY(UC_JOINING_GROUP_AFRICAN_NOON,          "AFRICAN NOON")
4087 #undef TRY
4088       else
4089         {
4090           fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
4091                    joining_group_name, arabicshaping_filename, lineno);
4092           exit (1);
4093         }
4094 
4095       unicode_joining_type[i] = joining_type;
4096       unicode_joining_group[i] = joining_group;
4097     }
4098 
4099   if (ferror (stream) || fclose (stream))
4100     {
4101       fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
4102       exit (1);
4103     }
4104 }
4105 
4106 /* Convert a Joining_Type value to a C identifier.  */
4107 static const char *
4108 joining_type_as_c_identifier (int joining_type)
     /* [previous][next][first][last][top][bottom][index][help] */
4109 {
4110 #define TRY(value) if (joining_type == value) return #value;
4111   TRY(UC_JOINING_TYPE_U)
4112   TRY(UC_JOINING_TYPE_T)
4113   TRY(UC_JOINING_TYPE_C)
4114   TRY(UC_JOINING_TYPE_L)
4115   TRY(UC_JOINING_TYPE_R)
4116   TRY(UC_JOINING_TYPE_D)
4117 #undef TRY
4118   abort ();
4119 }
4120 
4121 static void
4122 output_joining_type_test (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
4123 {
4124   FILE *stream;
4125   bool need_comma;
4126   unsigned int ch;
4127 
4128   stream = fopen (filename, "w");
4129   if (stream == NULL)
4130     {
4131       fprintf (stderr, "cannot open '%s' for writing\n", filename);
4132       exit (1);
4133     }
4134 
4135   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4136   fprintf (stream, "/* Arabic joining type of Unicode characters.  */\n");
4137   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
4138            version);
4139   fprintf (stream, "\n");
4140 
4141   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4142   fprintf (stream, "\n");
4143   output_tests_license (stream);
4144   fprintf (stream, "\n");
4145 
4146   need_comma = false;
4147   for (ch = 0; ch < 0x110000; ch++)
4148     {
4149       int value = unicode_joining_type[ch];
4150 
4151       if (value != (uint8_t)~(uint8_t)0)
4152         {
4153           if (need_comma)
4154             fprintf (stream, ",\n");
4155           fprintf (stream, "    { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
4156           need_comma = true;
4157         }
4158     }
4159   if (need_comma)
4160     fprintf (stream, "\n");
4161 
4162   if (ferror (stream) || fclose (stream))
4163     {
4164       fprintf (stderr, "error writing to '%s'\n", filename);
4165       exit (1);
4166     }
4167 }
4168 
4169 /* Construction of sparse 3-level tables.  */
4170 #define TABLE joining_type_table
4171 #define ELEMENT uint8_t
4172 #define DEFAULT (uint8_t)~(uint8_t)0
4173 #define xmalloc malloc
4174 #define xrealloc realloc
4175 #include "3level.h"
4176 
4177 static void
4178 output_joining_type (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
4179 {
4180   FILE *stream;
4181   unsigned int ch, i;
4182   struct joining_type_table t;
4183   unsigned int level1_offset, level2_offset, level3_offset;
4184   uint8_t *level3_packed;
4185 
4186   stream = fopen (filename, "w");
4187   if (stream == NULL)
4188     {
4189       fprintf (stderr, "cannot open '%s' for writing\n", filename);
4190       exit (1);
4191     }
4192 
4193   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4194   fprintf (stream, "/* Arabic joining type of Unicode characters.  */\n");
4195   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
4196            version);
4197   fprintf (stream, "\n");
4198 
4199   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4200   fprintf (stream, "\n");
4201   output_library_license (stream, true);
4202   fprintf (stream, "\n");
4203 
4204   t.p = 7;
4205   t.q = 9;
4206   joining_type_table_init (&t);
4207 
4208   for (ch = 0; ch < 0x110000; ch++)
4209     {
4210       uint8_t value = unicode_joining_type[ch];
4211 
4212       assert (value == (uint8_t)~(uint8_t)0 || value <= 0x0f);
4213 
4214       joining_type_table_add (&t, ch, value);
4215     }
4216 
4217   joining_type_table_finalize (&t);
4218 
4219   /* Offsets in t.result, in memory of this process.  */
4220   level1_offset =
4221     5 * sizeof (uint32_t);
4222   level2_offset =
4223     5 * sizeof (uint32_t)
4224     + t.level1_size * sizeof (uint32_t);
4225   level3_offset =
4226     5 * sizeof (uint32_t)
4227     + t.level1_size * sizeof (uint32_t)
4228     + (t.level2_size << t.q) * sizeof (uint32_t);
4229 
4230   for (i = 0; i < 5; i++)
4231     fprintf (stream, "#define joining_type_header_%d %d\n", i,
4232              ((uint32_t *) t.result)[i]);
4233   fprintf (stream, "static const\n");
4234   fprintf (stream, "struct\n");
4235   fprintf (stream, "  {\n");
4236   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
4237   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
4238   fprintf (stream, "    unsigned char level3[%zu * %d];\n", t.level3_size,
4239            (1 << t.p) * 4 / 8);
4240   fprintf (stream, "  }\n");
4241   fprintf (stream, "u_joining_type =\n");
4242   fprintf (stream, "{\n");
4243   fprintf (stream, "  {");
4244   if (t.level1_size > 8)
4245     fprintf (stream, "\n   ");
4246   for (i = 0; i < t.level1_size; i++)
4247     {
4248       uint32_t offset;
4249       if (i > 0 && (i % 8) == 0)
4250         fprintf (stream, "\n   ");
4251       offset = ((uint32_t *) (t.result + level1_offset))[i];
4252       if (offset == 0)
4253         fprintf (stream, " %5d", -1);
4254       else
4255         fprintf (stream, " %5zu",
4256                  (offset - level2_offset) / sizeof (uint32_t));
4257       if (i+1 < t.level1_size)
4258         fprintf (stream, ",");
4259     }
4260   if (t.level1_size > 8)
4261     fprintf (stream, "\n ");
4262   fprintf (stream, " },\n");
4263   fprintf (stream, "  {");
4264   if (t.level2_size << t.q > 8)
4265     fprintf (stream, "\n   ");
4266   for (i = 0; i < t.level2_size << t.q; i++)
4267     {
4268       uint32_t offset;
4269       if (i > 0 && (i % 8) == 0)
4270         fprintf (stream, "\n   ");
4271       offset = ((uint32_t *) (t.result + level2_offset))[i];
4272       if (offset == 0)
4273         fprintf (stream, " %5d", -1);
4274       else
4275         fprintf (stream, " %5zu",
4276                  (offset - level3_offset) / sizeof (uint8_t));
4277       if (i+1 < t.level2_size << t.q)
4278         fprintf (stream, ",");
4279     }
4280   if (t.level2_size << t.q > 8)
4281     fprintf (stream, "\n ");
4282   fprintf (stream, " },\n");
4283   /* Pack the level3 array.  Each entry needs 4 bits only.  */
4284   level3_packed =
4285     (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
4286   for (i = 0; i < t.level3_size << t.p; i++)
4287     {
4288       unsigned int j = (i * 4) / 8;
4289       unsigned int k = (i * 4) % 8;
4290       uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4291       level3_packed[j] |= (value << k);
4292     }
4293   fprintf (stream, "  {");
4294   if ((t.level3_size << t.p) * 4 / 8 > 8)
4295     fprintf (stream, "\n   ");
4296   for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4297     {
4298       if (i > 0 && (i % 8) == 0)
4299         fprintf (stream, "\n   ");
4300       fprintf (stream, " 0x%02x", level3_packed[i]);
4301       if (i+1 < (t.level3_size << t.p) * 4 / 8)
4302         fprintf (stream, ",");
4303     }
4304   if ((t.level3_size << t.p) * 4 / 8 > 8)
4305     fprintf (stream, "\n ");
4306   fprintf (stream, " }\n");
4307   free (level3_packed);
4308   fprintf (stream, "};\n");
4309 
4310   if (ferror (stream) || fclose (stream))
4311     {
4312       fprintf (stderr, "error writing to '%s'\n", filename);
4313       exit (1);
4314     }
4315 }
4316 
4317 /* Convert a Joining_Group value to a C identifier.  */
4318 static const char *
4319 joining_group_as_c_identifier (int joining_group)
     /* [previous][next][first][last][top][bottom][index][help] */
4320 {
4321 #define TRY(value) if (joining_group == value) return #value;
4322   TRY(UC_JOINING_GROUP_NONE)
4323   TRY(UC_JOINING_GROUP_AIN)
4324   TRY(UC_JOINING_GROUP_ALAPH)
4325   TRY(UC_JOINING_GROUP_ALEF)
4326   TRY(UC_JOINING_GROUP_BEH)
4327   TRY(UC_JOINING_GROUP_BETH)
4328   TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4329   TRY(UC_JOINING_GROUP_DAL)
4330   TRY(UC_JOINING_GROUP_DALATH_RISH)
4331   TRY(UC_JOINING_GROUP_E)
4332   TRY(UC_JOINING_GROUP_FARSI_YEH)
4333   TRY(UC_JOINING_GROUP_FE)
4334   TRY(UC_JOINING_GROUP_FEH)
4335   TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4336   TRY(UC_JOINING_GROUP_GAF)
4337   TRY(UC_JOINING_GROUP_GAMAL)
4338   TRY(UC_JOINING_GROUP_HAH)
4339   TRY(UC_JOINING_GROUP_HE)
4340   TRY(UC_JOINING_GROUP_HEH)
4341   TRY(UC_JOINING_GROUP_HEH_GOAL)
4342   TRY(UC_JOINING_GROUP_HETH)
4343   TRY(UC_JOINING_GROUP_KAF)
4344   TRY(UC_JOINING_GROUP_KAPH)
4345   TRY(UC_JOINING_GROUP_KHAPH)
4346   TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4347   TRY(UC_JOINING_GROUP_LAM)
4348   TRY(UC_JOINING_GROUP_LAMADH)
4349   TRY(UC_JOINING_GROUP_MEEM)
4350   TRY(UC_JOINING_GROUP_MIM)
4351   TRY(UC_JOINING_GROUP_NOON)
4352   TRY(UC_JOINING_GROUP_NUN)
4353   TRY(UC_JOINING_GROUP_NYA)
4354   TRY(UC_JOINING_GROUP_PE)
4355   TRY(UC_JOINING_GROUP_QAF)
4356   TRY(UC_JOINING_GROUP_QAPH)
4357   TRY(UC_JOINING_GROUP_REH)
4358   TRY(UC_JOINING_GROUP_REVERSED_PE)
4359   TRY(UC_JOINING_GROUP_SAD)
4360   TRY(UC_JOINING_GROUP_SADHE)
4361   TRY(UC_JOINING_GROUP_SEEN)
4362   TRY(UC_JOINING_GROUP_SEMKATH)
4363   TRY(UC_JOINING_GROUP_SHIN)
4364   TRY(UC_JOINING_GROUP_SWASH_KAF)
4365   TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4366   TRY(UC_JOINING_GROUP_TAH)
4367   TRY(UC_JOINING_GROUP_TAW)
4368   TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4369   TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4370   TRY(UC_JOINING_GROUP_TETH)
4371   TRY(UC_JOINING_GROUP_WAW)
4372   TRY(UC_JOINING_GROUP_YEH)
4373   TRY(UC_JOINING_GROUP_YEH_BARREE)
4374   TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4375   TRY(UC_JOINING_GROUP_YUDH)
4376   TRY(UC_JOINING_GROUP_YUDH_HE)
4377   TRY(UC_JOINING_GROUP_ZAIN)
4378   TRY(UC_JOINING_GROUP_ZHAIN)
4379   TRY(UC_JOINING_GROUP_ROHINGYA_YEH)
4380   TRY(UC_JOINING_GROUP_STRAIGHT_WAW)
4381   TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH)
4382   TRY(UC_JOINING_GROUP_MANICHAEAN_BETH)
4383   TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL)
4384   TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH)
4385   TRY(UC_JOINING_GROUP_MANICHAEAN_WAW)
4386   TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN)
4387   TRY(UC_JOINING_GROUP_MANICHAEAN_HETH)
4388   TRY(UC_JOINING_GROUP_MANICHAEAN_TETH)
4389   TRY(UC_JOINING_GROUP_MANICHAEAN_YODH)
4390   TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH)
4391   TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH)
4392   TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH)
4393   TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH)
4394   TRY(UC_JOINING_GROUP_MANICHAEAN_MEM)
4395   TRY(UC_JOINING_GROUP_MANICHAEAN_NUN)
4396   TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH)
4397   TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN)
4398   TRY(UC_JOINING_GROUP_MANICHAEAN_PE)
4399   TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE)
4400   TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH)
4401   TRY(UC_JOINING_GROUP_MANICHAEAN_RESH)
4402   TRY(UC_JOINING_GROUP_MANICHAEAN_TAW)
4403   TRY(UC_JOINING_GROUP_MANICHAEAN_ONE)
4404   TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE)
4405   TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
4406   TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
4407   TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
4408   TRY(UC_JOINING_GROUP_AFRICAN_FEH)
4409   TRY(UC_JOINING_GROUP_AFRICAN_QAF)
4410   TRY(UC_JOINING_GROUP_AFRICAN_NOON)
4411 #undef TRY
4412   abort ();
4413 }
4414 
4415 static void
4416 output_joining_group_test (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
4417 {
4418   FILE *stream;
4419   bool need_comma;
4420   unsigned int ch;
4421 
4422   stream = fopen (filename, "w");
4423   if (stream == NULL)
4424     {
4425       fprintf (stderr, "cannot open '%s' for writing\n", filename);
4426       exit (1);
4427     }
4428 
4429   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4430   fprintf (stream, "/* Arabic joining group of Unicode characters.  */\n");
4431   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
4432            version);
4433   fprintf (stream, "\n");
4434 
4435   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4436   fprintf (stream, "\n");
4437   output_tests_license (stream);
4438   fprintf (stream, "\n");
4439 
4440   need_comma = false;
4441   for (ch = 0; ch < 0x110000; ch++)
4442     {
4443       int value = unicode_joining_group[ch];
4444 
4445       if (value != UC_JOINING_GROUP_NONE)
4446         {
4447           if (need_comma)
4448             fprintf (stream, ",\n");
4449           fprintf (stream, "    { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4450           need_comma = true;
4451         }
4452     }
4453   if (need_comma)
4454     fprintf (stream, "\n");
4455 
4456   if (ferror (stream) || fclose (stream))
4457     {
4458       fprintf (stderr, "error writing to '%s'\n", filename);
4459       exit (1);
4460     }
4461 }
4462 
4463 /* Construction of sparse 3-level tables.  */
4464 #define TABLE joining_group_table
4465 #define ELEMENT uint8_t
4466 #define DEFAULT UC_JOINING_GROUP_NONE
4467 #define xmalloc malloc
4468 #define xrealloc realloc
4469 #include "3level.h"
4470 
4471 static void
4472 output_joining_group (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
4473 {
4474   FILE *stream;
4475   unsigned int ch, i;
4476   struct joining_group_table t;
4477   unsigned int level1_offset, level2_offset, level3_offset;
4478   uint16_t *level3_packed;
4479 
4480   stream = fopen (filename, "w");
4481   if (stream == NULL)
4482     {
4483       fprintf (stderr, "cannot open '%s' for writing\n", filename);
4484       exit (1);
4485     }
4486 
4487   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4488   fprintf (stream, "/* Arabic joining group of Unicode characters.  */\n");
4489   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
4490            version);
4491   fprintf (stream, "\n");
4492 
4493   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4494   fprintf (stream, "\n");
4495   output_library_license (stream, false);
4496   fprintf (stream, "\n");
4497 
4498   t.p = 7;
4499   t.q = 9;
4500   joining_group_table_init (&t);
4501 
4502   for (ch = 0; ch < 0x110000; ch++)
4503     {
4504       uint8_t value = unicode_joining_group[ch];
4505 
4506       assert (value <= 0x7f);
4507 
4508       joining_group_table_add (&t, ch, value);
4509     }
4510 
4511   joining_group_table_finalize (&t);
4512 
4513   /* Offsets in t.result, in memory of this process.  */
4514   level1_offset =
4515     5 * sizeof (uint32_t);
4516   level2_offset =
4517     5 * sizeof (uint32_t)
4518     + t.level1_size * sizeof (uint32_t);
4519   level3_offset =
4520     5 * sizeof (uint32_t)
4521     + t.level1_size * sizeof (uint32_t)
4522     + (t.level2_size << t.q) * sizeof (uint32_t);
4523 
4524   for (i = 0; i < 5; i++)
4525     fprintf (stream, "#define joining_group_header_%d %d\n", i,
4526              ((uint32_t *) t.result)[i]);
4527   fprintf (stream, "static const\n");
4528   fprintf (stream, "struct\n");
4529   fprintf (stream, "  {\n");
4530   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
4531   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
4532   fprintf (stream, "    unsigned short level3[%zu * %d + 1];\n", t.level3_size,
4533            (1 << t.p) * 7 / 16);
4534   fprintf (stream, "  }\n");
4535   fprintf (stream, "u_joining_group =\n");
4536   fprintf (stream, "{\n");
4537   fprintf (stream, "  {");
4538   if (t.level1_size > 8)
4539     fprintf (stream, "\n   ");
4540   for (i = 0; i < t.level1_size; i++)
4541     {
4542       uint32_t offset;
4543       if (i > 0 && (i % 8) == 0)
4544         fprintf (stream, "\n   ");
4545       offset = ((uint32_t *) (t.result + level1_offset))[i];
4546       if (offset == 0)
4547         fprintf (stream, " %5d", -1);
4548       else
4549         fprintf (stream, " %5zu",
4550                  (offset - level2_offset) / sizeof (uint32_t));
4551       if (i+1 < t.level1_size)
4552         fprintf (stream, ",");
4553     }
4554   if (t.level1_size > 8)
4555     fprintf (stream, "\n ");
4556   fprintf (stream, " },\n");
4557   fprintf (stream, "  {");
4558   if (t.level2_size << t.q > 8)
4559     fprintf (stream, "\n   ");
4560   for (i = 0; i < t.level2_size << t.q; i++)
4561     {
4562       uint32_t offset;
4563       if (i > 0 && (i % 8) == 0)
4564         fprintf (stream, "\n   ");
4565       offset = ((uint32_t *) (t.result + level2_offset))[i];
4566       if (offset == 0)
4567         fprintf (stream, " %5d", -1);
4568       else
4569         fprintf (stream, " %5zu",
4570                  (offset - level3_offset) / sizeof (uint8_t));
4571       if (i+1 < t.level2_size << t.q)
4572         fprintf (stream, ",");
4573     }
4574   if (t.level2_size << t.q > 8)
4575     fprintf (stream, "\n ");
4576   fprintf (stream, " },\n");
4577   /* Pack the level3 array.  Each entry needs 7 bits only.  Use 16-bit units,
4578      not 32-bit units, in order to make the lookup function easier.  */
4579   level3_packed =
4580     (uint16_t *)
4581     calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
4582   for (i = 0; i < t.level3_size << t.p; i++)
4583     {
4584       unsigned int j = (i * 7) / 16;
4585       unsigned int k = (i * 7) % 16;
4586       uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
4587       value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
4588       level3_packed[j] = value & 0xffff;
4589       level3_packed[j+1] = value >> 16;
4590     }
4591   fprintf (stream, "  {");
4592   if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4593     fprintf (stream, "\n   ");
4594   for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
4595     {
4596       if (i > 0 && (i % 8) == 0)
4597         fprintf (stream, "\n   ");
4598       fprintf (stream, " 0x%04x", level3_packed[i]);
4599       if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
4600         fprintf (stream, ",");
4601     }
4602   if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
4603     fprintf (stream, "\n ");
4604   fprintf (stream, " }\n");
4605   free (level3_packed);
4606   fprintf (stream, "};\n");
4607 
4608   if (ferror (stream) || fclose (stream))
4609     {
4610       fprintf (stderr, "error writing to '%s'\n", filename);
4611       exit (1);
4612     }
4613 }
4614 
4615 /* ========================================================================= */
4616 
4617 /* Scripts.  */
4618 
/* Names of the scripts encountered by fill_scripts, in order of first
   appearance.  The index of a name is the number stored for that script in
   unicode_scripts[].  */
static const char *scripts[256];
/* Number of entries of scripts[] in use.  */
static unsigned int numscripts;

/* For each code point, the index of its script in scripts[], or
   (uint8_t)~(uint8_t)0 if fill_scripts assigned none.  */
static uint8_t unicode_scripts[0x110000];

/* Read SCRIPTS_FILENAME and fill scripts[], numscripts and
   unicode_scripts[] from it.
   Lines that start with '#' are skipped.  Every other line must start with
   "XXXX" or "XXXX..YYYY" (hexadecimal code points), followed by spaces
   and/or semicolons and then a script name that extends up to the next
   space.
   A code point that is assigned more than once is reported on stderr and
   keeps the last assignment.  A line that cannot be parsed, a failure to
   open or read the file, or memory exhaustion terminates the program with
   exit status 1.  */
static void
fill_scripts (const char *scripts_filename)
{
  FILE *stream;
  unsigned int i;

  stream = fopen (scripts_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
      exit (1);
    }

  numscripts = 0;

  /* Mark every code point as not yet assigned.  */
  for (i = 0; i < 0x110000; i++)
    unicode_scripts[i] = (uint8_t)~(uint8_t)0;

  for (;;)
    {
      char buf[200+1];
      unsigned int i1, i2;
      char padding[200+1];
      char scriptname[200+1];
      int script;

      /* Read at most 200 characters of one line; the trailing "\n"
         directive then consumes the newline and any following whitespace.
         PADDING and SCRIPTNAME are as large as BUF, so the unbounded scan
         sets below cannot overflow them.  */
      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Try the range form first, then the single code point form.  */
      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
        {
          if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
            {
              fprintf (stderr, "parse error in '%s'\n", scripts_filename);
              exit (1);
            }
          i2 = i1;
        }
      assert (i2 >= i1);
      assert (i2 < 0x110000);

      /* Look the name up among the scripts seen so far, newest first.  */
      for (script = numscripts - 1; script >= 0; script--)
        if (strcmp (scripts[script], scriptname) == 0)
          break;
      if (script < 0)
        {
          /* New script: store a private copy of its name.  The copy is made
             with ISO C functions only and must succeed, because the name is
             passed to strcmp for every later line.  */
          size_t namesize = strlen (scriptname) + 1;
          char *name = malloc (namesize);

          if (name == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          memcpy (name, scriptname, namesize);

          scripts[numscripts] = name;
          script = numscripts;
          numscripts++;
          /* Index 255 is the "no script" marker and scripts[] has 256
             slots, so at most 255 scripts are supported.  */
          assert (numscripts != 256);
        }

      for (i = i1; i <= i2; i++)
        {
          if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
            fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
          unicode_scripts[i] = script;
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", scripts_filename);
      exit (1);
    }
}
4693 
4694 /* Construction of sparse 3-level tables.  */
4695 #define TABLE script_table
4696 #define ELEMENT uint8_t
4697 #define DEFAULT (uint8_t)~(uint8_t)0
4698 #define xmalloc malloc
4699 #define xrealloc realloc
4700 #include "3level.h"
4701 
4702 static void
4703 output_scripts (const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
4704 {
4705   const char *filename = "unictype/scripts.h";
4706   FILE *stream;
4707   unsigned int ch, s, i;
4708   struct script_table t;
4709   unsigned int level1_offset, level2_offset, level3_offset;
4710 
4711   typedef struct
4712   {
4713     const char *lowercase_name;
4714   }
4715   scriptinfo_t;
4716   scriptinfo_t scriptinfo[256];
4717 
4718   stream = fopen (filename, "w");
4719   if (stream == NULL)
4720     {
4721       fprintf (stderr, "cannot open '%s' for writing\n", filename);
4722       exit (1);
4723     }
4724 
4725   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4726   fprintf (stream, "/* Unicode scripts.  */\n");
4727   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
4728            version);
4729   fprintf (stream, "\n");
4730 
4731   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4732   fprintf (stream, "\n");
4733   output_library_license (stream, true);
4734   fprintf (stream, "\n");
4735 
4736   for (s = 0; s < numscripts; s++)
4737     {
4738       char *lcp = strdup (scripts[s]);
4739       char *cp;
4740 
4741       for (cp = lcp; *cp != '\0'; cp++)
4742         if (*cp >= 'A' && *cp <= 'Z')
4743           *cp += 'a' - 'A';
4744 
4745       scriptinfo[s].lowercase_name = lcp;
4746     }
4747 
4748   for (s = 0; s < numscripts; s++)
4749     {
4750       fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
4751                scriptinfo[s].lowercase_name);
4752       fprintf (stream, "{\n");
4753       i = 0;
4754       for (ch = 0; ch < 0x110000; ch++)
4755         if (unicode_scripts[ch] == s)
4756           {
4757             unsigned int start;
4758             unsigned int end;
4759 
4760             start = ch;
4761             while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
4762               ch++;
4763             end = ch;
4764 
4765             if (i > 0)
4766               fprintf (stream, ",\n");
4767             if (start == end)
4768               fprintf (stream, "  { 0x%04X, 1, 1 }", start);
4769             else
4770               fprintf (stream, "  { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
4771                        start, end);
4772             i++;
4773           }
4774       fprintf (stream, "\n");
4775       fprintf (stream, "};\n");
4776     }
4777 
4778   fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
4779   fprintf (stream, "{\n");
4780   for (s = 0; s < numscripts; s++)
4781     {
4782       fprintf (stream, "  {\n");
4783       fprintf (stream, "    sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
4784                scriptinfo[s].lowercase_name);
4785       fprintf (stream, "    script_%s_intervals,\n",
4786                scriptinfo[s].lowercase_name);
4787       fprintf (stream, "    \"%s\"\n", scripts[s]);
4788       fprintf (stream, "  }");
4789       if (s+1 < numscripts)
4790         fprintf (stream, ",");
4791       fprintf (stream, "\n");
4792     }
4793   fprintf (stream, "};\n");
4794 
4795   t.p = 7;
4796   t.q = 9;
4797   script_table_init (&t);
4798 
4799   for (ch = 0; ch < 0x110000; ch++)
4800     {
4801       unsigned int s = unicode_scripts[ch];
4802       if (s != (uint8_t)~(uint8_t)0)
4803         script_table_add (&t, ch, s);
4804     }
4805 
4806   script_table_finalize (&t);
4807 
4808   /* Offsets in t.result, in memory of this process.  */
4809   level1_offset =
4810     5 * sizeof (uint32_t);
4811   level2_offset =
4812     5 * sizeof (uint32_t)
4813     + t.level1_size * sizeof (uint32_t);
4814   level3_offset =
4815     5 * sizeof (uint32_t)
4816     + t.level1_size * sizeof (uint32_t)
4817     + (t.level2_size << t.q) * sizeof (uint32_t);
4818 
4819   for (i = 0; i < 5; i++)
4820     fprintf (stream, "#define script_header_%d %d\n", i,
4821              ((uint32_t *) t.result)[i]);
4822   fprintf (stream, "static const\n");
4823   fprintf (stream, "struct\n");
4824   fprintf (stream, "  {\n");
4825   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
4826   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
4827   fprintf (stream, "    unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
4828   fprintf (stream, "  }\n");
4829   fprintf (stream, "u_script =\n");
4830   fprintf (stream, "{\n");
4831   fprintf (stream, "  {");
4832   if (t.level1_size > 8)
4833     fprintf (stream, "\n   ");
4834   for (i = 0; i < t.level1_size; i++)
4835     {
4836       uint32_t offset;
4837       if (i > 0 && (i % 8) == 0)
4838         fprintf (stream, "\n   ");
4839       offset = ((uint32_t *) (t.result + level1_offset))[i];
4840       if (offset == 0)
4841         fprintf (stream, " %5d", -1);
4842       else
4843         fprintf (stream, " %5zu",
4844                  (offset - level2_offset) / sizeof (uint32_t));
4845       if (i+1 < t.level1_size)
4846         fprintf (stream, ",");
4847     }
4848   if (t.level1_size > 8)
4849     fprintf (stream, "\n ");
4850   fprintf (stream, " },\n");
4851   fprintf (stream, "  {");
4852   if (t.level2_size << t.q > 8)
4853     fprintf (stream, "\n   ");
4854   for (i = 0; i < t.level2_size << t.q; i++)
4855     {
4856       uint32_t offset;
4857       if (i > 0 && (i % 8) == 0)
4858         fprintf (stream, "\n   ");
4859       offset = ((uint32_t *) (t.result + level2_offset))[i];
4860       if (offset == 0)
4861         fprintf (stream, " %5d", -1);
4862       else
4863         fprintf (stream, " %5zu",
4864                  (offset - level3_offset) / sizeof (uint8_t));
4865       if (i+1 < t.level2_size << t.q)
4866         fprintf (stream, ",");
4867     }
4868   if (t.level2_size << t.q > 8)
4869     fprintf (stream, "\n ");
4870   fprintf (stream, " },\n");
4871   fprintf (stream, "  {");
4872   if (t.level3_size << t.p > 8)
4873     fprintf (stream, "\n   ");
4874   for (i = 0; i < t.level3_size << t.p; i++)
4875     {
4876       if (i > 0 && (i % 8) == 0)
4877         fprintf (stream, "\n   ");
4878       fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
4879       if (i+1 < t.level3_size << t.p)
4880         fprintf (stream, ",");
4881     }
4882   if (t.level3_size << t.p > 8)
4883     fprintf (stream, "\n ");
4884   fprintf (stream, " }\n");
4885   fprintf (stream, "};\n");
4886 
4887   if (ferror (stream) || fclose (stream))
4888     {
4889       fprintf (stderr, "error writing to '%s'\n", filename);
4890       exit (1);
4891     }
4892 }
4893 
4894 static void
4895 output_scripts_byname (const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
4896 {
4897   const char *filename = "unictype/scripts_byname.gperf";
4898   FILE *stream;
4899   unsigned int s;
4900 
4901   stream = fopen (filename, "w");
4902   if (stream == NULL)
4903     {
4904       fprintf (stderr, "cannot open '%s' for writing\n", filename);
4905       exit (1);
4906     }
4907 
4908   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4909   fprintf (stream, "/* Unicode scripts.  */\n");
4910   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
4911            version);
4912   fprintf (stream, "\n");
4913 
4914   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
4915   fprintf (stream, "\n");
4916   output_library_license (stream, true);
4917   fprintf (stream, "\n");
4918 
4919   fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
4920   fprintf (stream, "%%struct-type\n");
4921   fprintf (stream, "%%language=ANSI-C\n");
4922   fprintf (stream, "%%define hash-function-name scripts_hash\n");
4923   fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
4924   fprintf (stream, "%%readonly-tables\n");
4925   fprintf (stream, "%%global-table\n");
4926   fprintf (stream, "%%define word-array-name script_names\n");
4927   fprintf (stream, "%%pic\n");
4928   fprintf (stream, "%%define string-pool-name script_stringpool\n");
4929   fprintf (stream, "%%%%\n");
4930   for (s = 0; s < numscripts; s++)
4931     fprintf (stream, "%s, %u\n", scripts[s], s);
4932 
4933   if (ferror (stream) || fclose (stream))
4934     {
4935       fprintf (stderr, "error writing to '%s'\n", filename);
4936       exit (1);
4937     }
4938 }
4939 
4940 /* ========================================================================= */
4941 
4942 /* Blocks.  */
4943 
/* A named range of code points, as listed in the blocks file.  */
typedef struct
{
  unsigned int start;   /* first code point of the block */
  unsigned int end;     /* last code point of the block */
  const char *name;     /* name of the block */
}
block_t;

/* The table of blocks, and the number of its entries that are in use.  */
static block_t blocks[384];
static unsigned int numblocks;
4948 
4949 static void
4950 fill_blocks (const char *blocks_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
4951 {
4952   FILE *stream;
4953 
4954   stream = fopen (blocks_filename, "r");
4955   if (stream == NULL)
4956     {
4957       fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
4958       exit (1);
4959     }
4960 
4961   for (;;)
4962     {
4963       char buf[200+1];
4964       unsigned int i1, i2;
4965       char padding[200+1];
4966       char blockname[200+1];
4967 
4968       if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4969         break;
4970 
4971       if (buf[0] == '\0' || buf[0] == '#')
4972         continue;
4973 
4974       if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
4975         {
4976           fprintf (stderr, "parse error in '%s'\n", blocks_filename);
4977           exit (1);
4978         }
4979       blocks[numblocks].start = i1;
4980       blocks[numblocks].end = i2;
4981       blocks[numblocks].name = strdup (blockname);
4982       /* It must be sorted.  */
4983       assert (numblocks == 0 || blocks[numblocks-1].end < blocks[numblocks].start);
4984       numblocks++;
4985       assert (numblocks != SIZEOF (blocks));
4986     }
4987 
4988   if (ferror (stream) || fclose (stream))
4989     {
4990       fprintf (stderr, "error reading from '%s'\n", blocks_filename);
4991       exit (1);
4992     }
4993 }
4994 
4995 /* Return the smallest block index among the blocks for characters >= ch.  */
4996 static unsigned int
4997 block_first_index (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
4998 {
4999   /* Binary search.  */
5000   unsigned int lo = 0;
5001   unsigned int hi = numblocks;
5002   /* Invariants:
5003      All blocks[i], i < lo, have blocks[i].end < ch,
5004      all blocks[i], i >= hi, have blocks[i].end >= ch.  */
5005   while (lo < hi)
5006     {
5007       unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
5008       if (blocks[mid].end < ch)
5009         lo = mid + 1;
5010       else
5011         hi = mid;
5012     }
5013   return hi;
5014 }
5015 
5016 /* Return the largest block index among the blocks for characters <= ch,
5017    plus 1.  */
5018 static unsigned int
5019 block_last_index (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5020 {
5021   /* Binary search.  */
5022   unsigned int lo = 0;
5023   unsigned int hi = numblocks;
5024   /* Invariants:
5025      All blocks[i], i < lo, have blocks[i].start <= ch,
5026      all blocks[i], i >= hi, have blocks[i].start > ch.  */
5027   while (lo < hi)
5028     {
5029       unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
5030       if (blocks[mid].start <= ch)
5031         lo = mid + 1;
5032       else
5033         hi = mid;
5034     }
5035   return hi;
5036 }
5037 
5038 static void
5039 output_blocks (const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
5040 {
5041   const char *filename = "unictype/blocks.h";
5042   const unsigned int shift = 8; /* bits to shift away for array access */
5043   const unsigned int threshold = 0x28000; /* cut-off table here to save space */
5044   FILE *stream;
5045   unsigned int i;
5046   unsigned int i1;
5047 
5048   stream = fopen (filename, "w");
5049   if (stream == NULL)
5050     {
5051       fprintf (stderr, "cannot open '%s' for writing\n", filename);
5052       exit (1);
5053     }
5054 
5055   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5056   fprintf (stream, "/* Unicode blocks.  */\n");
5057   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
5058            version);
5059   fprintf (stream, "\n");
5060 
5061   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
5062   fprintf (stream, "\n");
5063   output_library_license (stream, false);
5064   fprintf (stream, "\n");
5065 
5066   fprintf (stream, "static const uc_block_t blocks[] =\n");
5067   fprintf (stream, "{\n");
5068   for (i = 0; i < numblocks; i++)
5069     {
5070       fprintf (stream, "  { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
5071                blocks[i].end, blocks[i].name);
5072       if (i+1 < numblocks)
5073         fprintf (stream, ",");
5074       fprintf (stream, "\n");
5075     }
5076   fprintf (stream, "};\n");
5077   fprintf (stream, "#define blocks_level1_shift %d\n", shift);
5078   fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
5079   fprintf (stream, "static const uint16_t blocks_level1[%d * 2] =\n",
5080            threshold >> shift);
5081   fprintf (stream, "{\n");
5082   for (i1 = 0; i1 < (threshold >> shift); i1++)
5083     {
5084       unsigned int first_index = block_first_index (i1 << shift);
5085       unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
5086       fprintf (stream, "  %3d, %3d", first_index, last_index);
5087       if (i1+1 < (threshold >> shift))
5088         fprintf (stream, ",");
5089       fprintf (stream, "\n");
5090     }
5091   fprintf (stream, "};\n");
5092   fprintf (stream, "#define blocks_upper_first_index %d\n",
5093            block_first_index (threshold));
5094   fprintf (stream, "#define blocks_upper_last_index %d\n",
5095            block_last_index (0x10FFFF));
5096 
5097   if (ferror (stream) || fclose (stream))
5098     {
5099       fprintf (stderr, "error writing to '%s'\n", filename);
5100       exit (1);
5101     }
5102 }
5103 
5104 /* ========================================================================= */
5105 
5106 /* C and Java syntax.  */
5107 
/* Classification of a character with respect to identifier syntax.  */
enum
{
  /* valid as first or subsequent character */
  UC_IDENTIFIER_START = 0,
  /* valid as subsequent character only */
  UC_IDENTIFIER_VALID = 1,
  /* not valid */
  UC_IDENTIFIER_INVALID = 2,
  /* ignorable (Java only) */
  UC_IDENTIFIER_IGNORABLE = 3
};
5115 
/* Tell whether CH is a white-space character in the sense of
   ISO C 99 section 6.4.(3).  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':   /* space */
    case '\t':  /* horizontal tab */
    case '\n':  /* new-line */
    case '\r':  /* new-line */
    case '\v':  /* vertical tab */
    case '\f':  /* form-feed */
      return true;
    default:
      return false;
    }
}
5126 
5127 /* ISO C 99 section 6.4.2.1 and appendix D.  */
5128 static int
5129 c_ident_category (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5130 {
5131   /* Section 6.4.2.1.  */
5132   if (ch >= '0' && ch <= '9')
5133     return UC_IDENTIFIER_VALID;
5134   if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
5135     return UC_IDENTIFIER_START;
5136   /* Appendix D.  */
5137   if (0
5138       /* Latin */
5139       || (ch == 0x00AA)
5140       || (ch == 0x00BA)
5141       || (ch >= 0x00C0 && ch <= 0x00D6)
5142       || (ch >= 0x00D8 && ch <= 0x00F6)
5143       || (ch >= 0x00F8 && ch <= 0x01F5)
5144       || (ch >= 0x01FA && ch <= 0x0217)
5145       || (ch >= 0x0250 && ch <= 0x02A8)
5146       || (ch >= 0x1E00 && ch <= 0x1E9B)
5147       || (ch >= 0x1EA0 && ch <= 0x1EF9)
5148       || (ch == 0x207F)
5149       /* Greek */
5150       || (ch == 0x0386)
5151       || (ch >= 0x0388 && ch <= 0x038A)
5152       || (ch == 0x038C)
5153       || (ch >= 0x038E && ch <= 0x03A1)
5154       || (ch >= 0x03A3 && ch <= 0x03CE)
5155       || (ch >= 0x03D0 && ch <= 0x03D6)
5156       || (ch == 0x03DA)
5157       || (ch == 0x03DC)
5158       || (ch == 0x03DE)
5159       || (ch == 0x03E0)
5160       || (ch >= 0x03E2 && ch <= 0x03F3)
5161       || (ch >= 0x1F00 && ch <= 0x1F15)
5162       || (ch >= 0x1F18 && ch <= 0x1F1D)
5163       || (ch >= 0x1F20 && ch <= 0x1F45)
5164       || (ch >= 0x1F48 && ch <= 0x1F4D)
5165       || (ch >= 0x1F50 && ch <= 0x1F57)
5166       || (ch == 0x1F59)
5167       || (ch == 0x1F5B)
5168       || (ch == 0x1F5D)
5169       || (ch >= 0x1F5F && ch <= 0x1F7D)
5170       || (ch >= 0x1F80 && ch <= 0x1FB4)
5171       || (ch >= 0x1FB6 && ch <= 0x1FBC)
5172       || (ch >= 0x1FC2 && ch <= 0x1FC4)
5173       || (ch >= 0x1FC6 && ch <= 0x1FCC)
5174       || (ch >= 0x1FD0 && ch <= 0x1FD3)
5175       || (ch >= 0x1FD6 && ch <= 0x1FDB)
5176       || (ch >= 0x1FE0 && ch <= 0x1FEC)
5177       || (ch >= 0x1FF2 && ch <= 0x1FF4)
5178       || (ch >= 0x1FF6 && ch <= 0x1FFC)
5179       /* Cyrillic */
5180       || (ch >= 0x0401 && ch <= 0x040C)
5181       || (ch >= 0x040E && ch <= 0x044F)
5182       || (ch >= 0x0451 && ch <= 0x045C)
5183       || (ch >= 0x045E && ch <= 0x0481)
5184       || (ch >= 0x0490 && ch <= 0x04C4)
5185       || (ch >= 0x04C7 && ch <= 0x04C8)
5186       || (ch >= 0x04CB && ch <= 0x04CC)
5187       || (ch >= 0x04D0 && ch <= 0x04EB)
5188       || (ch >= 0x04EE && ch <= 0x04F5)
5189       || (ch >= 0x04F8 && ch <= 0x04F9)
5190       /* Armenian */
5191       || (ch >= 0x0531 && ch <= 0x0556)
5192       || (ch >= 0x0561 && ch <= 0x0587)
5193       /* Hebrew */
5194       || (ch >= 0x05B0 && ch <= 0x05B9)
5195       || (ch >= 0x05BB && ch <= 0x05BD)
5196       || (ch == 0x05BF)
5197       || (ch >= 0x05C1 && ch <= 0x05C2)
5198       || (ch >= 0x05D0 && ch <= 0x05EA)
5199       || (ch >= 0x05F0 && ch <= 0x05F2)
5200       /* Arabic */
5201       || (ch >= 0x0621 && ch <= 0x063A)
5202       || (ch >= 0x0640 && ch <= 0x0652)
5203       || (ch >= 0x0670 && ch <= 0x06B7)
5204       || (ch >= 0x06BA && ch <= 0x06BE)
5205       || (ch >= 0x06C0 && ch <= 0x06CE)
5206       || (ch >= 0x06D0 && ch <= 0x06DC)
5207       || (ch >= 0x06E5 && ch <= 0x06E8)
5208       || (ch >= 0x06EA && ch <= 0x06ED)
5209       /* Devanagari */
5210       || (ch >= 0x0901 && ch <= 0x0903)
5211       || (ch >= 0x0905 && ch <= 0x0939)
5212       || (ch >= 0x093E && ch <= 0x094D)
5213       || (ch >= 0x0950 && ch <= 0x0952)
5214       || (ch >= 0x0958 && ch <= 0x0963)
5215       /* Bengali */
5216       || (ch >= 0x0981 && ch <= 0x0983)
5217       || (ch >= 0x0985 && ch <= 0x098C)
5218       || (ch >= 0x098F && ch <= 0x0990)
5219       || (ch >= 0x0993 && ch <= 0x09A8)
5220       || (ch >= 0x09AA && ch <= 0x09B0)
5221       || (ch == 0x09B2)
5222       || (ch >= 0x09B6 && ch <= 0x09B9)
5223       || (ch >= 0x09BE && ch <= 0x09C4)
5224       || (ch >= 0x09C7 && ch <= 0x09C8)
5225       || (ch >= 0x09CB && ch <= 0x09CD)
5226       || (ch >= 0x09DC && ch <= 0x09DD)
5227       || (ch >= 0x09DF && ch <= 0x09E3)
5228       || (ch >= 0x09F0 && ch <= 0x09F1)
5229       /* Gurmukhi */
5230       || (ch == 0x0A02)
5231       || (ch >= 0x0A05 && ch <= 0x0A0A)
5232       || (ch >= 0x0A0F && ch <= 0x0A10)
5233       || (ch >= 0x0A13 && ch <= 0x0A28)
5234       || (ch >= 0x0A2A && ch <= 0x0A30)
5235       || (ch >= 0x0A32 && ch <= 0x0A33)
5236       || (ch >= 0x0A35 && ch <= 0x0A36)
5237       || (ch >= 0x0A38 && ch <= 0x0A39)
5238       || (ch >= 0x0A3E && ch <= 0x0A42)
5239       || (ch >= 0x0A47 && ch <= 0x0A48)
5240       || (ch >= 0x0A4B && ch <= 0x0A4D)
5241       || (ch >= 0x0A59 && ch <= 0x0A5C)
5242       || (ch == 0x0A5E)
5243       || (ch == 0x0A74)
5244       /* Gujarati */
5245       || (ch >= 0x0A81 && ch <= 0x0A83)
5246       || (ch >= 0x0A85 && ch <= 0x0A8B)
5247       || (ch == 0x0A8D)
5248       || (ch >= 0x0A8F && ch <= 0x0A91)
5249       || (ch >= 0x0A93 && ch <= 0x0AA8)
5250       || (ch >= 0x0AAA && ch <= 0x0AB0)
5251       || (ch >= 0x0AB2 && ch <= 0x0AB3)
5252       || (ch >= 0x0AB5 && ch <= 0x0AB9)
5253       || (ch >= 0x0ABD && ch <= 0x0AC5)
5254       || (ch >= 0x0AC7 && ch <= 0x0AC9)
5255       || (ch >= 0x0ACB && ch <= 0x0ACD)
5256       || (ch == 0x0AD0)
5257       || (ch == 0x0AE0)
5258       /* Oriya */
5259       || (ch >= 0x0B01 && ch <= 0x0B03)
5260       || (ch >= 0x0B05 && ch <= 0x0B0C)
5261       || (ch >= 0x0B0F && ch <= 0x0B10)
5262       || (ch >= 0x0B13 && ch <= 0x0B28)
5263       || (ch >= 0x0B2A && ch <= 0x0B30)
5264       || (ch >= 0x0B32 && ch <= 0x0B33)
5265       || (ch >= 0x0B36 && ch <= 0x0B39)
5266       || (ch >= 0x0B3E && ch <= 0x0B43)
5267       || (ch >= 0x0B47 && ch <= 0x0B48)
5268       || (ch >= 0x0B4B && ch <= 0x0B4D)
5269       || (ch >= 0x0B5C && ch <= 0x0B5D)
5270       || (ch >= 0x0B5F && ch <= 0x0B61)
5271       /* Tamil */
5272       || (ch >= 0x0B82 && ch <= 0x0B83)
5273       || (ch >= 0x0B85 && ch <= 0x0B8A)
5274       || (ch >= 0x0B8E && ch <= 0x0B90)
5275       || (ch >= 0x0B92 && ch <= 0x0B95)
5276       || (ch >= 0x0B99 && ch <= 0x0B9A)
5277       || (ch == 0x0B9C)
5278       || (ch >= 0x0B9E && ch <= 0x0B9F)
5279       || (ch >= 0x0BA3 && ch <= 0x0BA4)
5280       || (ch >= 0x0BA8 && ch <= 0x0BAA)
5281       || (ch >= 0x0BAE && ch <= 0x0BB5)
5282       || (ch >= 0x0BB7 && ch <= 0x0BB9)
5283       || (ch >= 0x0BBE && ch <= 0x0BC2)
5284       || (ch >= 0x0BC6 && ch <= 0x0BC8)
5285       || (ch >= 0x0BCA && ch <= 0x0BCD)
5286       /* Telugu */
5287       || (ch >= 0x0C01 && ch <= 0x0C03)
5288       || (ch >= 0x0C05 && ch <= 0x0C0C)
5289       || (ch >= 0x0C0E && ch <= 0x0C10)
5290       || (ch >= 0x0C12 && ch <= 0x0C28)
5291       || (ch >= 0x0C2A && ch <= 0x0C33)
5292       || (ch >= 0x0C35 && ch <= 0x0C39)
5293       || (ch >= 0x0C3E && ch <= 0x0C44)
5294       || (ch >= 0x0C46 && ch <= 0x0C48)
5295       || (ch >= 0x0C4A && ch <= 0x0C4D)
5296       || (ch >= 0x0C60 && ch <= 0x0C61)
5297       /* Kannada */
5298       || (ch >= 0x0C82 && ch <= 0x0C83)
5299       || (ch >= 0x0C85 && ch <= 0x0C8C)
5300       || (ch >= 0x0C8E && ch <= 0x0C90)
5301       || (ch >= 0x0C92 && ch <= 0x0CA8)
5302       || (ch >= 0x0CAA && ch <= 0x0CB3)
5303       || (ch >= 0x0CB5 && ch <= 0x0CB9)
5304       || (ch >= 0x0CBE && ch <= 0x0CC4)
5305       || (ch >= 0x0CC6 && ch <= 0x0CC8)
5306       || (ch >= 0x0CCA && ch <= 0x0CCD)
5307       || (ch == 0x0CDE)
5308       || (ch >= 0x0CE0 && ch <= 0x0CE1)
5309       /* Malayalam */
5310       || (ch >= 0x0D02 && ch <= 0x0D03)
5311       || (ch >= 0x0D05 && ch <= 0x0D0C)
5312       || (ch >= 0x0D0E && ch <= 0x0D10)
5313       || (ch >= 0x0D12 && ch <= 0x0D28)
5314       || (ch >= 0x0D2A && ch <= 0x0D39)
5315       || (ch >= 0x0D3E && ch <= 0x0D43)
5316       || (ch >= 0x0D46 && ch <= 0x0D48)
5317       || (ch >= 0x0D4A && ch <= 0x0D4D)
5318       || (ch >= 0x0D60 && ch <= 0x0D61)
5319       /* Thai */
5320       || (ch >= 0x0E01 && ch <= 0x0E3A)
5321       || (ch >= 0x0E40 && ch <= 0x0E5B)
5322       /* Lao */
5323       || (ch >= 0x0E81 && ch <= 0x0E82)
5324       || (ch == 0x0E84)
5325       || (ch >= 0x0E87 && ch <= 0x0E88)
5326       || (ch == 0x0E8A)
5327       || (ch == 0x0E8D)
5328       || (ch >= 0x0E94 && ch <= 0x0E97)
5329       || (ch >= 0x0E99 && ch <= 0x0E9F)
5330       || (ch >= 0x0EA1 && ch <= 0x0EA3)
5331       || (ch == 0x0EA5)
5332       || (ch == 0x0EA7)
5333       || (ch >= 0x0EAA && ch <= 0x0EAB)
5334       || (ch >= 0x0EAD && ch <= 0x0EAE)
5335       || (ch >= 0x0EB0 && ch <= 0x0EB9)
5336       || (ch >= 0x0EBB && ch <= 0x0EBD)
5337       || (ch >= 0x0EC0 && ch <= 0x0EC4)
5338       || (ch == 0x0EC6)
5339       || (ch >= 0x0EC8 && ch <= 0x0ECD)
5340       || (ch >= 0x0EDC && ch <= 0x0EDD)
5341       /* Tibetan */
5342       || (ch == 0x0F00)
5343       || (ch >= 0x0F18 && ch <= 0x0F19)
5344       || (ch == 0x0F35)
5345       || (ch == 0x0F37)
5346       || (ch == 0x0F39)
5347       || (ch >= 0x0F3E && ch <= 0x0F47)
5348       || (ch >= 0x0F49 && ch <= 0x0F69)
5349       || (ch >= 0x0F71 && ch <= 0x0F84)
5350       || (ch >= 0x0F86 && ch <= 0x0F8B)
5351       || (ch >= 0x0F90 && ch <= 0x0F95)
5352       || (ch == 0x0F97)
5353       || (ch >= 0x0F99 && ch <= 0x0FAD)
5354       || (ch >= 0x0FB1 && ch <= 0x0FB7)
5355       || (ch == 0x0FB9)
5356       /* Georgian */
5357       || (ch >= 0x10A0 && ch <= 0x10C5)
5358       || (ch >= 0x10D0 && ch <= 0x10F6)
5359       /* Hiragana */
5360       || (ch >= 0x3041 && ch <= 0x3093)
5361       || (ch >= 0x309B && ch <= 0x309C)
5362       /* Katakana */
5363       || (ch >= 0x30A1 && ch <= 0x30F6)
5364       || (ch >= 0x30FB && ch <= 0x30FC)
5365       /* Bopomofo */
5366       || (ch >= 0x3105 && ch <= 0x312C)
5367       /* CJK Unified Ideographs */
5368       || (ch >= 0x4E00 && ch <= 0x9FA5)
5369       /* Hangul */
5370       || (ch >= 0xAC00 && ch <= 0xD7A3)
5371       /* Digits */
5372       || (ch >= 0x0660 && ch <= 0x0669)
5373       || (ch >= 0x06F0 && ch <= 0x06F9)
5374       || (ch >= 0x0966 && ch <= 0x096F)
5375       || (ch >= 0x09E6 && ch <= 0x09EF)
5376       || (ch >= 0x0A66 && ch <= 0x0A6F)
5377       || (ch >= 0x0AE6 && ch <= 0x0AEF)
5378       || (ch >= 0x0B66 && ch <= 0x0B6F)
5379       || (ch >= 0x0BE7 && ch <= 0x0BEF)
5380       || (ch >= 0x0C66 && ch <= 0x0C6F)
5381       || (ch >= 0x0CE6 && ch <= 0x0CEF)
5382       || (ch >= 0x0D66 && ch <= 0x0D6F)
5383       || (ch >= 0x0E50 && ch <= 0x0E59)
5384       || (ch >= 0x0ED0 && ch <= 0x0ED9)
5385       || (ch >= 0x0F20 && ch <= 0x0F33)
5386       /* Special characters */
5387       || (ch == 0x00B5)
5388       || (ch == 0x00B7)
5389       || (ch >= 0x02B0 && ch <= 0x02B8)
5390       || (ch == 0x02BB)
5391       || (ch >= 0x02BD && ch <= 0x02C1)
5392       || (ch >= 0x02D0 && ch <= 0x02D1)
5393       || (ch >= 0x02E0 && ch <= 0x02E4)
5394       || (ch == 0x037A)
5395       || (ch == 0x0559)
5396       || (ch == 0x093D)
5397       || (ch == 0x0B3D)
5398       || (ch == 0x1FBE)
5399       || (ch >= 0x203F && ch <= 0x2040)
5400       || (ch == 0x2102)
5401       || (ch == 0x2107)
5402       || (ch >= 0x210A && ch <= 0x2113)
5403       || (ch == 0x2115)
5404       || (ch >= 0x2118 && ch <= 0x211D)
5405       || (ch == 0x2124)
5406       || (ch == 0x2126)
5407       || (ch == 0x2128)
5408       || (ch >= 0x212A && ch <= 0x2131)
5409       || (ch >= 0x2133 && ch <= 0x2138)
5410       || (ch >= 0x2160 && ch <= 0x2182)
5411       || (ch >= 0x3005 && ch <= 0x3007)
5412       || (ch >= 0x3021 && ch <= 0x3029)
5413      )
5414     return UC_IDENTIFIER_START;
5415   return UC_IDENTIFIER_INVALID;
5416 }
5417 
/* Tell whether CH is a white-space character of the Java language.
   The Java Language Specification, 3rd edition, §3.6.
   https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.6  */
static bool
is_java_whitespace (unsigned int ch)
{
  /* Space, horizontal tab, form feed.  */
  if (ch == ' ' || ch == '\t' || ch == '\f')
    return true;
  /* Line terminators.  */
  if (ch == '\n' || ch == '\r')
    return true;
  return false;
}
5426 
5427 /* The Java Language Specification, 3rd edition, §3.8.
5428    https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.8
5429    and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart  */
5430 static int
5431 java_ident_category (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5432 {
5433   /* FIXME: Check this against Sun's JDK implementation.  */
5434   if (is_category_L (ch) /* = Character.isLetter(ch) */
5435       || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
5436       || is_category_Sc (ch) /* currency symbol */
5437       || is_category_Pc (ch) /* connector punctuation */
5438      )
5439     return UC_IDENTIFIER_START;
5440   if (is_category_Nd (ch) /* digit */
5441       || is_category_Mc (ch) /* combining mark */
5442       || is_category_Mn (ch) /* non-spacing mark */
5443      )
5444     return UC_IDENTIFIER_VALID;
5445   if ((ch >= 0x0000 && ch <= 0x0008)
5446       || (ch >= 0x000E && ch <= 0x001B)
5447       || (ch >= 0x007F && ch <= 0x009F)
5448       || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
5449      )
5450     return UC_IDENTIFIER_IGNORABLE;
5451   return UC_IDENTIFIER_INVALID;
5452 }
5453 
5454 /* Construction of sparse 3-level tables.  */
5455 #define TABLE identsyntax_table
5456 #define ELEMENT uint8_t
5457 #define DEFAULT UC_IDENTIFIER_INVALID
5458 #define xmalloc malloc
5459 #define xrealloc realloc
5460 #include "3level.h"
5461 
5462 /* Output an identifier syntax categorization in a three-level bitmap.  */
5463 static void
5464 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
5465 {
5466   FILE *stream;
5467   unsigned int ch, i;
5468   struct identsyntax_table t;
5469   unsigned int level1_offset, level2_offset, level3_offset;
5470 
5471   stream = fopen (filename, "w");
5472   if (stream == NULL)
5473     {
5474       fprintf (stderr, "cannot open '%s' for writing\n", filename);
5475       exit (1);
5476     }
5477 
5478   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5479   fprintf (stream, "/* Language syntax properties of Unicode characters.  */\n");
5480   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
5481            version);
5482   fprintf (stream, "\n");
5483 
5484   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
5485   fprintf (stream, "\n");
5486   output_library_license (stream, false);
5487   fprintf (stream, "\n");
5488 
5489   t.p = 7; /* or 8 */
5490   t.q = 5; /* or 4 */
5491   identsyntax_table_init (&t);
5492 
5493   for (ch = 0; ch < 0x110000; ch++)
5494     {
5495       int syntaxcode = predicate (ch);
5496 
5497       assert (syntaxcode <= 0x03);
5498 
5499       if (syntaxcode != UC_IDENTIFIER_INVALID)
5500         identsyntax_table_add (&t, ch, syntaxcode);
5501     }
5502 
5503   identsyntax_table_finalize (&t);
5504 
5505   /* Offsets in t.result, in memory of this process.  */
5506   level1_offset =
5507     5 * sizeof (uint32_t);
5508   level2_offset =
5509     5 * sizeof (uint32_t)
5510     + t.level1_size * sizeof (uint32_t);
5511   level3_offset =
5512     5 * sizeof (uint32_t)
5513     + t.level1_size * sizeof (uint32_t)
5514     + (t.level2_size << t.q) * sizeof (uint32_t);
5515 
5516   for (i = 0; i < 5; i++)
5517     fprintf (stream, "#define identsyntax_header_%d %d\n", i,
5518              ((uint32_t *) t.result)[i]);
5519   fprintf (stream, "static const\n");
5520   fprintf (stream, "struct\n");
5521   fprintf (stream, "  {\n");
5522   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
5523   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
5524   fprintf (stream, "    unsigned short level3[%zu * %d];\n", t.level3_size,
5525            (1 << t.p) * 2 / 16);
5526   fprintf (stream, "  }\n");
5527   fprintf (stream, "%s =\n", name);
5528   fprintf (stream, "{\n");
5529   fprintf (stream, "  {");
5530   if (t.level1_size > 8)
5531     fprintf (stream, "\n   ");
5532   for (i = 0; i < t.level1_size; i++)
5533     {
5534       uint32_t offset;
5535       if (i > 0 && (i % 8) == 0)
5536         fprintf (stream, "\n   ");
5537       offset = ((uint32_t *) (t.result + level1_offset))[i];
5538       if (offset == 0)
5539         fprintf (stream, " %5d", -1);
5540       else
5541         fprintf (stream, " %5zu",
5542                  (offset - level2_offset) / sizeof (uint32_t));
5543       if (i+1 < t.level1_size)
5544         fprintf (stream, ",");
5545     }
5546   if (t.level1_size > 8)
5547     fprintf (stream, "\n ");
5548   fprintf (stream, " },\n");
5549   fprintf (stream, "  {");
5550   if (t.level2_size << t.q > 8)
5551     fprintf (stream, "\n   ");
5552   for (i = 0; i < t.level2_size << t.q; i++)
5553     {
5554       uint32_t offset;
5555       if (i > 0 && (i % 8) == 0)
5556         fprintf (stream, "\n   ");
5557       offset = ((uint32_t *) (t.result + level2_offset))[i];
5558       if (offset == 0)
5559         fprintf (stream, " %5d", -1);
5560       else
5561         fprintf (stream, " %5zu",
5562                  (offset - level3_offset) / sizeof (uint8_t));
5563       if (i+1 < t.level2_size << t.q)
5564         fprintf (stream, ",");
5565     }
5566   if (t.level2_size << t.q > 8)
5567     fprintf (stream, "\n ");
5568   fprintf (stream, " },\n");
5569   /* Pack the level3 array.  Each entry needs 2 bits only.  */
5570   fprintf (stream, "  {");
5571   if ((t.level3_size << t.p) * 2 / 16 > 8)
5572     fprintf (stream, "\n   ");
5573   for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
5574     {
5575       if (i > 0 && (i % 8) == 0)
5576         fprintf (stream, "\n   ");
5577       fprintf (stream, " 0x%04x",
5578                (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
5579                | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
5580                | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
5581                | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
5582                | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
5583                | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
5584                | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
5585                | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
5586       if (i+1 < (t.level3_size << t.p) * 2 / 16)
5587         fprintf (stream, ",");
5588     }
5589   if ((t.level3_size << t.p) * 2 / 16 > 8)
5590     fprintf (stream, "\n ");
5591   fprintf (stream, " }\n");
5592   fprintf (stream, "};\n");
5593 
5594   if (ferror (stream) || fclose (stream))
5595     {
5596       fprintf (stderr, "error writing to '%s'\n", filename);
5597       exit (1);
5598     }
5599 }
5600 
5601 static void
5602 output_ident_properties (const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
5603 {
5604 #define PROPERTY(P) \
5605   debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
5606   output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5607   output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
5608   PROPERTY(c_whitespace)
5609   PROPERTY(java_whitespace)
5610 #undef PROPERTY
5611 
5612   output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
5613   output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
5614 }
5615 
5616 /* ========================================================================= */
5617 
5618 /* Like ISO C <ctype.h> and <wctype.h>.  Compatible to glibc's
5619    glibc/localedata/locales/i18n file, generated by
5620    glibc/localedata/gen-unicode-ctype.c.  */
5621 
5622 /* Character mappings.  */
5623 
5624 static unsigned int
5625 to_upper (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5626 {
5627   if (unicode_attributes[ch].name != NULL
5628       && unicode_attributes[ch].upper != NONE)
5629     return unicode_attributes[ch].upper;
5630   else
5631     return ch;
5632 }
5633 
5634 static unsigned int
5635 to_lower (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5636 {
5637   if (unicode_attributes[ch].name != NULL
5638       && unicode_attributes[ch].lower != NONE)
5639     return unicode_attributes[ch].lower;
5640   else
5641     return ch;
5642 }
5643 
5644 static unsigned int
5645 to_title (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5646 {
5647   if (unicode_attributes[ch].name != NULL
5648       && unicode_attributes[ch].title != NONE)
5649     return unicode_attributes[ch].title;
5650   else
5651     return ch;
5652 }
5653 
5654 /* Character class properties.  */
5655 
/* Tell whether CH counts as uppercase: this is the case when to_lower
   maps it to a different character.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
5661 
/* Tell whether CH counts as lowercase: this is the case when to_upper
   maps it to a different character, and for U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  if (to_upper (ch) != ch)
    return true;
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  return ch == 0x00DF;
}
5669 
5670 static bool
5671 is_alpha (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5672 {
5673   return (unicode_attributes[ch].name != NULL
5674           && ((unicode_attributes[ch].category[0] == 'L'
5675                /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5676                   <U0E2F>, <U0E46> should belong to is_punct.  */
5677                && (ch != 0x0E2F) && (ch != 0x0E46))
5678               /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
5679                  <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
5680               || (ch == 0x0E31)
5681               || (ch >= 0x0E34 && ch <= 0x0E3A)
5682               || (ch >= 0x0E47 && ch <= 0x0E4E)
5683               /* Avoid warning for <U0345>.  */
5684               || (ch == 0x0345)
5685               /* Avoid warnings for <U2160>..<U217F>.  */
5686               || (unicode_attributes[ch].category[0] == 'N'
5687                   && unicode_attributes[ch].category[1] == 'l')
5688               /* Avoid warnings for <U24B6>..<U24E9>.  */
5689               || (unicode_attributes[ch].category[0] == 'S'
5690                   && unicode_attributes[ch].category[1] == 'o'
5691                   && strstr (unicode_attributes[ch].name, " LETTER ")
5692                      != NULL)
5693               /* Consider all the non-ASCII digits as alphabetic.
5694                  ISO C 99 forbids us to have them in category "digit",
5695                  but we want iswalnum to return true on them.  */
5696               || (unicode_attributes[ch].category[0] == 'N'
5697                   && unicode_attributes[ch].category[1] == 'd'
5698                   && !(ch >= 0x0030 && ch <= 0x0039))));
5699 }
5700 
/* Tell whether CH belongs to the "digit" character class, that is,
   whether it is one of the ASCII digits U+0030..U+0039.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away:
   7.25.2.1.5:
      The iswdigit function tests for any wide character that corresponds
      to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
      the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

   The alternative, accepting all named characters of category Nd, is
   therefore not used.  Note about that alternative: U+0BE7..U+0BEF and
   U+1369..U+1371 are digit systems without a zero; <0> would have to be
   added in front of them by hand.  */
static bool
is_digit (unsigned int ch)
{
  if (ch < 0x0030 || ch > 0x0039)
    return false;
  return true;
}
5722 
/* Tells whether ch belongs to the "alnum" class: the union of the classes
   computed by is_alpha and is_digit.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
5728 
5729 static bool
5730 is_blank (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5731 {
5732   return (ch == 0x0009 /* '\t' */
5733           /* Category Zs without mention of "<noBreak>" */
5734           || (unicode_attributes[ch].name != NULL
5735               && unicode_attributes[ch].category[0] == 'Z'
5736               && unicode_attributes[ch].category[1] == 's'
5737               && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
5738 }
5739 
5740 static bool
5741 is_space (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5742 {
5743   /* Don't make U+00A0 a space. Non-breaking space means that all programs
5744      should treat it like a punctuation character, not like a space. */
5745   return (ch == 0x0020 /* ' ' */
5746           || ch == 0x000C /* '\f' */
5747           || ch == 0x000A /* '\n' */
5748           || ch == 0x000D /* '\r' */
5749           || ch == 0x0009 /* '\t' */
5750           || ch == 0x000B /* '\v' */
5751           /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
5752           || (unicode_attributes[ch].name != NULL
5753               && unicode_attributes[ch].category[0] == 'Z'
5754               && (unicode_attributes[ch].category[1] == 'l'
5755                   || unicode_attributes[ch].category[1] == 'p'
5756                   || (unicode_attributes[ch].category[1] == 's'
5757                       && !strstr (unicode_attributes[ch].decomposition,
5758                                   "<noBreak>")))));
5759 }
5760 
5761 static bool
5762 is_cntrl (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5763 {
5764   return (unicode_attributes[ch].name != NULL
5765           && (strcmp (unicode_attributes[ch].name, "<control>") == 0
5766               /* Categories Zl and Zp */
5767               || (unicode_attributes[ch].category[0] == 'Z'
5768                   && (unicode_attributes[ch].category[1] == 'l'
5769                       || unicode_attributes[ch].category[1] == 'p'))));
5770 }
5771 
/* Tells whether ch belongs to the "xdigit" class, i.e. whether it is one of
   the ASCII characters 0..9, A..F, a..f.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  /* Disabled alternative, built on top of is_digit.  */
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  if (ch >= 0x0030 && ch <= 0x0039) /* 0..9 */
    return true;
  if (ch >= 0x0041 && ch <= 0x0046) /* A..F */
    return true;
  return ch >= 0x0061 && ch <= 0x0066; /* a..f */
#endif
}
5793 
5794 static bool
5795 is_graph (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5796 {
5797   return (unicode_attributes[ch].name != NULL
5798           && strcmp (unicode_attributes[ch].name, "<control>")
5799           && !is_space (ch));
5800 }
5801 
5802 static bool
5803 is_print (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5804 {
5805   return (unicode_attributes[ch].name != NULL
5806           && strcmp (unicode_attributes[ch].name, "<control>")
5807           /* Categories Zl and Zp */
5808           && !(unicode_attributes[ch].name != NULL
5809                && unicode_attributes[ch].category[0] == 'Z'
5810                && (unicode_attributes[ch].category[1] == 'l'
5811                    || unicode_attributes[ch].category[1] == 'p')));
5812 }
5813 
/* Tells whether ch belongs to the "punct" class: in the "graph" class but
   in neither the "alpha" nor the "digit" class.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  /* Disabled alternative: every named character of a P* category.  */
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  if (!is_graph (ch))
    return false;
  return !(is_alpha (ch) || is_digit (ch));
#endif
}
5826 
5827 /* Output all properties.  */
5828 static void
5829 output_old_ctype (const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
5830 {
5831 #define PROPERTY(P) \
5832   debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
5833   output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
5834   output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
5835   PROPERTY(alnum)
5836   PROPERTY(alpha)
5837   PROPERTY(cntrl)
5838   PROPERTY(digit)
5839   PROPERTY(graph)
5840   PROPERTY(lower)
5841   PROPERTY(print)
5842   PROPERTY(punct)
5843   PROPERTY(space)
5844   PROPERTY(upper)
5845   PROPERTY(xdigit)
5846   PROPERTY(blank)
5847 #undef PROPERTY
5848 }
5849 
5850 #if 0
5851 
5852 static bool
5853 is_combining (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5854 {
5855   /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
5856      file. In 3.0.1 it was identical to the union of the general categories
5857      "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
5858      PropList.txt file, so we take the latter definition.  */
5859   return (unicode_attributes[ch].name != NULL
5860           && unicode_attributes[ch].category[0] == 'M'
5861           && (unicode_attributes[ch].category[1] == 'n'
5862               || unicode_attributes[ch].category[1] == 'c'
5863               || unicode_attributes[ch].category[1] == 'e'));
5864 }
5865 
5866 static bool
5867 is_combining_level3 (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
5868 {
5869   return is_combining (ch)
5870          && !(unicode_attributes[ch].combining[0] != '\0'
5871               && unicode_attributes[ch].combining[0] != '0'
5872               && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
5873 }
5874 
/* Return the UCS symbol string for a Unicode character: "<UXXXX>" with four
   hexadecimal digits below 0x10000, "<UXXXXXXXX>" with eight otherwise.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Room for the longest form: "<U" + 8 digits + ">" + NUL.  */
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);
  return buf;
}
5884 
/* Return the UCS symbol range string for a Unicode characters interval:
   the symbol of LOW, then "..", then the symbol of HIGH.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Room for two symbols of at most 11 characters, "..", and NUL.  */
  static char buf[24+1];
  char *tail = buf;

  /* ucs_symbol reuses one static buffer, so each result is copied out
     before the next call.  */
  strcpy (tail, ucs_symbol (low));
  tail += strlen (tail);
  strcpy (tail, "..");
  tail += 2;
  strcpy (tail, ucs_symbol (high));
  return buf;
}
5896 
/* Output a character class (= property) table.
   Writes CLASSNAME followed by the ';'-separated list of code points, or
   ranges of consecutive code points, for which FUNC returns true.  Lines
   are continued with "/" before an entry would pass column 75.  */

static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  char member[0x110000];
  const int max_column = 75;
  bool need_semicolon = false;
  /* Starts beyond max_column, so the first entry goes on a continuation
     line.  */
  int column = 1000;
  unsigned int ch;

  /* Evaluate the predicate once per code point.  */
  for (ch = 0; ch < 0x110000; ch++)
    member[ch] = (func (ch) ? 1 : 0);

  fprintf (stream, "%s ", classname);

  ch = 0;
  while (ch < 0x110000)
    {
      unsigned int first, last;
      const char *text;
      int width;

      if (!member[ch])
        {
          ch++;
          continue;
        }

      /* Collect the maximal run of members starting at ch.  */
      first = ch;
      for (ch++; ch < 0x110000 && member[ch]; ch++)
        ;
      last = ch - 1;

      text = (first == last
              ? ucs_symbol (first)
              : ucs_symbol_range (first, last));
      width = (int) strlen (text);

      if (need_semicolon)
        {
          fputs (";", stream);
          column++;
        }

      if (column + width > max_column)
        {
          fputs ("/\n   ", stream);
          column = 3;
        }

      fputs (text, stream);
      column += width;
      need_semicolon = true;
    }
  fputs ("\n", stream);
}
5954 
/* Output a character mapping table.
   Writes MAPNAME followed by the ';'-separated list of "(<from>,<to>)"
   pairs for every code point that FUNC maps to a different code point.
   Lines are continued with "/" before an entry would pass column 75.  */

static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  char mapped[0x110000];
  const int max_column = 75;
  bool need_semicolon = false;
  /* Starts beyond max_column, so the first entry goes on a continuation
     line.  */
  int column = 1000;
  unsigned int ch;

  /* Remember which code points are not mapped to themselves.  */
  for (ch = 0; ch < 0x110000; ch++)
    mapped[ch] = (func (ch) != ch);

  fprintf (stream, "%s ", mapname);

  for (ch = 0; ch < 0x110000; ch++)
    {
      /* "(" + symbol + "," + symbol + ")" + NUL, symbols at most 11 long.  */
      char entry[25+1];
      int width;

      if (!mapped[ch])
        continue;

      /* ucs_symbol reuses one static buffer, hence two separate steps.  */
      width = sprintf (entry, "(%s,", ucs_symbol (ch));
      width += sprintf (entry + width, "%s)", ucs_symbol (func (ch)));

      if (need_semicolon)
        {
          fputs (";", stream);
          column++;
        }

      if (column + width > max_column)
        {
          fputs ("/\n   ", stream);
          column = 3;
        }

      fputs (entry, stream);
      column += width;
      need_semicolon = true;
    }
  fputs ("\n", stream);
}
6002 
/* Output the width table.
   Placeholder: the body is empty and nothing is written to STREAM.  */

static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
6009 
6010 /* Output the tables to the given file.  */
6011 
6012 static void
6013 output_tables (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
6014 {
6015   FILE *stream;
6016   unsigned int ch;
6017 
6018   stream = fopen (filename, "w");
6019   if (stream == NULL)
6020     {
6021       fprintf (stderr, "cannot open '%s' for writing\n", filename);
6022       exit (1);
6023     }
6024 
6025   fprintf (stream, "escape_char /\n");
6026   fprintf (stream, "comment_char %%\n");
6027   fprintf (stream, "\n");
6028   fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
6029            version);
6030   fprintf (stream, "\n");
6031 
6032   fprintf (stream, "LC_IDENTIFICATION\n");
6033   fprintf (stream, "title     \"Unicode %s FDCC-set\"\n", version);
6034   fprintf (stream, "source    \"UnicodeData.txt, PropList.txt\"\n");
6035   fprintf (stream, "address   \"\"\n");
6036   fprintf (stream, "contact   \"\"\n");
6037   fprintf (stream, "email     \"bug-glibc@gnu.org\"\n");
6038   fprintf (stream, "tel       \"\"\n");
6039   fprintf (stream, "fax       \"\"\n");
6040   fprintf (stream, "language  \"\"\n");
6041   fprintf (stream, "territory \"Earth\"\n");
6042   fprintf (stream, "revision  \"%s\"\n", version);
6043   {
6044     time_t now;
6045     char date[11];
6046     now = time (NULL);
6047     strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
6048     fprintf (stream, "date      \"%s\"\n", date);
6049   }
6050   fprintf (stream, "category  \"unicode:2001\";LC_CTYPE\n");
6051   fprintf (stream, "END LC_IDENTIFICATION\n");
6052   fprintf (stream, "\n");
6053 
6054   /* Verification. */
6055   for (ch = 0; ch < 0x110000; ch++)
6056     {
6057       /* toupper restriction: "Only characters specified for the keywords
6058          lower and upper shall be specified.  */
6059       if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
6060         fprintf (stderr,
6061                  "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
6062                  ucs_symbol (ch), ch, to_upper (ch));
6063 
6064       /* tolower restriction: "Only characters specified for the keywords
6065          lower and upper shall be specified.  */
6066       if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
6067         fprintf (stderr,
6068                  "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
6069                  ucs_symbol (ch), ch, to_lower (ch));
6070 
6071       /* alpha restriction: "Characters classified as either upper or lower
6072          shall automatically belong to this class.  */
6073       if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
6074         fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
6075 
6076       /* alpha restriction: "No character specified for the keywords cntrl,
6077          digit, punct or space shall be specified."  */
6078       if (is_alpha (ch) && is_cntrl (ch))
6079         fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
6080       if (is_alpha (ch) && is_digit (ch))
6081         fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
6082       if (is_alpha (ch) && is_punct (ch))
6083         fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
6084       if (is_alpha (ch) && is_space (ch))
6085         fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
6086 
6087       /* space restriction: "No character specified for the keywords upper,
6088          lower, alpha, digit, graph or xdigit shall be specified."
6089          upper, lower, alpha already checked above.  */
6090       if (is_space (ch) && is_digit (ch))
6091         fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
6092       if (is_space (ch) && is_graph (ch))
6093         fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
6094       if (is_space (ch) && is_xdigit (ch))
6095         fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
6096 
6097       /* cntrl restriction: "No character specified for the keywords upper,
6098          lower, alpha, digit, punct, graph, print or xdigit shall be
6099          specified."  upper, lower, alpha already checked above.  */
6100       if (is_cntrl (ch) && is_digit (ch))
6101         fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
6102       if (is_cntrl (ch) && is_punct (ch))
6103         fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
6104       if (is_cntrl (ch) && is_graph (ch))
6105         fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
6106       if (is_cntrl (ch) && is_print (ch))
6107         fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
6108       if (is_cntrl (ch) && is_xdigit (ch))
6109         fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
6110 
6111       /* punct restriction: "No character specified for the keywords upper,
6112          lower, alpha, digit, cntrl, xdigit or as the <space> character shall
6113          be specified."  upper, lower, alpha, cntrl already checked above.  */
6114       if (is_punct (ch) && is_digit (ch))
6115         fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
6116       if (is_punct (ch) && is_xdigit (ch))
6117         fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
6118       if (is_punct (ch) && (ch == 0x0020))
6119         fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
6120 
6121       /* graph restriction: "No character specified for the keyword cntrl
6122          shall be specified."  Already checked above.  */
6123 
6124       /* print restriction: "No character specified for the keyword cntrl
6125          shall be specified."  Already checked above.  */
6126 
6127       /* graph - print relation: differ only in the <space> character.
6128          How is this possible if there are more than one space character?!
6129          I think susv2/xbd/locale.html should speak of "space characters",
6130          not "space character".  */
6131       if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
6132         fprintf (stderr,
6133                  "%s is print but not graph|<space>\n", ucs_symbol (ch));
6134       if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
6135         fprintf (stderr,
6136                  "%s is graph|<space> but not print\n", ucs_symbol (ch));
6137     }
6138 
6139   fprintf (stream, "LC_CTYPE\n");
6140   output_charclass (stream, "upper", is_upper);
6141   output_charclass (stream, "lower", is_lower);
6142   output_charclass (stream, "alpha", is_alpha);
6143   output_charclass (stream, "digit", is_digit);
6144   output_charclass (stream, "outdigit", is_outdigit);
6145   output_charclass (stream, "blank", is_blank);
6146   output_charclass (stream, "space", is_space);
6147   output_charclass (stream, "cntrl", is_cntrl);
6148   output_charclass (stream, "punct", is_punct);
6149   output_charclass (stream, "xdigit", is_xdigit);
6150   output_charclass (stream, "graph", is_graph);
6151   output_charclass (stream, "print", is_print);
6152   output_charclass (stream, "class \"combining\";", is_combining);
6153   output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
6154   output_charmap (stream, "toupper", to_upper);
6155   output_charmap (stream, "tolower", to_lower);
6156   output_charmap (stream, "map \"totitle\";", to_title);
6157   output_widthmap (stream);
6158   fprintf (stream, "END LC_CTYPE\n");
6159 
6160   if (ferror (stream) || fclose (stream))
6161     {
6162       fprintf (stderr, "error writing to '%s'\n", filename);
6163       exit (1);
6164     }
6165 }
6166 
6167 #endif
6168 
6169 /* ========================================================================= */
6170 
/* The width property of every code point, as read from the
   EastAsianWidth.txt file.
   Each element is NULL (unassigned) or one of the strings
   "N", "A", "H", "W", "F", "Na".  */
const char *unicode_width[0x110000];
6174 
6175 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
6176    file.  */
6177 static void
6178 fill_width (const char *width_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
6179 {
6180   unsigned int i, j;
6181   FILE *stream;
6182   char field0[FIELDLEN];
6183   char field1[FIELDLEN];
6184   char field2[FIELDLEN];
6185   int lineno = 0;
6186 
6187   for (i = 0; i < 0x110000; i++)
6188     unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
6189 
6190   stream = fopen (width_filename, "r");
6191   if (stream == NULL)
6192     {
6193       fprintf (stderr, "error during fopen of '%s'\n", width_filename);
6194       exit (1);
6195     }
6196 
6197   for (;;)
6198     {
6199       int n;
6200       int c;
6201 
6202       lineno++;
6203       c = getc (stream);
6204       if (c == EOF)
6205         break;
6206       if (c == '#')
6207         {
6208           do c = getc (stream); while (c != EOF && c != '\n');
6209           continue;
6210         }
6211       ungetc (c, stream);
6212       n = getfield (stream, field0, ';');
6213       n += getfield (stream, field1, ' ');
6214       n += getfield (stream, field2, '\n');
6215       if (n == 0)
6216         break;
6217       if (n != 3)
6218         {
6219           fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
6220           exit (1);
6221         }
6222       i = strtoul (field0, NULL, 16);
6223       if (strstr (field0, "..") != NULL)
6224         {
6225           /* Deal with a range.  */
6226           j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6227           for (; i <= j; i++)
6228             unicode_width[i] = strdup (field1);
6229         }
6230       else
6231         {
6232           /* Single character line.  */
6233           unicode_width[i] = strdup (field1);
6234         }
6235     }
6236 
6237   if (ferror (stream) || fclose (stream))
6238     {
6239       fprintf (stderr, "error reading from '%s'\n", width_filename);
6240       exit (1);
6241     }
6242 }
6243 
6244 /* ========================================================================= */
6245 
6246 /* Non-spacing attribute and width.  */
6247 
6248 /* The non-spacing attribute table consists of:
6249    - Non-spacing characters; generated from PropList.txt or
6250      "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
6251    - Format control characters; generated from
6252      "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
6253    - Zero width characters; generated from
6254      "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
6255  */
6256 
6257 static bool
6258 is_nonspacing (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
6259 {
6260   return (unicode_attributes[ch].name != NULL
6261           && (get_bidi_category (ch) == UC_BIDI_NSM
6262               || is_category_Cc (ch) || is_category_Cf (ch)
6263               || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
6264 }
6265 
/* Writes to FILENAME the C source of two tables that describe is_nonspacing:
   - nonspacing_table_data: one 64-byte bitmap (one bit per code point) for
     each block of 0x200 code points that contains a non-spacing character.
     The block at 0xe0000 is left out, see below.
   - nonspacing_table_ind: for each block number, the index of its bitmap in
     nonspacing_table_data, or -1 if the block has no bitmap.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_nonspacing_property (const char *filename)
{
  FILE *stream;
  int ind[0x110000 / 0x200];
  unsigned int i;
  unsigned int i_max;
  unsigned int ind_size;
  int next_ind;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* Number the blocks that need a bitmap, in increasing order.  */
  next_ind = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = false;
      unsigned int ch;

      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code.  */
        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
          if (is_nonspacing (ch))
            {
              nontrivial = true;
              break;
            }
      if (nontrivial)
        ind[i] = next_ind++;
      else
        ind[i] = -1;
    }

  /* Emit the bitmaps: 8 rows of 8 bytes per block, bit l of a byte standing
     for the l-th code point covered by that byte.  */
  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
           next_ind);
  i_max = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = (ind[i] >= 0);

      if (nontrivial)
        {
          unsigned int j;

          fprintf (stream, "  /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
          for (j = 0; j < 8; j++)
            {
              unsigned int k;

              fprintf (stream, " ");
              for (k = 0; k < 8; k++)
                {
                  unsigned int l;
                  unsigned char bits = 0;

                  for (l = 0; l < 8; l++)
                    {
                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;

                      if (is_nonspacing (ch))
                        bits |= 1 << l;
                    }
                  /* No comma after the very last byte of the table.  */
                  fprintf (stream, " 0x%02x%c", bits,
                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
                }
              fprintf (stream, " /* 0x%04x-0x%04x */\n",
                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
            }
          /* Remember the highest block number that has a bitmap.  */
          i_max = i;
        }
    }
  fprintf (stream, "};\n");

  /* The index table must cover the block numbers 0..i_max, that is
     i_max + 1 entries, rounded up to a multiple of 8 for the row layout.
     Rounding up i_max itself would drop the entry of block i_max whenever
     i_max is a multiple of 8.  The result is at most 0x110000 / 0x200, the
     size of ind[].  */
  ind_size = ((i_max + 1 + 8 - 1) / 8) * 8;
  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
           ind_size);
  {
    unsigned int j;

    for (j = 0; j < ind_size / 8; j++)
      {
        unsigned int k;

        fprintf (stream, " ");
        for (k = 0; k < 8; k++)
          {
            i = j * 8 + k;
            /* No comma after the very last entry of the table.  */
            fprintf (stream, " %2d%c", ind[i],
                     j == ind_size / 8 - 1 && k == 8 - 1 ? ' ' : ',');
          }
        fprintf (stream, " /* 0x%04x-0x%04x */\n",
                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
      }
  }
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6370 
6371 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'.  */
6372 static char
6373 symbolic_width (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
6374 {
6375   /* Test for unassigned character.  */
6376   if (is_property_unassigned_code_value (ch))
6377     {
6378       /* Unicode TR#11 section "Unassigned and Private-Use Characters".  */
6379       if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
6380         return 'A';
6381       if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
6382           || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
6383           || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
6384           || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
6385           || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
6386         return '2';
6387       return 0;
6388     }
6389   else
6390     {
6391       /* Test for non-spacing or control character.  */
6392       if (is_category_Cc (ch) && ch < 0x00A0)
6393         return 0;
6394       if (is_nonspacing (ch))
6395         return '0';
6396       /* Test for double-width character.  */
6397       if (unicode_width[ch] != NULL
6398           && (strcmp (unicode_width[ch], "W") == 0
6399               || strcmp (unicode_width[ch], "F") == 0))
6400         return '2';
6401       /* Test for half-width character.  */
6402       if (unicode_width[ch] != NULL
6403           && strcmp (unicode_width[ch], "H") == 0)
6404         return '1';
6405     }
6406   /* In ancient CJK encodings, Cyrillic and most other characters are
6407      double-width as well.  */
6408   if (ch >= 0x00A1 && ch < 0x10000)
6409     return 'A';
6410   return '1';
6411 }
6412 
/* Writes one line for the code points START..END, which share the symbolic
   width VALUE: "XXXX<TAB><TAB>V" for a single code point,
   "XXXX..YYYY<TAB>V" for a longer interval.  */
static void
output_width_test_interval (FILE *stream, unsigned int start,
                            unsigned int end, char value)
{
  if (end == start)
    fprintf (stream, "%04X\t\t%c\n", start, value);
  else
    fprintf (stream, "%04X..%04X\t%c\n", start, end, value);
}

/* Writes to FILENAME the symbolic width (see symbolic_width) of all code
   points, one line per interval of equal width.  Code points of width 0 are
   skipped and do not end an interval.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_width_property_test (const char *filename)
{
  FILE *stream;
  unsigned int interval_start, interval_end, ch;
  char interval_value;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* interval_value == 0 means that no interval is open yet.  */
  interval_value = 0;
  interval_start = interval_end = 0; /* avoid GCC warning */
  for (ch = 0; ch < 0x110000; ch++)
    {
      char value = symbolic_width (ch);

      /* skip Cc control characters and unassigned characters */
      if (value == 0)
        continue;

      if (value == interval_value)
        {
          /* Extend the interval.  */
          interval_end = ch;
          continue;
        }

      /* Terminate the interval.  */
      if (interval_value != 0)
        output_width_test_interval (stream, interval_start, interval_end,
                                    interval_value);
      /* Start a new interval.  */
      interval_start = interval_end = ch;
      interval_value = value;
    }
  /* Terminate the last interval.  */
  if (interval_value != 0)
    output_width_test_interval (stream, interval_start, interval_end,
                                interval_value);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6468 
6469 /* ========================================================================= */
6470 
6471 /* Line breaking classification.
6472    Updated for Unicode TR #14 revision 26.  */
6473 
enum
{
  /* Values < 30, listed in numeric order.  */
  LBP_WJ  =  0, /* word joiner */
  LBP_GL  =  1, /* non-breaking (glue) */
  LBP_B2  =  2, /* break opportunity before and after */
  LBP_BA  =  3, /* break opportunity after */
  LBP_BB  =  4, /* break opportunity before */
  LBP_HY  =  5, /* hyphen */
  LBP_CL  =  6, /* closing punctuation */
  LBP_CP  =  7, /* closing parenthesis */
  LBP_EX  =  8, /* exclamation/interrogation */
  LBP_IN  =  9, /* inseparable */
  LBP_NS  = 10, /* non starter */
  LBP_OP  = 11, /* opening punctuation */
  LBP_QU  = 12, /* ambiguous quotation */
  LBP_IS  = 13, /* infix separator (numeric) */
  LBP_NU  = 14, /* numeric */
  LBP_PO  = 15, /* postfix (numeric) */
  LBP_PR  = 16, /* prefix (numeric) */
  LBP_SY  = 17, /* symbols allowing breaks */
  LBP_AL  = 18, /* ordinary alphabetic and symbol characters */
  LBP_H2  = 19, /* Hangul LV syllable */
  LBP_H3  = 20, /* Hangul LVT syllable */
  LBP_ID  = 21, /* ideographic */
  LBP_JL  = 22, /* Hangul L Jamo */
  LBP_JV  = 23, /* Hangul V Jamo */
  LBP_JT  = 24, /* Hangul T Jamo */
  LBP_HL  = 25, /* Hebrew letter */
  LBP_RI  = 26, /* regional indicator */
  LBP_ZWJ = 27, /* zero width joiner */
  LBP_EB  = 28, /* emoji base */
  LBP_EM  = 29, /* emoji modifier */

  /* Values >= 30 are resolved at run time. */
  LBP_BK  = 30, /* mandatory break */
  LBP_CM  = 31, /* attached characters and combining marks */
  LBP_ZW  = 32, /* zero width space */
  LBP_SP  = 33, /* space */
  LBP_CB  = 34, /* contingent break opportunity */
  LBP_AI  = 35, /* ambiguous (alphabetic or ideograph) */
  LBP_SA  = 36, /* complex context (South East Asian) */
  LBP_XX  = 37  /* unknown */

  /* Classes that deliberately have no constant here:
     LBP_CR  carriage return - not used here because it's a DOSism
     LBP_LF  line feed - not used here because it's a DOSism
     LBP_NL  next line - not used here because it's equivalent to LBP_BK
     LBP_SG  surrogates - not used here because they are not characters
     LBP_CJ  conditional Japanese starter, resolved to NS  */
};
6521 
6522 /* Returns the line breaking classification for ch, as a bit mask.  */
6523 static int64_t
6524 get_lbp (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
6525 {
6526   int64_t attr = 0;
6527 
6528   /* U+20BC..U+20CF are reserved for prefixes.  */
6529   if (unicode_attributes[ch].name == NULL && (ch >= 0x20BC && ch <= 0x20CF))
6530     return (int64_t) 1 << LBP_PR;
6531 
6532   if (unicode_attributes[ch].name != NULL)
6533     {
6534       /* mandatory break */
6535       if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
6536           || ch == 0x000C /* form feed */
6537           || ch == 0x000B /* line tabulation */
6538           || ch == 0x2028 /* LINE SEPARATOR */
6539           || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
6540         attr |= (int64_t) 1 << LBP_BK;
6541 
6542       if (ch == 0x2060 /* WORD JOINER */
6543           || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
6544         attr |= (int64_t) 1 << LBP_WJ;
6545 
6546       /* zero width space */
6547       if (ch == 0x200B /* ZERO WIDTH SPACE */)
6548         attr |= (int64_t) 1 << LBP_ZW;
6549 
6550       /* zero width joiner */
6551       if (ch == 0x200D /* ZERO WIDTH JOINER */)
6552         attr |= (int64_t) 1 << LBP_ZWJ;
6553 
6554       /* emoji base */
6555       if (ch == 0x261D /* WHITE UP POINTING INDEX */
6556           || ch == 0x26F9 /* PERSON WITH BALL */
6557           || (ch >= 0x270A && ch <= 0x270D) /* RAISED FIST..WRITING HAND */
6558           || ch == 0x1F385 /* FATHER CHRISTMAS */
6559           || (ch >= 0x1F3C3 && ch <= 0x1F3C4) /* RUNNER..SURFER */
6560           || (ch >= 0x1F3CA && ch <= 0x1F3CB) /* SWIMMER..WEIGHT LIFTER */
6561           || (ch >= 0x1F442 && ch <= 0x1F443) /* EAR..NOSE */
6562           || (ch >= 0x1F446 && ch <= 0x1F450) /* WHITE UP POINTING BACKHAND INDEX..OPEN HANDS SIGN */
6563           || (ch >= 0x1F466 && ch <= 0x1F469) /* BOY..WOMAN */
6564           || ch == 0x1F46E /* POLICE OFFICER */
6565           || (ch >= 0x1F470 && ch <= 0x1F478) /* BRIDE WITH VEIL..PRINCESS */
6566           || ch == 0x1F47C /* BABY ANGEL */
6567           || (ch >= 0x1F481 && ch <= 0x1F483) /* INFORMATION DESK PERSON..DANCER */
6568           || (ch >= 0x1F485 && ch <= 0x1F487) /* NAIL POLISH..HAIRCUT */
6569           || ch == 0x1F4AA /* FLEXED BICEPS */
6570           || ch == 0x1F575 /* SLEUTH OR SPY */
6571           || ch == 0x1F57A /* MAN DANCING */
6572           || ch == 0x1F590 /* RAISED HAND WITH FINGERS SPLAYED */
6573           || (ch >= 0x1F595 && ch <= 0x1F596) /* REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS */
6574           || (ch >= 0x1F645 && ch <= 0x1F647) /* FACE WITH NO GOOD GESTURE..PERSON BOWING DEEPLY */
6575           || (ch >= 0x1F64B && ch <= 0x1F64F) /* HAPPY PERSON RAISING ONE HAND..PERSON WITH FOLDED HANDS */
6576           || ch == 0x1F6A3 /* ROWBOAT */
6577           || (ch >= 0x1F6B4 && ch <= 0x1F6B6) /* BICYCLIST..PEDESTRIAN */
6578           || ch == 0x1F6C0 /* BATH */
6579           || (ch >= 0x1F918 && ch <= 0x1F91E) /* SIGN OF THE HORNS..HAND WITH INDEX AND MIDDLE FINGERS CROSSED */
6580           || ch == 0x1F926 /* FACE PALM */
6581           || ch == 0x1F930 /* PREGNANT WOMAN */
6582           || (ch >= 0x1F933 && ch <= 0x1F939) /* SELFIE..JUGGLING */
6583           || (ch >= 0x1F93C && ch <= 0x1F93E) /* WRESTLERS..HANDBALL */)
6584         attr |= (int64_t) 1 << LBP_EB;
6585 
6586       if ((ch >= 0x1F3FB && ch <= 0x1F3FF) /* EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 */)
6587         attr |= (int64_t) 1 << LBP_EM;
6588 
6589       /* non-breaking (glue) */
6590       if (ch == 0x00A0 /* NO-BREAK SPACE */
6591           || ch == 0x202F /* NARROW NO-BREAK SPACE */
6592           || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
6593           || ch == 0x034F /* COMBINING GRAPHEME JOINER */
6594           || ch == 0x2007 /* FIGURE SPACE */
6595           || ch == 0x2011 /* NON-BREAKING HYPHEN */
6596           || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
6597           || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
6598           || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
6599           || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
6600           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
6601           || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
6602           || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
6603         attr |= (int64_t) 1 << LBP_GL;
6604 
6605       /* space */
6606       if (ch == 0x0020 /* SPACE */)
6607         attr |= (int64_t) 1 << LBP_SP;
6608 
6609       /* break opportunity before and after */
6610       if (ch == 0x2014 /* EM DASH */
6611           || ch == 0x2E3A /* TWO-EM DASH */
6612           || ch == 0x2E3B /* THREE-EM DASH */)
6613         attr |= (int64_t) 1 << LBP_B2;
6614 
6615       /* break opportunity after */
6616       if (/* Breaking Spaces */
6617           ch == 0x1680 /* OGHAM SPACE MARK */
6618           || ch == 0x2000 /* EN QUAD */
6619           || ch == 0x2001 /* EM QUAD */
6620           || ch == 0x2002 /* EN SPACE */
6621           || ch == 0x2003 /* EM SPACE */
6622           || ch == 0x2004 /* THREE-PER-EM SPACE */
6623           || ch == 0x2005 /* FOUR-PER-EM SPACE */
6624           || ch == 0x2006 /* SIX-PER-EM SPACE */
6625           || ch == 0x2008 /* PUNCTUATION SPACE */
6626           || ch == 0x2009 /* THIN SPACE */
6627           || ch == 0x200A /* HAIR SPACE */
6628           || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
6629           || ch == 0x3000 /* IDEOGRAPHIC SPACE */
6630           /* Tabs */
6631           || ch == 0x0009 /* tab */
6632           /* Conditional Hyphens */
6633           || ch == 0x00AD /* SOFT HYPHEN */
6634           /* Breaking Hyphens */
6635           || ch == 0x058A /* ARMENIAN HYPHEN */
6636           || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
6637           || ch == 0x2010 /* HYPHEN */
6638           || ch == 0x2012 /* FIGURE DASH */
6639           || ch == 0x2013 /* EN DASH */
6640           /* Visible Word Dividers */
6641           || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
6642           || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
6643           || ch == 0x1361 /* ETHIOPIC WORDSPACE */
6644           || ch == 0x17D8 /* KHMER SIGN BEYYAL */
6645           || ch == 0x17DA /* KHMER SIGN KOOMUUT */
6646           || ch == 0x2027 /* HYPHENATION POINT */
6647           || ch == 0x007C /* VERTICAL LINE */
6648           /* Historic Word Separators */
6649           || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
6650           || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
6651           || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
6652           || ch == 0x2056 /* THREE DOT PUNCTUATION */
6653           || ch == 0x2058 /* FOUR DOT PUNCTUATION */
6654           || ch == 0x2059 /* FIVE DOT PUNCTUATION */
6655           || ch == 0x205A /* TWO DOT PUNCTUATION */
6656           || ch == 0x205B /* FOUR DOT MARK */
6657           || ch == 0x205D /* TRICOLON */
6658           || ch == 0x205E /* VERTICAL FOUR DOTS */
6659           || ch == 0x2E19 /* PALM BRANCH */
6660           || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
6661           || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
6662           || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
6663           || ch == 0x2E2D /* FIVE DOT MARK */
6664           || ch == 0x2E30 /* RING POINT */
6665           || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
6666           || ch == 0x2E33 /* RAISED DOT */
6667           || ch == 0x2E34 /* RAISED COMMA */
6668           || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
6669           || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
6670           || ch == 0x10102 /* AEGEAN CHECK MARK */
6671           || ch == 0x1039F /* UGARITIC WORD DIVIDER */
6672           || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
6673           || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
6674           || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
6675           /* Dandas */
6676           || ch == 0x0964 /* DEVANAGARI DANDA */
6677           || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
6678           || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
6679           || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
6680           || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
6681           || ch == 0x104B /* MYANMAR SIGN SECTION */
6682           || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
6683           || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
6684           || ch == 0x17D4 /* KHMER SIGN KHAN */
6685           || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
6686           || ch == 0x1B5E /* BALINESE CARIK SIKI */
6687           || ch == 0x1B5F /* BALINESE CARIK PAREREN */
6688           || ch == 0xA8CE /* SAURASHTRA DANDA */
6689           || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
6690           || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
6691           || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
6692           || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
6693           || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
6694           || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
6695           /* Tibetan */
6696           || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
6697           || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
6698           || ch == 0x0F85 /* TIBETAN MARK PALUTA */
6699           || ch == 0x0FBE /* TIBETAN KU RU KHA */
6700           || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
6701           || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
6702           /* Other Terminating Punctuation */
6703           || ch == 0x1804 /* MONGOLIAN COLON */
6704           || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
6705           || ch == 0x1B5A /* BALINESE PANTI */
6706           || ch == 0x1B5B /* BALINESE PAMADA */
6707           || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
6708           || ch == 0x1B60 /* BALINESE PAMENENG */
6709           || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
6710           || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
6711           || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
6712           || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
6713           || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
6714           || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
6715           || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
6716           || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
6717           || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
6718           || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
6719           || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
6720           || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
6721           || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
6722           || ch == 0x2E43 /* DASH WITH LEFT UPTURN */
6723           || ch == 0x2E44 /* DOUBLE SUSPENSION MARK */
6724           || ch == 0x2E3C /* STENOGRAPHIC FULL STOP */
6725           || ch == 0x2E3D /* VERTICAL SIX DOTS */
6726           || ch == 0x2E3E /* WIGGLY VERTICAL LINE */
6727           || ch == 0x2E40 /* DOUBLE HYPHEN */
6728           || ch == 0x2E41 /* REVERSED COMMA */
6729           || ch == 0xA60D /* VAI COMMA */
6730           || ch == 0xA60F /* VAI QUESTION MARK */
6731           || ch == 0xA92E /* KAYAH LI SIGN CWI */
6732           || ch == 0xA92F /* KAYAH LI SIGN SHYA */
6733           || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
6734           || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
6735           || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
6736           || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
6737           || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
6738           || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
6739           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
6740           || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
6741           || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
6742           || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
6743           || ch == 0xA6F3 /* BAMUM FULL STOP */
6744           || ch == 0xA6F4 /* BAMUM COLON */
6745           || ch == 0xA6F5 /* BAMUM COMMA */
6746           || ch == 0xA6F6 /* BAMUM SEMICOLON */
6747           || ch == 0xA6F7 /* BAMUM QUESTION MARK */
6748           || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
6749           || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
6750           || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
6751           || ch == 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
6752           || ch == 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
6753           || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
6754           || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
6755           || (ch >= 0x10AF0 && ch <= 0x10AF5) /* MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS */
6756           || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
6757           || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
6758           || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
6759           || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
6760           || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
6761           || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
6762           || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
6763           || ch == 0x11047 /* BRAHMI DANDA */
6764           || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
6765           || ch == 0x110BE /* KAITHI SECTION MARK */
6766           || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
6767           || ch == 0x110C0 /* KAITHI DANDA */
6768           || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
6769           || ch == 0x11140 /* CHAKMA SECTION MARK */
6770           || ch == 0x11141 /* CHAKMA DANDA */
6771           || ch == 0x11142 /* CHAKMA DOUBLE DANDA */
6772           || ch == 0x11143 /* CHAKMA QUESTION MARK */
6773           || ch == 0x111C5 /* SHARADA DANDA */
6774           || ch == 0x111C6 /* SHARADA DOUBLE DANDA */
6775           || ch == 0x111C8 /* SHARADA SEPARATOR */
6776           || (ch >= 0x111DD && ch <= 0x111DF) /* SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 */
6777           || ch == 0x11238 /* KHOJKI DANDA */
6778           || ch == 0x11239 /* KHOJKI DOUBLE DANDA */
6779           || ch == 0x1123B /* KHOJKI SECTION MARK */
6780           || ch == 0x1123C /* KHOJKI DOUBLE SECTION MARK */
6781           || ch == 0x112A9 /* MULTANI SECTION MARK */
6782           || (ch >= 0x1144B && ch <= 0x1144E) /* NEWA DANDA..NEWA GAP FILLER */
6783           || ch == 0x1145B /* NEWA PLACEHOLDER MARK */
6784           || ch == 0x115C2 /* SIDDHAM DANDA */
6785           || ch == 0x115C3 /* SIDDHAM DOUBLE DANDA */
6786           || (ch >= 0x115C9 && ch <= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
6787           || ch == 0x11641 /* MODI DANDA */
6788           || ch == 0x11642 /* MODI DOUBLE DANDA */
6789           || (ch >= 0x1173C && ch <= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
6790           || (ch >= 0x11C41 && ch <= 0x11C45) /* BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 */
6791           || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
6792           || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
6793           || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
6794           || ch == 0x12474 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON */
6795           || ch == 0x16A6E /* MRO DANDA */
6796           || ch == 0x16A6F /* MRO DOUBLE DANDA */
6797           || ch == 0x16AF5 /* BASSA VAH FULL STOP */
6798           || ch == 0x16B37 /* PAHAWH HMONG SIGN VOS THOM */
6799           || ch == 0x16B38 /* PAHAWH HMONG SIGN VOS TSHAB CEEB */
6800           || ch == 0x16B39 /* PAHAWH HMONG SIGN CIM CHEEM */
6801           || ch == 0x16B44 /* PAHAWH HMONG SIGN XAUS */
6802           || ch == 0x1BC9F /* DUPLOYAN PUNCTUATION CHINOOK FULL STOP */
6803           || (ch >= 0x1DA87 && ch <= 0x1DA8A) /* SIGNWRITING COMMA..SIGNWRITING COLON */)
6804         attr |= (int64_t) 1 << LBP_BA;
6805 
6806       /* break opportunity before */
6807       if (ch == 0x00B4 /* ACUTE ACCENT */
6808           || ch == 0x1FFD /* GREEK OXIA */
6809           || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
6810           || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
6811           || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
6812           || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
6813           || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
6814           || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
6815           || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
6816           || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
6817           || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
6818           || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
6819           || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
6820           || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
6821           || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
6822           || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
6823           || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
6824           || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
6825           || ch == 0xA8FC /* DEVANAGARI SIGN SIDDHAM */
6826           || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
6827           || ch == 0x11175 /* MAHAJANI SECTION MARK */
6828           || ch == 0x111DB /* SHARADA SIGN SIDDHAM */
6829           || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */
6830           || (ch >= 0x11660 && ch <= 0x1166C) /* MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT */
6831           || ch == 0x11C70 /* MARCHEN HEAD MARK */)
6832         attr |= (int64_t) 1 << LBP_BB;
6833 
6834       /* hyphen */
6835       if (ch == 0x002D /* HYPHEN-MINUS */)
6836         attr |= (int64_t) 1 << LBP_HY;
6837 
6838       /* contingent break opportunity */
6839       if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
6840         attr |= (int64_t) 1 << LBP_CB;
6841 
6842       /* closing parenthesis */
6843       if (ch == 0x0029 /* RIGHT PARENTHESIS */
6844           || ch == 0x005D /* RIGHT SQUARE BRACKET */)
6845         attr |= (int64_t) 1 << LBP_CP;
6846 
6847       /* closing punctuation */
6848       if ((unicode_attributes[ch].category[0] == 'P'
6849            && unicode_attributes[ch].category[1] == 'e'
6850            && !(attr & ((int64_t) 1 << LBP_CP)))
6851           || ch == 0x3001 /* IDEOGRAPHIC COMMA */
6852           || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
6853           || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
6854           || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
6855           || ch == 0xFE50 /* SMALL COMMA */
6856           || ch == 0xFE52 /* SMALL FULL STOP */
6857           || ch == 0xFF0C /* FULLWIDTH COMMA */
6858           || ch == 0xFF0E /* FULLWIDTH FULL STOP */
6859           || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
6860           || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
6861           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
6862           || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
6863           || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
6864           || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
6865           || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
6866           || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
6867           || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
6868           || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
6869           || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */
6870           || ch == 0x145CF /* ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK */)
6871         attr |= (int64_t) 1 << LBP_CL;
6872 
6873       /* exclamation/interrogation */
6874       if (ch == 0x0021 /* EXCLAMATION MARK */
6875           || ch == 0x003F /* QUESTION MARK */
6876           || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
6877           || ch == 0x061B /* ARABIC SEMICOLON */
6878           || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
6879           || ch == 0x061F /* ARABIC QUESTION MARK */
6880           || ch == 0x06D4 /* ARABIC FULL STOP */
6881           || ch == 0x07F9 /* NKO EXCLAMATION MARK */
6882           || ch == 0x0F0D /* TIBETAN MARK SHAD */
6883           || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
6884           || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
6885           || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
6886           || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
6887           || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
6888           || ch == 0x1802 /* MONGOLIAN COMMA */
6889           || ch == 0x1803 /* MONGOLIAN FULL STOP */
6890           || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
6891           || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
6892           || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
6893           || ch == 0x1945 /* LIMBU QUESTION MARK */
6894           || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
6895           || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
6896           || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
6897           || ch == 0x2CFE /* COPTIC FULL STOP */
6898           || ch == 0x2E2E /* REVERSED QUESTION MARK */
6899           || ch == 0xA60E /* VAI FULL STOP */
6900           || ch == 0xA876 /* PHAGS-PA MARK SHAD */
6901           || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
6902           || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
6903           || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
6904           || ch == 0xFE56 /* SMALL QUESTION MARK */
6905           || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
6906           || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
6907           || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */
6908           || ch == 0x115C4 /* SIDDHAM SEPARATOR DOT */
6909           || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */
6910           || ch == 0x11C71 /* MARCHEN MARK SHAD */)
6911         attr |= (int64_t) 1 << LBP_EX;
6912 
6913       /* inseparable */
6914       if (ch == 0x2024 /* ONE DOT LEADER */
6915           || ch == 0x2025 /* TWO DOT LEADER */
6916           || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
6917           || ch == 0x22EF /* MIDLINE HORIZONTAL ELLIPSIS */
6918           || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */
6919           || ch == 0x10AF6 /* MANICHAEAN PUNCTUATION LINE FILLER */)
6920         attr |= (int64_t) 1 << LBP_IN;
6921 
6922       /* non starter */
6923       if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
6924           || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
6925           || ch == 0x203D /* INTERROBANG */
6926           || ch == 0x2047 /* DOUBLE QUESTION MARK */
6927           || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
6928           || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
6929           || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
6930           || ch == 0x301C /* WAVE DASH */
6931           || ch == 0x303C /* MASU MARK */
6932           || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
6933           || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
6934           || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
6935           || ch == 0x309D /* HIRAGANA ITERATION MARK */
6936           || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
6937           || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
6938           || ch == 0x30FB /* KATAKANA MIDDLE DOT */
6939           || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6940           || ch == 0x30FD /* KATAKANA ITERATION MARK */
6941           || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
6942           || ch == 0xA015 /* YI SYLLABLE WU */
6943           || ch == 0xFE54 /* SMALL SEMICOLON */
6944           || ch == 0xFE55 /* SMALL COLON */
6945           || ch == 0xFF1A /* FULLWIDTH COLON */
6946           || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
6947           || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
6948           || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
6949           || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
6950           || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
6951           || ch == 0x16FE0 /* TANGUT ITERATION MARK */
6952           || ch == 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
6953           || ch == 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
6954           || ch == 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */
6955           || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
6956           || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
6957         attr |= (int64_t) 1 << LBP_NS;
6958 
6959       /* opening punctuation */
6960       if ((unicode_attributes[ch].category[0] == 'P'
6961            && unicode_attributes[ch].category[1] == 's')
6962           || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
6963           || ch == 0x00BF /* INVERTED QUESTION MARK */
6964           || ch == 0x2E18 /* INVERTED INTERROBANG */
6965           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
6966           || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
6967           || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
6968           || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
6969           || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
6970           || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
6971           || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
6972           || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */
6973           || (ch >= 0x1E95E && ch <= 0x1E95F) /* ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK */)
6974         attr |= (int64_t) 1 << LBP_OP;
6975 
6976       /* ambiguous quotation */
6977       if ((unicode_attributes[ch].category[0] == 'P'
6978            && (unicode_attributes[ch].category[1] == 'f'
6979                || unicode_attributes[ch].category[1] == 'i'))
6980           || ch == 0x0022 /* QUOTATION MARK */
6981           || ch == 0x0027 /* APOSTROPHE */
6982           || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
6983           || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
6984           || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
6985           || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6986           || ch == 0x275F /* HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT */
6987           || ch == 0x2760 /* HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */
6988           || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
6989           || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
6990           || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
6991           || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
6992           || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
6993           || ch == 0x2E0B /* RAISED SQUARE */
6994           || ch == 0x1F676 /* SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT  */
6995           || ch == 0x1F677 /* SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
6996           || ch == 0x1F678 /* SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */)
6997         attr |= (int64_t) 1 << LBP_QU;
6998 
6999       /* infix separator (numeric) */
7000       if (ch == 0x002C /* COMMA */
7001           || ch == 0x002E /* FULL STOP */
7002           || ch == 0x003A /* COLON */
7003           || ch == 0x003B /* SEMICOLON */
7004           || ch == 0x037E /* GREEK QUESTION MARK */
7005           || ch == 0x0589 /* ARMENIAN FULL STOP */
7006           || ch == 0x060C /* ARABIC COMMA */
7007           || ch == 0x060D /* ARABIC DATE SEPARATOR */
7008           || ch == 0x07F8 /* NKO COMMA */
7009           || ch == 0x2044 /* FRACTION SLASH */
7010           || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
7011           || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
7012           || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
7013         attr |= (int64_t) 1 << LBP_IS;
7014 
7015       /* numeric */
7016       if ((unicode_attributes[ch].category[0] == 'N'
7017            && unicode_attributes[ch].category[1] == 'd'
7018            && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
7019           || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
7020           || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
7021         attr |= (int64_t) 1 << LBP_NU;
7022 
7023       /* postfix (numeric) */
7024       if (ch == 0x0025 /* PERCENT SIGN */
7025           || ch == 0x00A2 /* CENT SIGN */
7026           || ch == 0x00B0 /* DEGREE SIGN */
7027           || ch == 0x060B /* AFGHANI SIGN */
7028           || ch == 0x066A /* ARABIC PERCENT SIGN */
7029           || ch == 0x2030 /* PER MILLE SIGN */
7030           || ch == 0x2031 /* PER TEN THOUSAND SIGN */
7031           || ch == 0x2032 /* PRIME */
7032           || ch == 0x2033 /* DOUBLE PRIME */
7033           || ch == 0x2034 /* TRIPLE PRIME */
7034           || ch == 0x2035 /* REVERSED PRIME */
7035           || ch == 0x2036 /* REVERSED DOUBLE PRIME */
7036           || ch == 0x2037 /* REVERSED TRIPLE PRIME */
7037           || ch == 0x20A7 /* PESETA SIGN */
7038           || ch == 0x20BB /* NORDIC MARK SIGN */
7039           || ch == 0x2103 /* DEGREE CELSIUS */
7040           || ch == 0x2109 /* DEGREE FAHRENHEIT */
7041           || ch == 0xFDFC /* RIAL SIGN */
7042           || ch == 0xFE6A /* SMALL PERCENT SIGN */
7043           || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
7044           || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
7045           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
7046           || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
7047           || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
7048           || ch == 0x09F2 /* BENGALI RUPEE MARK */
7049           || ch == 0x09F3 /* BENGALI RUPEE SIGN */
7050           || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
7051           || ch == 0x0D79 /* MALAYALAM DATE MARK */
7052           || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
7053           || ch == 0x20BE /* LARI SIGN */
7054           || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
7055         attr |= (int64_t) 1 << LBP_PO;
7056 
7057       /* prefix (numeric) */
7058       if ((unicode_attributes[ch].category[0] == 'S'
7059            && unicode_attributes[ch].category[1] == 'c')
7060           || ch == 0x002B /* PLUS SIGN */
7061           || ch == 0x005C /* REVERSE SOLIDUS */
7062           || ch == 0x00B1 /* PLUS-MINUS SIGN */
7063           || ch == 0x2116 /* NUMERO SIGN */
7064           || ch == 0x2212 /* MINUS SIGN */
7065           || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
7066         if (!(attr & ((int64_t) 1 << LBP_PO)))
7067           attr |= (int64_t) 1 << LBP_PR;
7068 
7069       /* symbols allowing breaks */
7070       if (ch == 0x002F /* SOLIDUS */)
7071         attr |= (int64_t) 1 << LBP_SY;
7072 
7073       if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
7074         attr |= (int64_t) 1 << LBP_H2;
7075 
7076       if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
7077         attr |= (int64_t) 1 << LBP_H3;
7078 
7079       if ((ch >= 0x05D0 && ch <= 0x05F2) || ch == 0xFB1D
7080           || (ch >= 0xFB1F && ch <= 0xFB28) || (ch >= 0xFB2A && ch <= 0xFB4F))
7081         attr |= (int64_t) 1 << LBP_HL;
7082 
7083       if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
7084         attr |= (int64_t) 1 << LBP_JL;
7085 
7086       if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
7087         attr |= (int64_t) 1 << LBP_JV;
7088 
7089       if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
7090         attr |= (int64_t) 1 << LBP_JT;
7091 
7092       /* regional indicator */
7093       if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
7094         attr |= (int64_t) 1 << LBP_RI;
7095 
7096       /* complex context (South East Asian) */
7097       if (((unicode_attributes[ch].category[0] == 'C'
7098             && unicode_attributes[ch].category[1] == 'f')
7099            || (unicode_attributes[ch].category[0] == 'L'
7100                && (unicode_attributes[ch].category[1] == 'm'
7101                    || unicode_attributes[ch].category[1] == 'o'))
7102            || (unicode_attributes[ch].category[0] == 'M'
7103                && (unicode_attributes[ch].category[1] == 'c'
7104                    || unicode_attributes[ch].category[1] == 'n')
7105                && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
7106            /* Extra characters for compatibility with Unicode LineBreak.txt.  */
7107            || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
7108            || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
7109            || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
7110            || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
7111            || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
7112            || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
7113            || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
7114            || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
7115            || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
7116            || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */
7117            || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
7118            || ch == 0x1173F /* Ahom */)
7119           && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
7120               || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
7121               || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
7122               || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
7123               || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
7124               || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
7125               || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
7126               || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */
7127               || (ch >= 0x11700 && ch <= 0x11719) /* Ahom */
7128               || (ch >= 0x1171D && ch <= 0x1172B) /* Ahom */
7129               || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
7130               || ch == 0x1173F /* Ahom */))
7131         attr |= (int64_t) 1 << LBP_SA;
7132 
7133       /* attached characters and combining marks */
7134       if ((unicode_attributes[ch].category[0] == 'M'
7135            && (unicode_attributes[ch].category[1] == 'c'
7136                || unicode_attributes[ch].category[1] == 'e'
7137                || unicode_attributes[ch].category[1] == 'n'))
7138           || (unicode_attributes[ch].category[0] == 'C'
7139               && (unicode_attributes[ch].category[1] == 'c'
7140                   || unicode_attributes[ch].category[1] == 'f')
7141               && ch != 0x110BD /* KAITHI NUMBER SIGN */
7142               && ch != 0x08E2 /* ARABIC DISPUTED END OF AYAH */)
7143           || ch == 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
7144         if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW) | ((int64_t) 1 << LBP_ZWJ))))
7145           attr |= (int64_t) 1 << LBP_CM;
7146 
7147       /* ideographic */
7148       if (ch == 0x231A /* WATCH */
7149           || ch == 0x231B /* HOURGLASS */
7150           || ch == 0x23F0 /* ALARM CLOCK */
7151           || ch == 0x23F1 /* STOPWATCH */
7152           || ch == 0x23F2 /* TIMER CLOCK */
7153           || ch == 0x23F3 /* HOURGLASS WITH FLOWING SAND */
7154           || ch == 0x2600 /* BLACK SUN WITH RAYS */
7155           || ch == 0x2601 /* CLOUD */
7156           || ch == 0x2602 /* UMBRELLA */
7157           || ch == 0x2603 /* SNOWMAN */
7158           || ch == 0x2614 /* UMBRELLA WITH RAIN DROPS */
7159           || ch == 0x2615 /* HOT BEVERAGE */
7160           || ch == 0x2618 /* SHAMROCK */
7161           || ch == 0x261A /* BLACK LEFT POINTING INDEX */
7162           || ch == 0x261B /* BLACK RIGHT POINTING INDEX */
7163           || ch == 0x261C /* WHITE LEFT POINTING INDEX */
7164           || ch == 0x261D /* WHITE UP POINTING INDEX */
7165           || ch == 0x261E /* WHITE RIGHT POINTING INDEX */
7166           || ch == 0x261F /* WHITE DOWN POINTING INDEX */
7167           || ch == 0x2639 /* WHITE FROWNING FACE */
7168           || ch == 0x263A /* WHITE SMILING FACE */
7169           || ch == 0x263B /* BLACK SMILING FACE */
7170           || ch == 0x2668 /* HOT SPRINGS */
7171           || ch == 0x267F /* WHEELCHAIR SYMBOL */
7172           || ch == 0x26BD /* SOCCER BALL */
7173           || ch == 0x26BE /* BASEBALL */
7174           || ch == 0x26BF /* SQUARED KEY */
7175           || ch == 0x26C0 /* WHITE DRAUGHTS MAN */
7176           || ch == 0x26C1 /* WHITE DRAUGHTS KING */
7177           || ch == 0x26C2 /* BLACK DRAUGHTS MAN */
7178           || ch == 0x26C3 /* BLACK DRAUGHTS KING */
7179           || ch == 0x26C4 /* SNOWMAN WITHOUT SNOW */
7180           || ch == 0x26C5 /* SUN BEHIND CLOUD */
7181           || ch == 0x26C6 /* RAIN */
7182           || ch == 0x26C7 /* BLACK SNOWMAN */
7183           || ch == 0x26C8 /* THUNDER CLOUD AND RAIN */
7184           || ch == 0x26CD /* DISABLED CAR */
7185           || ch == 0x26CF /* PICK */
7186           || ch == 0x26D0 /* CAR SLIDING */
7187           || ch == 0x26D1 /* HELMET WITH WHITE CROSS */
7188           || ch == 0x26D3 /* CHAINS */
7189           || ch == 0x26D4 /* NO ENTRY */
7190           || ch == 0x26D8 /* BLACK LEFT LANE MERGE */
7191           || ch == 0x26D9 /* WHITE LEFT LANE MERGE */
7192           || ch == 0x26DC /* LEFT CLOSED ENTRY */
7193           || ch == 0x26DF /* BLACK TRUCK */
7194           || ch == 0x26E0 /* RESTRICTED LEFT ENTRY-1 */
7195           || ch == 0x26E1 /* RESTRICTED LEFT ENTRY-2 */
7196           || ch == 0x26EA /* CHURCH */
7197           || ch == 0x26F1 /* UMBRELLA ON GROUND */
7198           || ch == 0x26F2 /* FOUNTAIN */
7199           || ch == 0x26F3 /* FLAG IN HOLE */
7200           || ch == 0x26F4 /* FERRY */
7201           || ch == 0x26F5 /* SAILBOAT */
7202           || ch == 0x26F7 /* SKIER */
7203           || ch == 0x26F8 /* ICE SKATE */
7204           || ch == 0x26F9 /* PERSON WITH BALL */
7205           || ch == 0x26FA /* TENT */
7206           || ch == 0x26FD /* FUEL PUMP */
7207           || ch == 0x26FE /* CUP ON BLACK SQUARE */
7208           || ch == 0x26FF /* WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE */
7209           || ch == 0x2700 /* BLACK SAFETY SCISSORS */
7210           || ch == 0x2701 /* UPPER BLADE SCISSORS */
7211           || ch == 0x2702 /* BLACK SCISSORS */
7212           || ch == 0x2703 /* LOWER BLADE SCISSORS */
7213           || ch == 0x2704 /* WHITE SCISSORS */
7214           || ch == 0x2708 /* AIRPLANE */
7215           || ch == 0x2709 /* ENVELOPE */
7216           || ch == 0x270A /* RAISED FIST */
7217           || ch == 0x270B /* RAISED HAND */
7218           || ch == 0x270C /* VICTORY HAND */
7219           || ch == 0x270D /* WRITING HAND */
7220           || ch == 0x2764 /* HEAVY BLACK HEART */
7221           || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
7222           || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
7223           || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
7224           || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
7225           || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
7226           || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
7227           || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
7228           || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
7229           || ch == 0xFE62 /* SMALL PLUS SIGN */
7230           || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
7231           || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
7232           || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
7233           || ch == 0xFE66 /* SMALL EQUALS SIGN */
7234           || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
7235           || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
7236           || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
7237           || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
7238           || (ch >= 0x3000 && ch <= 0x33FF
7239               && !(attr & (((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
7240           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
7241           || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
7242           || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
7243           || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
7244           || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
7245           || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
7246           || ch == 0xFE45 /* SESAME DOT */
7247           || ch == 0xFE46 /* WHITE SESAME DOT */
7248           || ch == 0xFE49 /* DASHED OVERLINE */
7249           || ch == 0xFE4A /* CENTRELINE OVERLINE */
7250           || ch == 0xFE4B /* WAVY OVERLINE */
7251           || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
7252           || ch == 0xFE4D /* DASHED LOW LINE */
7253           || ch == 0xFE4E /* CENTRELINE LOW LINE */
7254           || ch == 0xFE4F /* WAVY LOW LINE */
7255           || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
7256           || ch == 0xFE58 /* SMALL EM DASH */
7257           || ch == 0xFE5F /* SMALL NUMBER SIGN */
7258           || ch == 0xFE60 /* SMALL AMPERSAND */
7259           || ch == 0xFE61 /* SMALL ASTERISK */
7260           || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
7261           || ch == 0xFE6B /* SMALL COMMERCIAL AT */
7262           || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
7263           || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
7264           || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
7265           || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
7266           || ch == 0xFF0A /* FULLWIDTH ASTERISK */
7267           || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
7268           || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
7269           || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
7270           || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
7271           || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
7272           || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
7273           || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
7274           || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
7275           || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
7276           || ch == 0xFF3F /* FULLWIDTH LOW LINE */
7277           || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
7278           || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
7279           || ch == 0xFF5E /* FULLWIDTH TILDE */
7280           || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
7281           || ch == 0xFFE3 /* FULLWIDTH MACRON */
7282           || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
7283           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
7284           || ch == 0xFF66 /* Halfwidth Katakana */
7285           || (ch >= 0xFF71 && ch <= 0xFF9D) /* Halfwidth Katakana */
7286           || (ch >= 0xFFA0 && ch <= 0xFFBE) /* Halfwidth Hangul */
7287           || (ch >= 0xFFC2 && ch <= 0xFFC7) /* Halfwidth Hangul */
7288           || (ch >= 0xFFCA && ch <= 0xFFCF) /* Halfwidth Hangul */
7289           || (ch >= 0xFFD2 && ch <= 0xFFD7) /* Halfwidth Hangul */
7290           || (ch >= 0xFFDA && ch <= 0xFFDC) /* Halfwidth Hangul */
7291           || (ch >= 0x17000 && ch <= 0x187EC) /* Tangut Ideograph */
7292           || (ch >= 0x18800 && ch <= 0x18AF2) /* Tangut Ideograph */
7293           || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
7294           || (ch >= 0x1F000 && ch <= 0x1F02B) /* Mahjong Tiles */
7295           || (ch >= 0x1F030 && ch <= 0x1F093) /* Domino Tiles */
7296           || (ch >= 0x1F0A0 && ch <= 0x1F0F5) /* Playing Cards */
7297           || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
7298           || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
7299           || (ch >= 0x1F300 && ch <= 0x1F5FF /* Miscellaneous Symbols and Pictographs */
7300               && ch != 0x1F3B5 && ch != 0x1F3B6 && ch != 0x1F3BC
7301               && ch != 0x1F4A0 && ch != 0x1F4A2 && ch != 0x1F4A4
7302               && ch != 0x1F4AF && ch != 0x1F4B1 && ch != 0x1F4B2
7303               && !(ch >= 0x1F39C && ch <= 0x1F39D)
7304               && !(ch >= 0x1F3FB && ch <= 0x1F3FF)
7305               && !(ch >= 0x1F500 && ch <= 0x1F506)
7306               && !(ch >= 0x1F517 && ch <= 0x1F524)
7307               && !(ch >= 0x1F532 && ch <= 0x1F549)
7308               && !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
7309               && !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
7310           || (ch >= 0x1F600 && ch <= 0x1F64F) /* Emoticons */
7311           || (ch >= 0x1F680 && ch <= 0x1F6DF) /* Transport and Map Symbols */
7312           || (ch >= 0x1F6E0 && ch <= 0x1F6EC) /* Transport and Map Symbols */
7313           || (ch >= 0x1F6F0 && ch <= 0x1F6F6) /* Transport and Map Symbols */
7314           || (ch >= 0x1F900 && ch <= 0x1F9FF) /* Supplemental Symbols and Pictographs */
7315           || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
7316           || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */
7317           || (ch >= 0x2B820 && ch <= 0x2CEAF) /* CJK Ideograph Extension E */)
7318         if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_EB))))
7319           {
7320             /* ambiguous (ideograph) ? */
7321             if ((unicode_width[ch] != NULL
7322                  && unicode_width[ch][0] == 'A'
7323                  && ch >= 0x2000
7324                  && ch != 0x2614
7325                  && ch != 0x2615
7326                  && ch != 0x261C
7327                  && ch != 0x261E
7328                  && ch != 0x2668
7329                  && ch != 0x26BE
7330                  && ch != 0x26BF
7331                  && !(ch >= 0x26C4 && ch <= 0x26C8)
7332                  && ch != 0x26CD
7333                  && ch != 0x26CF
7334                  && ch != 0x26D0
7335                  && ch != 0x26D1
7336                  && ch != 0x26D3
7337                  && ch != 0x26D4
7338                  && ch != 0x26D8
7339                  && ch != 0x26D9
7340                  && ch != 0x26DC
7341                  && ch != 0x26DF
7342                  && ch != 0x26E0
7343                  && ch != 0x26E1
7344                  && ch != 0x26EA
7345                  && !(ch >= 0x26F1 && ch <= 0x26F5)
7346                  && !(ch >= 0x26F7 && ch <= 0x26FA)
7347                  && !(ch >= 0x26FD && ch <= 0x26FF))
7348                 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
7349                 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
7350               attr |= (int64_t) 1 << LBP_AI;
7351             else
7352               attr |= (int64_t) 1 << LBP_ID;
7353           }
7354 
7355       /* ordinary alphabetic and symbol characters */
7356       if ((unicode_attributes[ch].category[0] == 'L'
7357            && (unicode_attributes[ch].category[1] == 'u'
7358                || unicode_attributes[ch].category[1] == 'l'
7359                || unicode_attributes[ch].category[1] == 't'
7360                || unicode_attributes[ch].category[1] == 'm'
7361                || unicode_attributes[ch].category[1] == 'o'))
7362           || (unicode_attributes[ch].category[0] == 'S'
7363               && (unicode_attributes[ch].category[1] == 'm'
7364                   || unicode_attributes[ch].category[1] == 'k'
7365                   || unicode_attributes[ch].category[1] == 'o'))
7366           || (unicode_attributes[ch].category[0] == 'N'
7367               && (unicode_attributes[ch].category[1] == 'l'
7368                   || unicode_attributes[ch].category[1] == 'o'))
7369           || (unicode_attributes[ch].category[0] == 'P'
7370               && (unicode_attributes[ch].category[1] == 'c'
7371                   || unicode_attributes[ch].category[1] == 'd'
7372                   || unicode_attributes[ch].category[1] == 'o'))
7373           || ch == 0x0600 /* ARABIC NUMBER SIGN */
7374           || ch == 0x0601 /* ARABIC SIGN SANAH */
7375           || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
7376           || ch == 0x0603 /* ARABIC SIGN SAFHA */
7377           || ch == 0x0604 /* ARABIC SIGN SAMVAT */
7378           || ch == 0x0605 /* ARABIC NUMBER MARK ABOVE */
7379           || ch == 0x06DD /* ARABIC END OF AYAH */
7380           || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
7381           || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */
7382           || ch == 0x2061 /* FUNCTION APPLICATION */
7383           || ch == 0x2062 /* INVISIBLE TIMES */
7384           || ch == 0x2063 /* INVISIBLE SEPARATOR */
7385           || ch == 0x2064 /* INVISIBLE PLUS */
7386           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
7387           || ch == 0x110BD /* KAITHI NUMBER SIGN */)
7388         if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM)))
7389             && ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
7390           {
7391             /* ambiguous (alphabetic) ? */
7392             if ((unicode_width[ch] != NULL
7393                  && unicode_width[ch][0] == 'A'
7394                  && ch >= 0x2000
7395                  /* Extra exceptions for compatibility with Unicode LineBreak.txt.  */
7396                  && ch != 0x2022 /* BULLET */
7397                  && ch != 0x203E /* OVERLINE */
7398                  && ch != 0x2126 /* OHM SIGN */
7399                  && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
7400                  && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
7401                  && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
7402                  && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
7403                  && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
7404                  && ch != 0x21E7 /* UPWARDS WHITE ARROW */
7405                  && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
7406                  && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
7407                 || ch == 0x00A7 /* SECTION SIGN */
7408                 || ch == 0x00A8 /* DIAERESIS */
7409                 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
7410                 || ch == 0x00B2 /* SUPERSCRIPT TWO */
7411                 || ch == 0x00B3 /* SUPERSCRIPT THREE */
7412                 || ch == 0x00B6 /* PILCROW SIGN */
7413                 || ch == 0x00B7 /* MIDDLE DOT */
7414                 || ch == 0x00B8 /* CEDILLA */
7415                 || ch == 0x00B9 /* SUPERSCRIPT ONE */
7416                 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
7417                 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
7418                 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
7419                 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
7420                 || ch == 0x00D7 /* MULTIPLICATION SIGN */
7421                 || ch == 0x00F7 /* DIVISION SIGN */
7422                 || ch == 0x02C7 /* CARON */
7423                 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
7424                 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
7425                 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
7426                 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
7427                 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
7428                 || ch == 0x02D8 /* BREVE */
7429                 || ch == 0x02D9 /* DOT ABOVE */
7430                 || ch == 0x02DA /* RING ABOVE */
7431                 || ch == 0x02DB /* OGONEK */
7432                 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
7433                 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
7434                 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
7435                 /* Extra characters for compatibility with Unicode LineBreak.txt.  */
7436                 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
7437                 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
7438                 || ch == 0x2616 /* WHITE SHOGI PIECE */
7439                 || ch == 0x2617 /* BLACK SHOGI PIECE */
7440                 || ch == 0x2757 /* HEAVY EXCLAMATION MARK SYMBOL */
7441                 || ch == 0x2B55 /* HEAVY LARGE CIRCLE */
7442                 || ch == 0x1F10B /* DINGBAT CIRCLED SANS-SERIF DIGIT ZERO */
7443                 || ch == 0x1F18E /* NEGATIVE SQUARED AB */
7444                 || (ch >= 0x1F191 && ch <= 0x1F19A) /* SQUARED CL..SQUARED VS */
7445                 || ch == 0x1F10C /* DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */)
7446               attr |= (int64_t) 1 << LBP_AI;
7447             else
7448               attr |= (int64_t) 1 << LBP_AL;
7449             attr &= ~((int64_t) 1 << LBP_CM);
7450           }
7451     }
7452   else
7453     {
7454       /* Unassigned character.  */
7455       if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
7456           || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
7457           || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
7458           || (ch >= 0x1F02C && ch <= 0x1F02F) /* reserved */
7459           || (ch >= 0x1F094 && ch <= 0x1F09F) /* reserved */
7460           || (ch >= 0x1F0AF && ch <= 0x1F0B0) /* reserved */
7461           || ch == 0x1F0C0 /* reserved */
7462           || ch == 0x1F0D0 /* reserved */
7463           || (ch >= 0x1F0F6 && ch <= 0x1F0FF) /* reserved */
7464           || (ch >= 0x1F10D && ch <= 0x1F10F) /* reserved */
7465           || ch == 0x1F12F /* reserved */
7466           || (ch >= 0x1F16C && ch <= 0x1F16F) /* reserved */
7467           || (ch >= 0x1F1AD && ch <= 0x1F1E5) /* reserved */
7468           || (ch >= 0x1F203 && ch <= 0x1F20F) /* reserved */
7469           || (ch >= 0x1F23C && ch <= 0x1F23F) /* reserved */
7470           || (ch >= 0x1F249 && ch <= 0x1F24F) /* reserved */
7471           || (ch >= 0x1F252 && ch <= 0x1F2FF) /* reserved */
7472           || (ch >= 0x1F6D3 && ch <= 0x1F6DF) /* reserved */
7473           || (ch >= 0x1F6ED && ch <= 0x1F6EF) /* reserved */
7474           || (ch >= 0x1F6F7 && ch <= 0x1F6FF) /* reserved */
7475           || (ch >= 0x1F774 && ch <= 0x1F77F) /* reserved */
7476           || (ch >= 0x1F7D5 && ch <= 0x1F7FF) /* reserved */
7477           || (ch >= 0x1F80C && ch <= 0x1F80F) /* reserved */
7478           || (ch >= 0x1F848 && ch <= 0x1F84F) /* reserved */
7479           || (ch >= 0x1F85A && ch <= 0x1F85F) /* reserved */
7480           || (ch >= 0x1F888 && ch <= 0x1F88F) /* reserved */
7481           || (ch >= 0x1F8AE && ch <= 0x1F90F) /* reserved */
7482           || ch == 0x1F91F /* reserved */
7483           || ch == 0x1F93F /* reserved */
7484           || (ch >= 0x1F928 && ch <= 0x1F92F) /* reserved */
7485           || (ch >= 0x1F931 && ch <= 0x1F932) /* reserved */
7486           || (ch >= 0x1F94C && ch <= 0x1F94F) /* reserved */
7487           || (ch >= 0x1F95F && ch <= 0x1F97F) /* reserved */
7488           || (ch >= 0x1F992 && ch <= 0x1F9BF) /* reserved */
7489           || (ch >= 0x1F9C1 && ch <= 0x1FFFD) /* reserved */
7490           || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
7491           || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
7492                                                  Supplementary Ideographic Plane (Plane 2) outside of blocks */
7493           || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
7494                                                  Supplementary Ideographic Plane (Plane 2) outside of blocks */
7495           || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
7496         attr |= (int64_t) 1 << LBP_ID;
7497     }
7498 
7499   if (attr == 0)
7500     /* unknown */
7501     attr |= (int64_t) 1 << LBP_XX;
7502 
7503   return attr;
7504 }
7505 
7506 /* Output the line breaking properties in a human readable format.  */
7507 static void
7508 debug_output_lbp (FILE *stream)
     /* [previous][next][first][last][top][bottom][index][help] */
7509 {
7510   unsigned int i;
7511 
7512   for (i = 0; i < 0x110000; i++)
7513     {
7514       int64_t attr = get_lbp (i);
7515       if (attr != (int64_t) 1 << LBP_XX)
7516         {
7517           fprintf (stream, "0x%04X", i);
7518 #define PRINT_BIT(attr,bit) \
7519   if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
7520           PRINT_BIT(attr,LBP_BK);
7521           PRINT_BIT(attr,LBP_CM);
7522           PRINT_BIT(attr,LBP_WJ);
7523           PRINT_BIT(attr,LBP_ZW);
7524           PRINT_BIT(attr,LBP_GL);
7525           PRINT_BIT(attr,LBP_SP);
7526           PRINT_BIT(attr,LBP_B2);
7527           PRINT_BIT(attr,LBP_BA);
7528           PRINT_BIT(attr,LBP_BB);
7529           PRINT_BIT(attr,LBP_HY);
7530           PRINT_BIT(attr,LBP_CB);
7531           PRINT_BIT(attr,LBP_CL);
7532           PRINT_BIT(attr,LBP_CP);
7533           PRINT_BIT(attr,LBP_EX);
7534           PRINT_BIT(attr,LBP_IN);
7535           PRINT_BIT(attr,LBP_NS);
7536           PRINT_BIT(attr,LBP_OP);
7537           PRINT_BIT(attr,LBP_QU);
7538           PRINT_BIT(attr,LBP_IS);
7539           PRINT_BIT(attr,LBP_NU);
7540           PRINT_BIT(attr,LBP_PO);
7541           PRINT_BIT(attr,LBP_PR);
7542           PRINT_BIT(attr,LBP_SY);
7543           PRINT_BIT(attr,LBP_AI);
7544           PRINT_BIT(attr,LBP_AL);
7545           PRINT_BIT(attr,LBP_H2);
7546           PRINT_BIT(attr,LBP_H3);
7547           PRINT_BIT(attr,LBP_HL);
7548           PRINT_BIT(attr,LBP_ID);
7549           PRINT_BIT(attr,LBP_JL);
7550           PRINT_BIT(attr,LBP_JV);
7551           PRINT_BIT(attr,LBP_JT);
7552           PRINT_BIT(attr,LBP_RI);
7553           PRINT_BIT(attr,LBP_SA);
7554           PRINT_BIT(attr,LBP_ZWJ);
7555           PRINT_BIT(attr,LBP_EB);
7556           PRINT_BIT(attr,LBP_EM);
7557           PRINT_BIT(attr,LBP_XX);
7558 #undef PRINT_BIT
7559           fprintf (stream, "\n");
7560         }
7561     }
7562 }
7563 
/* Write the human readable dump produced by debug_output_lbp() to the file
   FILENAME, which is opened for writing.  Prints a message to stderr and
   exits with status 1 if the file cannot be opened, or if a write error
   was recorded on the stream or closing it fails.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out != NULL)
    {
      debug_output_lbp (out);

      /* fclose is attempted only when no write error has been recorded.  */
      if (!ferror (out) && fclose (out) == 0)
        return;

      fprintf (stderr, "error writing to '%s'\n", filename);
    }
  else
    fprintf (stderr, "cannot open '%s' for writing\n", filename);

  exit (1);
}
7584 
7585 /* The line breaking property from the LineBreak.txt file.  */
7586 int unicode_org_lbp[0x110000];
7587 
7588 /* Stores in unicode_org_lbp[] the line breaking property from the
7589    LineBreak.txt file.  */
7590 static void
7591 fill_org_lbp (const char *linebreak_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
7592 {
7593   unsigned int i, j;
7594   FILE *stream;
7595   char field0[FIELDLEN];
7596   char field1[FIELDLEN];
7597   char field2[FIELDLEN];
7598   int lineno = 0;
7599 
7600   for (i = 0; i < 0x110000; i++)
7601     unicode_org_lbp[i] = LBP_XX;
7602 
7603   stream = fopen (linebreak_filename, "r");
7604   if (stream == NULL)
7605     {
7606       fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
7607       exit (1);
7608     }
7609 
7610   for (;;)
7611     {
7612       int n;
7613       int c;
7614       int value;
7615 
7616       lineno++;
7617       c = getc (stream);
7618       if (c == EOF)
7619         break;
7620       if (c == '#')
7621         {
7622           do c = getc (stream); while (c != EOF && c != '\n');
7623           continue;
7624         }
7625       ungetc (c, stream);
7626       n = getfield (stream, field0, ';');
7627       n += getfield (stream, field1, ' ');
7628       n += getfield (stream, field2, '\n');
7629       if (n == 0)
7630         break;
7631       if (n != 3)
7632         {
7633           fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
7634                    lineno);
7635           exit (1);
7636         }
7637 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
7638       if (false) {}
7639       TRY(LBP_BK)
7640       TRY(LBP_CM)
7641       TRY(LBP_WJ)
7642       TRY(LBP_ZW)
7643       TRY(LBP_GL)
7644       TRY(LBP_SP)
7645       TRY(LBP_B2)
7646       TRY(LBP_BA)
7647       TRY(LBP_BB)
7648       TRY(LBP_HY)
7649       TRY(LBP_CB)
7650       TRY(LBP_CL)
7651       TRY(LBP_CP)
7652       TRY(LBP_EX)
7653       TRY(LBP_IN)
7654       TRY(LBP_NS)
7655       TRY(LBP_OP)
7656       TRY(LBP_QU)
7657       TRY(LBP_IS)
7658       TRY(LBP_NU)
7659       TRY(LBP_PO)
7660       TRY(LBP_PR)
7661       TRY(LBP_SY)
7662       TRY(LBP_AI)
7663       TRY(LBP_AL)
7664       TRY(LBP_H2)
7665       TRY(LBP_H3)
7666       TRY(LBP_HL)
7667       TRY(LBP_ID)
7668       TRY(LBP_JL)
7669       TRY(LBP_JV)
7670       TRY(LBP_JT)
7671       TRY(LBP_RI)
7672       TRY(LBP_SA)
7673       TRY(LBP_ZWJ)
7674       TRY(LBP_EB)
7675       TRY(LBP_EM)
7676       TRY(LBP_XX)
7677 #undef TRY
7678       else if (strcmp (field1, "LF") == 0) value = LBP_BK;
7679       else if (strcmp (field1, "CR") == 0) value = LBP_BK;
7680       else if (strcmp (field1, "NL") == 0) value = LBP_BK;
7681       else if (strcmp (field1, "SG") == 0) value = LBP_XX;
7682       else if (strcmp (field1, "CJ") == 0) value = LBP_NS;
7683       else
7684         {
7685           fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
7686                    field1, linebreak_filename, lineno);
7687           exit (1);
7688         }
7689       i = strtoul (field0, NULL, 16);
7690       if (strstr (field0, "..") != NULL)
7691         {
7692           /* Deal with a range.  */
7693           j = strtoul (strstr (field0, "..") + 2, NULL, 16);
7694           for (; i <= j; i++)
7695             unicode_org_lbp[i] = value;
7696         }
7697       else
7698         {
7699           /* Single character line.  */
7700           unicode_org_lbp[i] = value;
7701         }
7702     }
7703 
7704   if (ferror (stream) || fclose (stream))
7705     {
7706       fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
7707       exit (1);
7708     }
7709 }
7710 
7711 /* Output the line breaking properties in a human readable format.  */
7712 static void
7713 debug_output_org_lbp (FILE *stream)
     /* [previous][next][first][last][top][bottom][index][help] */
7714 {
7715   unsigned int i;
7716 
7717   for (i = 0; i < 0x110000; i++)
7718     {
7719       int attr = unicode_org_lbp[i];
7720       if (attr != LBP_XX)
7721         {
7722           fprintf (stream, "0x%04X", i);
7723 #define PRINT_BIT(attr,bit) \
7724   if (attr == bit) fprintf (stream, " " #bit);
7725           PRINT_BIT(attr,LBP_BK);
7726           PRINT_BIT(attr,LBP_CM);
7727           PRINT_BIT(attr,LBP_WJ);
7728           PRINT_BIT(attr,LBP_ZW);
7729           PRINT_BIT(attr,LBP_GL);
7730           PRINT_BIT(attr,LBP_SP);
7731           PRINT_BIT(attr,LBP_B2);
7732           PRINT_BIT(attr,LBP_BA);
7733           PRINT_BIT(attr,LBP_BB);
7734           PRINT_BIT(attr,LBP_HY);
7735           PRINT_BIT(attr,LBP_CB);
7736           PRINT_BIT(attr,LBP_CL);
7737           PRINT_BIT(attr,LBP_CP);
7738           PRINT_BIT(attr,LBP_EX);
7739           PRINT_BIT(attr,LBP_IN);
7740           PRINT_BIT(attr,LBP_NS);
7741           PRINT_BIT(attr,LBP_OP);
7742           PRINT_BIT(attr,LBP_QU);
7743           PRINT_BIT(attr,LBP_IS);
7744           PRINT_BIT(attr,LBP_NU);
7745           PRINT_BIT(attr,LBP_PO);
7746           PRINT_BIT(attr,LBP_PR);
7747           PRINT_BIT(attr,LBP_SY);
7748           PRINT_BIT(attr,LBP_AI);
7749           PRINT_BIT(attr,LBP_AL);
7750           PRINT_BIT(attr,LBP_H2);
7751           PRINT_BIT(attr,LBP_H3);
7752           PRINT_BIT(attr,LBP_HL);
7753           PRINT_BIT(attr,LBP_ID);
7754           PRINT_BIT(attr,LBP_JL);
7755           PRINT_BIT(attr,LBP_JV);
7756           PRINT_BIT(attr,LBP_JT);
7757           PRINT_BIT(attr,LBP_RI);
7758           PRINT_BIT(attr,LBP_SA);
7759           PRINT_BIT(attr,LBP_ZWJ);
7760           PRINT_BIT(attr,LBP_EB);
7761           PRINT_BIT(attr,LBP_EM);
7762           PRINT_BIT(attr,LBP_XX);
7763 #undef PRINT_BIT
7764           fprintf (stream, "\n");
7765         }
7766     }
7767 }
7768 
/* Create FILENAME and write the human readable listing produced by
   debug_output_org_lbp into it.  Prints a message to stderr and exits with
   status 1 if the file cannot be opened or if writing fails.  */
static void
debug_output_org_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* The stream is only closed when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7789 
7790 /* Construction of sparse 3-level tables.
        "3level.h" is included here with TABLE, ELEMENT and DEFAULT set for
        the line breaking table.  output_lbp below uses struct lbp_table
        (fields p, q, level1_size, level2_size, level3_size, result) and the
        functions lbp_table_init, lbp_table_add and lbp_table_finalize;
        presumably the header derives all of them from TABLE.
        Elements are unsigned char and LBP_XX is the default value.
        xmalloc and xrealloc are mapped to plain malloc and realloc.
        NOTE(review): presumably 3level.h allocates through these names;
        how allocation failure is handled is not visible here -- confirm.  */
7791 #define TABLE lbp_table
7792 #define ELEMENT unsigned char
7793 #define DEFAULT LBP_XX
7794 #define xmalloc malloc
7795 #define xrealloc realloc
7796 #include "3level.h"
7797 
7798 static void
7799 output_lbp (FILE *stream1, FILE *stream2)
     /* [previous][next][first][last][top][bottom][index][help] */
7800 {
7801   unsigned int i;
7802   struct lbp_table t;
7803   unsigned int level1_offset, level2_offset, level3_offset;
7804 
7805   t.p = 7;
7806   t.q = 9;
7807   lbp_table_init (&t);
7808 
7809   for (i = 0; i < 0x110000; i++)
7810     {
7811       int64_t attr = get_lbp (i);
7812 
7813       /* Now attr should contain exactly one bit.  */
7814       assert (attr != 0 && (attr & (attr - 1)) == 0);
7815 
7816       if (attr != (int64_t) 1 << LBP_XX)
7817         {
7818           unsigned int log2_attr;
7819           for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7820 
7821           lbp_table_add (&t, i, log2_attr);
7822         }
7823     }
7824 
7825   lbp_table_finalize (&t);
7826 
7827   level1_offset =
7828     5 * sizeof (uint32_t);
7829   level2_offset =
7830     5 * sizeof (uint32_t)
7831     + t.level1_size * sizeof (uint32_t);
7832   level3_offset =
7833     5 * sizeof (uint32_t)
7834     + t.level1_size * sizeof (uint32_t)
7835     + (t.level2_size << t.q) * sizeof (uint32_t);
7836 
7837   for (i = 0; i < 5; i++)
7838     fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
7839              ((uint32_t *) t.result)[i]);
7840   fprintf (stream1, "\n");
7841   fprintf (stream1, "typedef struct\n");
7842   fprintf (stream1, "  {\n");
7843   fprintf (stream1, "    int level1[%zu];\n", t.level1_size);
7844   fprintf (stream1, "    int level2[%zu << %d];\n", t.level2_size, t.q);
7845   fprintf (stream1, "    unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7846   fprintf (stream1, "  }\n");
7847   fprintf (stream1, "lbrkprop_t;\n");
7848   fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
7849 
7850   fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
7851   fprintf (stream2, "{\n");
7852   fprintf (stream2, "  {");
7853   if (t.level1_size > 8)
7854     fprintf (stream2, "\n   ");
7855   for (i = 0; i < t.level1_size; i++)
7856     {
7857       uint32_t offset;
7858       if (i > 0 && (i % 8) == 0)
7859         fprintf (stream2, "\n   ");
7860       offset = ((uint32_t *) (t.result + level1_offset))[i];
7861       if (offset == 0)
7862         fprintf (stream2, " %5d", -1);
7863       else
7864         fprintf (stream2, " %5zu",
7865                  (offset - level2_offset) / sizeof (uint32_t));
7866       if (i+1 < t.level1_size)
7867         fprintf (stream2, ",");
7868     }
7869   if (t.level1_size > 8)
7870     fprintf (stream2, "\n ");
7871   fprintf (stream2, " },\n");
7872   fprintf (stream2, "  {");
7873   if (t.level2_size << t.q > 8)
7874     fprintf (stream2, "\n   ");
7875   for (i = 0; i < t.level2_size << t.q; i++)
7876     {
7877       uint32_t offset;
7878       if (i > 0 && (i % 8) == 0)
7879         fprintf (stream2, "\n   ");
7880       offset = ((uint32_t *) (t.result + level2_offset))[i];
7881       if (offset == 0)
7882         fprintf (stream2, " %5d", -1);
7883       else
7884         fprintf (stream2, " %5zu",
7885                  (offset - level3_offset) / sizeof (unsigned char));
7886       if (i+1 < t.level2_size << t.q)
7887         fprintf (stream2, ",");
7888     }
7889   if (t.level2_size << t.q > 8)
7890     fprintf (stream2, "\n ");
7891   fprintf (stream2, " },\n");
7892   fprintf (stream2, "  {");
7893   if (t.level3_size << t.p > 8)
7894     fprintf (stream2, "\n   ");
7895   for (i = 0; i < t.level3_size << t.p; i++)
7896     {
7897       unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7898       const char *value_string;
7899       switch (value)
7900         {
7901 #define CASE(x) case x: value_string = #x; break;
7902           CASE(LBP_BK);
7903           CASE(LBP_CM);
7904           CASE(LBP_WJ);
7905           CASE(LBP_ZW);
7906           CASE(LBP_GL);
7907           CASE(LBP_SP);
7908           CASE(LBP_B2);
7909           CASE(LBP_BA);
7910           CASE(LBP_BB);
7911           CASE(LBP_HY);
7912           CASE(LBP_CB);
7913           CASE(LBP_CL);
7914           CASE(LBP_CP);
7915           CASE(LBP_EX);
7916           CASE(LBP_IN);
7917           CASE(LBP_NS);
7918           CASE(LBP_OP);
7919           CASE(LBP_QU);
7920           CASE(LBP_IS);
7921           CASE(LBP_NU);
7922           CASE(LBP_PO);
7923           CASE(LBP_PR);
7924           CASE(LBP_SY);
7925           CASE(LBP_AI);
7926           CASE(LBP_AL);
7927           CASE(LBP_H2);
7928           CASE(LBP_H3);
7929           CASE(LBP_HL);
7930           CASE(LBP_ID);
7931           CASE(LBP_JL);
7932           CASE(LBP_JV);
7933           CASE(LBP_JT);
7934           CASE(LBP_RI);
7935           CASE(LBP_SA);
7936           CASE(LBP_ZWJ);
7937           CASE(LBP_EB);
7938           CASE(LBP_EM);
7939           CASE(LBP_XX);
7940 #undef CASE
7941           default:
7942             abort ();
7943         }
7944       if (i > 0 && (i % 8) == 0)
7945         fprintf (stream2, "\n   ");
7946       fprintf (stream2, " %s%s", value_string,
7947                (i+1 < t.level3_size << t.p ? "," : ""));
7948     }
7949   if (t.level3_size << t.p > 8)
7950     fprintf (stream2, "\n ");
7951   fprintf (stream2, " }\n");
7952   fprintf (stream2, "};\n");
7953 }
7954 
/* Create FILENAME1 and FILENAME2, write the "generated file" header with
   the Unicode VERSION into both, and let output_lbp fill them: FILENAME1 is
   passed as its first stream and FILENAME2 as its second.  Prints a message
   to stderr and exits with status 1 if a file cannot be opened or written.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2] = { filename1, filename2 };
  FILE *streams[2];
  size_t k;

  /* Both files are opened before anything is written.  */
  for (k = 0; k < 2; k++)
    {
      streams[k] = fopen (filenames[k], "w");
      if (streams[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
    }

  for (k = 0; k < 2; k++)
    {
      FILE *out = streams[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Line breaking properties of Unicode characters.  */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
               version);
      fputs ("\n", out);

      /* The comment opened here is left open; presumably
         output_library_license closes it.  */
      fputs ("/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n", out);
      fputs ("\n", out);
      output_library_license (out, false);
      fputs ("\n", out);
    }

  output_lbp (streams[0], streams[1]);

  for (k = 0; k < 2; k++)
    if (ferror (streams[k]) != 0 || fclose (streams[k]) != 0)
      {
        fprintf (stderr, "error writing to '%s'\n", filenames[k]);
        exit (1);
      }
}
8002 
8003 /* ========================================================================= */
8004 
8005 /* Word break property.
8006    Updated for Unicode TR #29 revision 17.  */
8007 
/* Possible values of the Word_Break property, listed in increasing numeric
   order.  get_wbp uses these values as bit positions of its result and
   output_wbp stores them as table elements.  The trailing comments give the
   spelling of each value in WordBreakProperty.txt, as parsed by
   fill_org_wbp.  */
enum
{
  WBP_OTHER = 0,
  WBP_KATAKANA = 1,       /* Katakana */
  WBP_ALETTER = 2,        /* ALetter */
  WBP_MIDNUMLET = 3,      /* MidNumLet */
  WBP_MIDLETTER = 4,      /* MidLetter */
  WBP_MIDNUM = 5,         /* MidNum */
  WBP_NUMERIC = 6,        /* Numeric */
  WBP_EXTENDNUMLET = 7,   /* ExtendNumLet */
  WBP_EXTEND = 8,         /* Extend */
  WBP_FORMAT = 9,         /* Format */
  WBP_NEWLINE = 10,       /* Newline */
  WBP_CR = 11,            /* CR */
  WBP_LF = 12,            /* LF */
  WBP_RI = 13,            /* Regional_Indicator */
  WBP_DQ = 14,            /* Double_Quote */
  WBP_SQ = 15,            /* Single_Quote */
  WBP_HL = 16,            /* Hebrew_Letter */
  WBP_ZWJ = 17,           /* ZWJ */
  WBP_EB = 18,            /* E_Base */
  WBP_EM = 19,            /* E_Modifier */
  WBP_GAZ = 20,           /* Glue_After_Zwj */
  WBP_EBG = 21            /* E_Base_GAZ */
};
8034 
8035 /* Returns the word breaking property for ch, as a bit mask.  */
8036 static int
8037 get_wbp (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
8038 {
8039   int attr = 0;
8040 
8041   if (unicode_attributes[ch].name != NULL)
8042     {
8043       if (ch == 0x000D)
8044         attr |= 1 << WBP_CR;
8045 
8046       if (ch == 0x000A)
8047         attr |= 1 << WBP_LF;
8048 
8049       if (ch == 0x000B || ch == 0x000C
8050           || ch == 0x0085
8051           || ch == 0x2028 || ch == 0x2029)
8052         attr |= 1 << WBP_NEWLINE;
8053 
8054       if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
8055           || ((unicode_properties[ch] >> PROP_OTHER_GRAPHEME_EXTEND) & 1) != 0
8056           || (unicode_attributes[ch].category != NULL
8057               && strcmp (unicode_attributes[ch].category, "Mc") == 0))
8058         attr |= 1 << WBP_EXTEND;
8059 
8060       if (unicode_attributes[ch].category != NULL
8061           && strcmp (unicode_attributes[ch].category, "Cf") == 0
8062           && ch != 0x200B && ch != 0x200C && ch != 0x200D
8063           && !(ch >= 0xe0020 && ch <= 0xe007f))
8064         attr |= 1 << WBP_FORMAT;
8065 
8066       if ((unicode_scripts[ch] < numscripts
8067            && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
8068           || (ch >= 0x3031 && ch <= 0x3035)
8069           || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
8070           || ch == 0xFF70)
8071         attr |= 1 << WBP_KATAKANA;
8072 
8073       if ((unicode_scripts[ch] < numscripts
8074            && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
8075           && strcmp (unicode_attributes[ch].category, "Lo") == 0)
8076         attr |= 1 << WBP_HL;
8077 
8078       if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
8079            || ch == 0x05F3)
8080           && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
8081           && (attr & (1 << WBP_KATAKANA)) == 0
8082           && ((get_lbp (ch) >> LBP_SA) & 1) == 0
8083           && !(unicode_scripts[ch] < numscripts
8084                && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
8085           && (attr & (1 << WBP_EXTEND)) == 0
8086           && (attr & (1 << WBP_HL)) == 0)
8087         attr |= 1 << WBP_ALETTER;
8088 
8089       if (is_WBP_MIDNUMLET (ch))
8090         attr |= 1 << WBP_MIDNUMLET;
8091 
8092       if (is_WBP_MIDLETTER (ch))
8093         attr |= 1 << WBP_MIDLETTER;
8094 
8095       if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
8096            || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
8097            || ch == 0xFF1B)
8098           && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
8099         attr |= 1 << WBP_MIDNUM;
8100 
8101       if (((get_lbp (ch) >> LBP_NU) & 1) != 0
8102           && ch != 0x066C)
8103         attr |= 1 << WBP_NUMERIC;
8104 
8105       if ((unicode_attributes[ch].category != NULL
8106            && strcmp (unicode_attributes[ch].category, "Pc") == 0)
8107           || ch == 0x202F /* NARROW NO-BREAK SPACE */)
8108         attr |= 1 << WBP_EXTENDNUMLET;
8109 
8110       if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
8111         attr |= 1 << WBP_RI;
8112 
8113       if (ch == 0x0022)
8114         attr |= 1 << WBP_DQ;
8115 
8116       if (ch == 0x0027)
8117         attr |= 1 << WBP_SQ;
8118 
8119       if (ch == 0x200D)
8120         attr |= 1 << WBP_ZWJ;
8121 
8122       if (ch >= 0x1F466 && ch <= 0x1F469)
8123         attr |= 1 << WBP_EBG;
8124       else if (((get_lbp (ch) >> LBP_EB) & 1) != 0)
8125         attr |= 1 << WBP_EB;
8126 
8127       if (((get_lbp (ch) >> LBP_EM) & 1) != 0)
8128         attr |= 1 << WBP_EM;
8129 
8130       if (ch == 0x2764 || ch == 0x1F48B || ch == 0x1F5E8)
8131         attr |= 1 << WBP_GAZ;
8132     }
8133 
8134   if (attr == 0)
8135     /* other */
8136     attr |= 1 << WBP_OTHER;
8137 
8138   return attr;
8139 }
8140 
8141 /* Output the word break property in a human readable format.  */
8142 static void
8143 debug_output_wbp (FILE *stream)
     /* [previous][next][first][last][top][bottom][index][help] */
8144 {
8145   unsigned int i;
8146 
8147   for (i = 0; i < 0x110000; i++)
8148     {
8149       int attr = get_wbp (i);
8150       if (attr != 1 << WBP_OTHER)
8151         {
8152           fprintf (stream, "0x%04X", i);
8153           if (attr & (1 << WBP_CR))
8154             fprintf (stream, " CR");
8155           if (attr & (1 << WBP_LF))
8156             fprintf (stream, " LF");
8157           if (attr & (1 << WBP_NEWLINE))
8158             fprintf (stream, " Newline");
8159           if (attr & (1 << WBP_EXTEND))
8160             fprintf (stream, " Extend");
8161           if (attr & (1 << WBP_FORMAT))
8162             fprintf (stream, " Format");
8163           if (attr & (1 << WBP_KATAKANA))
8164             fprintf (stream, " Katakana");
8165           if (attr & (1 << WBP_ALETTER))
8166             fprintf (stream, " ALetter");
8167           if (attr & (1 << WBP_MIDNUMLET))
8168             fprintf (stream, " MidNumLet");
8169           if (attr & (1 << WBP_MIDLETTER))
8170             fprintf (stream, " MidLetter");
8171           if (attr & (1 << WBP_MIDNUM))
8172             fprintf (stream, " MidNum");
8173           if (attr & (1 << WBP_NUMERIC))
8174             fprintf (stream, " Numeric");
8175           if (attr & (1 << WBP_EXTENDNUMLET))
8176             fprintf (stream, " ExtendNumLet");
8177           if (attr & (1 << WBP_RI))
8178             fprintf (stream, " Regional_Indicator");
8179           if (attr & (1 << WBP_DQ))
8180             fprintf (stream, " Double_Quote");
8181           if (attr & (1 << WBP_SQ))
8182             fprintf (stream, " Single_Quote");
8183           if (attr & (1 << WBP_HL))
8184             fprintf (stream, " Hebrew_Letter");
8185           if (attr & (1 << WBP_ZWJ))
8186             fprintf (stream, " ZWJ");
8187           if (attr & (1 << WBP_EB))
8188             fprintf (stream, " E_Base");
8189           if (attr & (1 << WBP_EM))
8190             fprintf (stream, " E_Modifier");
8191           if (attr & (1 << WBP_GAZ))
8192             fprintf (stream, " Glue_After_Zwj");
8193           if (attr & (1 << WBP_EBG))
8194             fprintf (stream, " E_Base_GAZ");
8195          fprintf (stream, "\n");
8196         }
8197     }
8198 }
8199 
/* Create FILENAME and write the human readable listing produced by
   debug_output_wbp into it.  Prints a message to stderr and exits with
   status 1 if the file cannot be opened or if writing fails.  */
static void
debug_output_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_wbp (out);

  /* The stream is only closed when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8220 
8221 /* The word break property from the WordBreakProperty.txt file.  */
8222 int unicode_org_wbp[0x110000];
8223 
8224 /* Stores in unicode_org_wbp[] the word break property from the
8225    WordBreakProperty.txt file.  */
8226 static void
8227 fill_org_wbp (const char *wordbreakproperty_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
8228 {
8229   unsigned int i;
8230   FILE *stream;
8231 
8232   for (i = 0; i < 0x110000; i++)
8233     unicode_org_wbp[i] = WBP_OTHER;
8234 
8235   stream = fopen (wordbreakproperty_filename, "r");
8236   if (stream == NULL)
8237     {
8238       fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
8239       exit (1);
8240     }
8241 
8242   for (;;)
8243     {
8244       char buf[200+1];
8245       unsigned int i1, i2;
8246       char padding[200+1];
8247       char propname[200+1];
8248       int propvalue;
8249 
8250       if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8251         break;
8252 
8253       if (buf[0] == '\0' || buf[0] == '#')
8254         continue;
8255 
8256       if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8257         {
8258           if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8259             {
8260               fprintf (stderr, "parse error in '%s'\n",
8261                        wordbreakproperty_filename);
8262               exit (1);
8263             }
8264           i2 = i1;
8265         }
8266 #define PROP(name,value) \
8267       if (strcmp (propname, name) == 0) propvalue = value; else
8268       PROP ("CR", WBP_CR)
8269       PROP ("LF", WBP_LF)
8270       PROP ("Newline", WBP_NEWLINE)
8271       PROP ("Extend", WBP_EXTEND)
8272       PROP ("Format", WBP_FORMAT)
8273       PROP ("Katakana", WBP_KATAKANA)
8274       PROP ("ALetter", WBP_ALETTER)
8275       PROP ("MidNumLet", WBP_MIDNUMLET)
8276       PROP ("MidLetter", WBP_MIDLETTER)
8277       PROP ("MidNum", WBP_MIDNUM)
8278       PROP ("Numeric", WBP_NUMERIC)
8279       PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
8280       PROP ("Regional_Indicator", WBP_RI)
8281       PROP ("Double_Quote", WBP_DQ)
8282       PROP ("Single_Quote", WBP_SQ)
8283       PROP ("Hebrew_Letter", WBP_HL)
8284       PROP ("ZWJ", WBP_ZWJ)
8285       PROP ("E_Base", WBP_EB)
8286       PROP ("E_Modifier", WBP_EM)
8287       PROP ("Glue_After_Zwj", WBP_GAZ)
8288       PROP ("E_Base_GAZ", WBP_EBG)
8289 #undef PROP
8290         {
8291           fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
8292                    wordbreakproperty_filename);
8293           exit (1);
8294         }
8295       assert (i1 <= i2 && i2 < 0x110000);
8296 
8297       for (i = i1; i <= i2; i++)
8298         unicode_org_wbp[i] = propvalue;
8299     }
8300 
8301   if (ferror (stream) || fclose (stream))
8302     {
8303       fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
8304       exit (1);
8305     }
8306 }
8307 
8308 /* Output the word break property in a human readable format.  */
8309 static void
8310 debug_output_org_wbp (FILE *stream)
     /* [previous][next][first][last][top][bottom][index][help] */
8311 {
8312   unsigned int i;
8313 
8314   for (i = 0; i < 0x110000; i++)
8315     {
8316       int propvalue = unicode_org_wbp[i];
8317       if (propvalue != WBP_OTHER)
8318         {
8319           fprintf (stream, "0x%04X", i);
8320 #define PROP(name,value) \
8321           if (propvalue == value) fprintf (stream, " " name); else
8322           PROP ("CR", WBP_CR)
8323           PROP ("LF", WBP_LF)
8324           PROP ("Newline", WBP_NEWLINE)
8325           PROP ("Extend", WBP_EXTEND)
8326           PROP ("Format", WBP_FORMAT)
8327           PROP ("Katakana", WBP_KATAKANA)
8328           PROP ("ALetter", WBP_ALETTER)
8329           PROP ("MidNumLet", WBP_MIDNUMLET)
8330           PROP ("MidLetter", WBP_MIDLETTER)
8331           PROP ("MidNum", WBP_MIDNUM)
8332           PROP ("Numeric", WBP_NUMERIC)
8333           PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
8334           PROP ("Regional_Indicator", WBP_RI)
8335           PROP ("Double_Quote", WBP_DQ)
8336           PROP ("Single_Quote", WBP_SQ)
8337           PROP ("Hebrew_Letter", WBP_HL)
8338           PROP ("ZWJ", WBP_ZWJ)
8339           PROP ("E_Base", WBP_EB)
8340           PROP ("E_Modifier", WBP_EM)
8341           PROP ("Glue_After_Zwj", WBP_GAZ)
8342           PROP ("E_Base_GAZ", WBP_EBG)
8343 #undef PROP
8344           fprintf (stream, " ??");
8345           fprintf (stream, "\n");
8346         }
8347     }
8348 }
8349 
/* Create FILENAME and write the human readable listing produced by
   debug_output_org_wbp into it.  Prints a message to stderr and exits with
   status 1 if the file cannot be opened or if writing fails.  */
static void
debug_output_org_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_wbp (out);

  /* The stream is only closed when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8370 
8371 /* Construction of sparse 3-level tables.
        "3level.h" is included here with TABLE, ELEMENT and DEFAULT set for
        the word break table.  output_wbp below uses struct wbp_table
        (fields p, q, level1_size, level2_size, level3_size, result) and the
        functions wbp_table_init, wbp_table_add and wbp_table_finalize;
        presumably the header derives all of them from TABLE.
        Elements are unsigned char and WBP_OTHER is the default value.
        xmalloc and xrealloc are mapped to plain malloc and realloc.
        NOTE(review): presumably 3level.h allocates through these names;
        how allocation failure is handled is not visible here -- confirm.  */
8372 #define TABLE wbp_table
8373 #define ELEMENT unsigned char
8374 #define DEFAULT WBP_OTHER
8375 #define xmalloc malloc
8376 #define xrealloc realloc
8377 #include "3level.h"
8378 
8379 static void
8380 output_wbp (FILE *stream)
     /* [previous][next][first][last][top][bottom][index][help] */
8381 {
8382   unsigned int i;
8383   struct wbp_table t;
8384   unsigned int level1_offset, level2_offset, level3_offset;
8385 
8386   t.p = 7;
8387   t.q = 9;
8388   wbp_table_init (&t);
8389 
8390   for (i = 0; i < 0x110000; i++)
8391     {
8392       int attr = get_wbp (i);
8393 
8394       /* Now attr should contain exactly one bit.  */
8395       assert (attr != 0 && (attr & (attr - 1)) == 0);
8396 
8397       if (attr != 1 << WBP_OTHER)
8398         {
8399           unsigned int log2_attr;
8400           for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
8401 
8402           wbp_table_add (&t, i, log2_attr);
8403         }
8404     }
8405 
8406   wbp_table_finalize (&t);
8407 
8408   level1_offset =
8409     5 * sizeof (uint32_t);
8410   level2_offset =
8411     5 * sizeof (uint32_t)
8412     + t.level1_size * sizeof (uint32_t);
8413   level3_offset =
8414     5 * sizeof (uint32_t)
8415     + t.level1_size * sizeof (uint32_t)
8416     + (t.level2_size << t.q) * sizeof (uint32_t);
8417 
8418   for (i = 0; i < 5; i++)
8419     fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
8420              ((uint32_t *) t.result)[i]);
8421   fprintf (stream, "\n");
8422   fprintf (stream, "typedef struct\n");
8423   fprintf (stream, "  {\n");
8424   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
8425   fprintf (stream, "    int level2[%zu << %d];\n", t.level2_size, t.q);
8426   fprintf (stream, "    unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
8427   fprintf (stream, "  }\n");
8428   fprintf (stream, "wbrkprop_t;\n");
8429   fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
8430   fprintf (stream, "{\n");
8431   fprintf (stream, "  {");
8432   if (t.level1_size > 8)
8433     fprintf (stream, "\n   ");
8434   for (i = 0; i < t.level1_size; i++)
8435     {
8436       uint32_t offset;
8437       if (i > 0 && (i % 8) == 0)
8438         fprintf (stream, "\n   ");
8439       offset = ((uint32_t *) (t.result + level1_offset))[i];
8440       if (offset == 0)
8441         fprintf (stream, " %5d", -1);
8442       else
8443         fprintf (stream, " %5zu",
8444                  (offset - level2_offset) / sizeof (uint32_t));
8445       if (i+1 < t.level1_size)
8446         fprintf (stream, ",");
8447     }
8448   if (t.level1_size > 8)
8449     fprintf (stream, "\n ");
8450   fprintf (stream, " },\n");
8451   fprintf (stream, "  {");
8452   if (t.level2_size << t.q > 8)
8453     fprintf (stream, "\n   ");
8454   for (i = 0; i < t.level2_size << t.q; i++)
8455     {
8456       uint32_t offset;
8457       if (i > 0 && (i % 8) == 0)
8458         fprintf (stream, "\n   ");
8459       offset = ((uint32_t *) (t.result + level2_offset))[i];
8460       if (offset == 0)
8461         fprintf (stream, " %5d", -1);
8462       else
8463         fprintf (stream, " %5zu",
8464                  (offset - level3_offset) / sizeof (unsigned char));
8465       if (i+1 < t.level2_size << t.q)
8466         fprintf (stream, ",");
8467     }
8468   if (t.level2_size << t.q > 8)
8469     fprintf (stream, "\n ");
8470   fprintf (stream, " },\n");
8471   fprintf (stream, "  {");
8472   if (t.level3_size << t.p > 4)
8473     fprintf (stream, "\n   ");
8474   for (i = 0; i < t.level3_size << t.p; i++)
8475     {
8476       unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
8477       const char *value_string;
8478       switch (value)
8479         {
8480 #define CASE(x) case x: value_string = #x; break;
8481           CASE(WBP_OTHER);
8482           CASE(WBP_CR);
8483           CASE(WBP_LF);
8484           CASE(WBP_NEWLINE);
8485           CASE(WBP_EXTEND);
8486           CASE(WBP_FORMAT);
8487           CASE(WBP_KATAKANA);
8488           CASE(WBP_ALETTER);
8489           CASE(WBP_MIDNUMLET);
8490           CASE(WBP_MIDLETTER);
8491           CASE(WBP_MIDNUM);
8492           CASE(WBP_NUMERIC);
8493           CASE(WBP_EXTENDNUMLET);
8494           CASE(WBP_RI);
8495           CASE(WBP_DQ);
8496           CASE(WBP_SQ);
8497           CASE(WBP_HL);
8498           CASE(WBP_ZWJ);
8499           CASE(WBP_EB);
8500           CASE(WBP_EM);
8501           CASE(WBP_GAZ);
8502           CASE(WBP_EBG);
8503 #undef CASE
8504           default:
8505             abort ();
8506         }
8507       if (i > 0 && (i % 4) == 0)
8508         fprintf (stream, "\n   ");
8509       fprintf (stream, " %s%s", value_string,
8510                (i+1 < t.level3_size << t.p ? "," : ""));
8511     }
8512   if (t.level3_size << t.p > 4)
8513     fprintf (stream, "\n ");
8514   fprintf (stream, " }\n");
8515   fprintf (stream, "};\n");
8516 }
8517 
/* Create FILENAME, write the "generated file" header with the Unicode
   VERSION into it, and let output_wbp append the word break table.  Prints
   a message to stderr and exits with status 1 if the file cannot be opened
   or if writing fails.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Word breaking properties of Unicode characters.  */\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
           version);
  fputs ("\n", out);

  /* The comment opened here is left open; presumably
     output_library_license closes it.  */
  fputs ("/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_library_license (out, false);
  fputs ("\n", out);

  output_wbp (out);

  /* The stream is only closed when no write error has been recorded.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8549 
8550 /* ========================================================================= */
8551 
8552 /* Grapheme break property.
8553    Updated for Unicode TR #29 revision 29.  */
8554 
/* Possible values of the Grapheme_Cluster_Break property.
   These are the values kept in unicode_org_gbp[] and added to the
   gbp_table, whose default element is GBP_OTHER.  */
enum
{
  GBP_OTHER = 0,
  GBP_CR = 1,
  GBP_LF = 2,
  GBP_CONTROL = 3,
  GBP_EXTEND = 4,
  GBP_PREPEND = 5,
  GBP_SPACINGMARK = 6,
  GBP_L = 7,
  GBP_V = 8,
  GBP_T = 9,
  GBP_LV = 10,
  GBP_LVT = 11,
  GBP_RI = 12,
  GBP_ZWJ = 13,
  GBP_EB = 14,
  GBP_EM = 15,
  GBP_GAZ = 16,
  GBP_EBG = 17
};
8577 
8578 /* Construction of sparse 3-level tables.
        "3level.h" is included here with TABLE, ELEMENT and DEFAULT set for
        the grapheme break table.  output_gbp_table below uses struct
        gbp_table and the functions gbp_table_init, gbp_table_add and
        gbp_table_finalize; presumably the header derives all of them from
        TABLE.  Elements are unsigned char and GBP_OTHER is the default
        value.
        xmalloc and xrealloc are mapped to plain malloc and realloc.
        NOTE(review): presumably 3level.h allocates through these names;
        how allocation failure is handled is not visible here -- confirm.  */
8579 #define TABLE gbp_table
8580 #define ELEMENT unsigned char
8581 #define DEFAULT GBP_OTHER
8582 #define xmalloc malloc
8583 #define xrealloc realloc
8584 #include "3level.h"
8585 
8586 /* The grapheme break property from the GraphemeBreakProperty.txt file.  */
8587 int unicode_org_gbp[0x110000];
8588 
8589 /* Output the unit test data for the grapheme break property.  */
8590 static void
8591 output_gbp_test (const char *filename)
     /* [previous][next][first][last][top][bottom][index][help] */
8592 {
8593   FILE *stream;
8594   bool need_comma;
8595   unsigned int ch;
8596 
8597   stream = fopen (filename, "w");
8598   if (stream == NULL)
8599     {
8600       fprintf (stderr, "cannot open '%s' for writing\n", filename);
8601       exit (1);
8602     }
8603 
8604   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8605   fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
8606   fprintf (stream, "   Copyright (C) 2010 Free Software Foundation, Inc.\n");
8607   fprintf (stream, "\n");
8608   output_tests_license (stream);
8609   fprintf (stream, "\n");
8610 
8611   need_comma = false;
8612   for (ch = 0; ch < 0x110000; ch++)
8613     {
8614       int gbp = unicode_org_gbp[ch];
8615       const char *gbp_string;
8616 
8617       while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
8618         ch++;
8619 
8620       switch (gbp)
8621         {
8622 #define CASE(x) case x: gbp_string = #x; break;
8623       CASE (GBP_OTHER)
8624       CASE (GBP_CR)
8625       CASE (GBP_LF)
8626       CASE (GBP_CONTROL)
8627       CASE (GBP_EXTEND)
8628       CASE (GBP_PREPEND)
8629       CASE (GBP_SPACINGMARK)
8630       CASE (GBP_L)
8631       CASE (GBP_V)
8632       CASE (GBP_T)
8633       CASE (GBP_LV)
8634       CASE (GBP_LVT)
8635       CASE (GBP_RI)
8636       CASE (GBP_ZWJ)
8637       CASE (GBP_EB)
8638       CASE (GBP_EM)
8639       CASE (GBP_GAZ)
8640       CASE (GBP_EBG)
8641 #undef CASE
8642         default:
8643           abort ();
8644         }
8645 
8646       if (need_comma)
8647         fprintf (stream, ",\n");
8648       fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
8649 
8650       need_comma = true;
8651     }
8652   fprintf (stream, "\n");
8653 
8654   if (ferror (stream) || fclose (stream))
8655     {
8656       fprintf (stderr, "error writing to '%s'\n", filename);
8657       exit (1);
8658     }
8659 }
8660 
8661 /* Output the per-character grapheme break property table.  */
8662 static void
8663 output_gbp_table (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
8664 {
8665   FILE *stream;
8666   unsigned int ch, i;
8667   struct gbp_table t;
8668   unsigned int level1_offset, level2_offset, level3_offset;
8669 
8670   stream = fopen (filename, "w");
8671   if (stream == NULL)
8672     {
8673       fprintf (stderr, "cannot open '%s' for writing\n", filename);
8674       exit (1);
8675     }
8676 
8677   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8678   fprintf (stream, "/* Grapheme break property of Unicode characters.  */\n");
8679   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
8680            version);
8681   fprintf (stream, "\n");
8682 
8683   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
8684   fprintf (stream, "\n");
8685   output_library_license (stream, false);
8686   fprintf (stream, "\n");
8687 
8688   t.p = 7;
8689   t.q = 9;
8690   gbp_table_init (&t);
8691 
8692   for (ch = 0; ch < 0x110000; ch++)
8693     gbp_table_add (&t, ch, unicode_org_gbp[ch]);
8694 
8695   gbp_table_finalize (&t);
8696 
8697   /* Offsets in t.result, in memory of this process.  */
8698   level1_offset =
8699     5 * sizeof (uint32_t);
8700   level2_offset =
8701     5 * sizeof (uint32_t)
8702     + t.level1_size * sizeof (uint32_t);
8703   level3_offset =
8704     5 * sizeof (uint32_t)
8705     + t.level1_size * sizeof (uint32_t)
8706     + (t.level2_size << t.q) * sizeof (uint32_t);
8707 
8708   for (i = 0; i < 5; i++)
8709     fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
8710              ((uint32_t *) t.result)[i]);
8711   fprintf (stream, "static const\n");
8712   fprintf (stream, "struct\n");
8713   fprintf (stream, "  {\n");
8714   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
8715   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
8716   fprintf (stream, "    unsigned char level3[%zu << %d];\n",
8717            t.level3_size, t.p);
8718   fprintf (stream, "  }\n");
8719   fprintf (stream, "unigbrkprop =\n");
8720   fprintf (stream, "{\n");
8721   fprintf (stream, "  {");
8722   if (t.level1_size > 8)
8723     fprintf (stream, "\n   ");
8724   for (i = 0; i < t.level1_size; i++)
8725     {
8726       uint32_t offset;
8727       if (i > 0 && (i % 8) == 0)
8728         fprintf (stream, "\n   ");
8729       offset = ((uint32_t *) (t.result + level1_offset))[i];
8730       if (offset == 0)
8731         fprintf (stream, " %5d", -1);
8732       else
8733         fprintf (stream, " %5zu",
8734                  (offset - level2_offset) / sizeof (uint32_t));
8735       if (i+1 < t.level1_size)
8736         fprintf (stream, ",");
8737     }
8738   if (t.level1_size > 8)
8739     fprintf (stream, "\n ");
8740   fprintf (stream, " },\n");
8741   fprintf (stream, "  {");
8742   if (t.level2_size << t.q > 8)
8743     fprintf (stream, "\n   ");
8744   for (i = 0; i < t.level2_size << t.q; i++)
8745     {
8746       uint32_t offset;
8747       if (i > 0 && (i % 8) == 0)
8748         fprintf (stream, "\n   ");
8749       offset = ((uint32_t *) (t.result + level2_offset))[i];
8750       if (offset == 0)
8751         fprintf (stream, " %5d", -1);
8752       else
8753         fprintf (stream, " %5zu",
8754                  (offset - level3_offset) / sizeof (uint8_t));
8755       if (i+1 < t.level2_size << t.q)
8756         fprintf (stream, ",");
8757     }
8758   if (t.level2_size << t.q > 8)
8759     fprintf (stream, "\n ");
8760   fprintf (stream, " },\n");
8761   fprintf (stream, "  {");
8762   if (t.level3_size << t.p > 4)
8763     fprintf (stream, "\n   ");
8764   for (i = 0; i < t.level3_size << t.p; i++)
8765     {
8766       unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
8767       const char *value_string;
8768       switch (value)
8769         {
8770 #define CASE(x) case x: value_string = #x; break;
8771       CASE (GBP_OTHER)
8772       CASE (GBP_CR)
8773       CASE (GBP_LF)
8774       CASE (GBP_CONTROL)
8775       CASE (GBP_EXTEND)
8776       CASE (GBP_PREPEND)
8777       CASE (GBP_SPACINGMARK)
8778       CASE (GBP_L)
8779       CASE (GBP_V)
8780       CASE (GBP_T)
8781       CASE (GBP_LV)
8782       CASE (GBP_LVT)
8783       CASE (GBP_RI)
8784       CASE (GBP_ZWJ)
8785       CASE (GBP_EB)
8786       CASE (GBP_EM)
8787       CASE (GBP_GAZ)
8788       CASE (GBP_EBG)
8789 #undef CASE
8790           default:
8791             abort ();
8792         }
8793       if (i > 0 && (i % 4) == 0)
8794         fprintf (stream, "\n   ");
8795       fprintf (stream, " %s%s", value_string,
8796                (i+1 < t.level3_size << t.p ? "," : ""));
8797     }
8798   if (t.level3_size << t.p > 4)
8799     fprintf (stream, "\n ");
8800   fprintf (stream, " }\n");
8801   fprintf (stream, "};\n");
8802 
8803   if (ferror (stream) || fclose (stream))
8804     {
8805       fprintf (stderr, "error writing to '%s'\n", filename);
8806       exit (1);
8807     }
8808 }
8809 
8810 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
8811    GraphemeBreakProperty.txt file.  */
8812 static void
8813 fill_org_gbp (const char *graphemebreakproperty_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
8814 {
8815   unsigned int i;
8816   FILE *stream;
8817   int lineno = 0;
8818 
8819   for (i = 0; i < 0x110000; i++)
8820     unicode_org_gbp[i] = GBP_OTHER;
8821 
8822   stream = fopen (graphemebreakproperty_filename, "r");
8823   if (stream == NULL)
8824     {
8825       fprintf (stderr, "error during fopen of '%s'\n",
8826                graphemebreakproperty_filename);
8827       exit (1);
8828     }
8829 
8830   for (;;)
8831     {
8832       char buf[200+1];
8833       unsigned int i1, i2;
8834       char padding[200+1];
8835       char propname[200+1];
8836       int propvalue;
8837 
8838       lineno++;
8839       if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8840         break;
8841 
8842       if (buf[0] == '\0' || buf[0] == '#')
8843         continue;
8844 
8845       if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
8846         {
8847           if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
8848             {
8849               fprintf (stderr, "parse error in '%s'\n",
8850                        graphemebreakproperty_filename);
8851               exit (1);
8852             }
8853           i2 = i1;
8854         }
8855 #define PROP(name,value) \
8856       if (strcmp (propname, name) == 0) propvalue = value; else
8857       PROP ("CR", GBP_CR)
8858       PROP ("LF", GBP_LF)
8859       PROP ("Control", GBP_CONTROL)
8860       PROP ("Extend", GBP_EXTEND)
8861       PROP ("Prepend", GBP_PREPEND)
8862       PROP ("SpacingMark", GBP_SPACINGMARK)
8863       PROP ("L", GBP_L)
8864       PROP ("V", GBP_V)
8865       PROP ("T", GBP_T)
8866       PROP ("LV", GBP_LV)
8867       PROP ("LVT", GBP_LVT)
8868       PROP ("Regional_Indicator", GBP_RI)
8869       PROP ("ZWJ", GBP_ZWJ)
8870       PROP ("E_Base", GBP_EB)
8871       PROP ("E_Modifier", GBP_EM)
8872       PROP ("Glue_After_Zwj", GBP_GAZ)
8873       PROP ("E_Base_GAZ", GBP_EBG)
8874 #undef PROP
8875         {
8876           fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
8877                    graphemebreakproperty_filename, lineno);
8878           exit (1);
8879         }
8880       assert (i1 <= i2 && i2 < 0x110000);
8881 
8882       for (i = i1; i <= i2; i++)
8883         unicode_org_gbp[i] = propvalue;
8884     }
8885 
8886   if (ferror (stream) || fclose (stream))
8887     {
8888       fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
8889       exit (1);
8890     }
8891 }
8892 
8893 /* ========================================================================= */
8894 
8895 /* Composition and decomposition.
8896    Updated for Unicode TR #15 revision 33.  */
8897 
/* Upper bound on the number of code points that the decomposition of a
   single Unicode character can consist of.  */
#define MAX_DECOMP_LENGTH 18

/* Decomposition types.  The numeric values are spelled out because they are
   written into the generated tables.  The tag shown in each comment is the
   one that introduces the decomposition in the character data; the canonical
   decomposition has no tag.  */
enum
{
  /* Canonical decomposition.  */
  UC_DECOMP_CANONICAL = 0,
  /* <font>: A font variant (e.g. a blackletter form).  */
  UC_DECOMP_FONT = 1,
  /* <noBreak>: A no-break version of a space or hyphen.  */
  UC_DECOMP_NOBREAK = 2,
  /* <initial>: An initial presentation form (Arabic).  */
  UC_DECOMP_INITIAL = 3,
  /* <medial>: A medial presentation form (Arabic).  */
  UC_DECOMP_MEDIAL = 4,
  /* <final>: A final presentation form (Arabic).  */
  UC_DECOMP_FINAL = 5,
  /* <isolated>: An isolated presentation form (Arabic).  */
  UC_DECOMP_ISOLATED = 6,
  /* <circle>: An encircled form.  */
  UC_DECOMP_CIRCLE = 7,
  /* <super>: A superscript form.  */
  UC_DECOMP_SUPER = 8,
  /* <sub>: A subscript form.  */
  UC_DECOMP_SUB = 9,
  /* <vertical>: A vertical layout presentation form.  */
  UC_DECOMP_VERTICAL = 10,
  /* <wide>: A wide (or zenkaku) compatibility character.  */
  UC_DECOMP_WIDE = 11,
  /* <narrow>: A narrow (or hankaku) compatibility character.  */
  UC_DECOMP_NARROW = 12,
  /* <small>: A small variant form (CNS compatibility).  */
  UC_DECOMP_SMALL = 13,
  /* <square>: A CJK squared font variant.  */
  UC_DECOMP_SQUARE = 14,
  /* <fraction>: A vulgar fraction form.  */
  UC_DECOMP_FRACTION = 15,
  /* <compat>: Otherwise unspecified compatibility character.  */
  UC_DECOMP_COMPAT = 16
};
8922 
8923 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
8924    decompositions).  Return the type, or -1 for none.  */
8925 static int
8926 get_decomposition (unsigned int ch,
     /* [previous][next][first][last][top][bottom][index][help] */
8927                    unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
8928 {
8929   const char *decomposition = unicode_attributes[ch].decomposition;
8930 
8931   if (decomposition != NULL && decomposition[0] != '\0')
8932     {
8933       int type = UC_DECOMP_CANONICAL;
8934       unsigned int length;
8935       char *endptr;
8936 
8937       if (decomposition[0] == '<')
8938         {
8939           const char *rangle;
8940           size_t typelen;
8941 
8942           rangle = strchr (decomposition + 1, '>');
8943           assert (rangle != NULL);
8944           typelen = rangle + 1 - decomposition;
8945 #define TYPE(t1,t2) \
8946           if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
8947             type = t2; \
8948           else
8949           TYPE ("<font>", UC_DECOMP_FONT)
8950           TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
8951           TYPE ("<initial>", UC_DECOMP_INITIAL)
8952           TYPE ("<medial>", UC_DECOMP_MEDIAL)
8953           TYPE ("<final>", UC_DECOMP_FINAL)
8954           TYPE ("<isolated>", UC_DECOMP_ISOLATED)
8955           TYPE ("<circle>", UC_DECOMP_CIRCLE)
8956           TYPE ("<super>", UC_DECOMP_SUPER)
8957           TYPE ("<sub>", UC_DECOMP_SUB)
8958           TYPE ("<vertical>", UC_DECOMP_VERTICAL)
8959           TYPE ("<wide>", UC_DECOMP_WIDE)
8960           TYPE ("<narrow>", UC_DECOMP_NARROW)
8961           TYPE ("<small>", UC_DECOMP_SMALL)
8962           TYPE ("<square>", UC_DECOMP_SQUARE)
8963           TYPE ("<fraction>", UC_DECOMP_FRACTION)
8964           TYPE ("<compat>", UC_DECOMP_COMPAT)
8965             {
8966               fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
8967               exit (1);
8968             }
8969 #undef TYPE
8970           decomposition = rangle + 1;
8971           if (decomposition[0] == ' ')
8972             decomposition++;
8973         }
8974       for (length = 0; length < MAX_DECOMP_LENGTH; length++)
8975         {
8976           decomposed[length] = strtoul (decomposition, &endptr, 16);
8977           if (endptr == decomposition)
8978             break;
8979           decomposition = endptr;
8980           if (decomposition[0] == ' ')
8981             decomposition++;
8982         }
8983       /* Make sure that *DECOMPOSITION is not NULL-terminated.
8984          Otherwise MAX_DECOMP_LENGTH is too small.  */
8985       assert (*decomposition == '\0');
8986 
8987       *lengthp = length;
8988       return type;
8989     }
8990   else
8991     return -1;
8992 }
8993 
8994 /* Construction of sparse 3-level tables.  */
8995 #define TABLE decomp_table
8996 #define ELEMENT uint16_t
8997 #define DEFAULT (uint16_t)(-1)
8998 #define xmalloc malloc
8999 #define xrealloc realloc
9000 #include "3level.h"
9001 
9002 static void
9003 output_decomposition (FILE *stream1, FILE *stream2)
     /* [previous][next][first][last][top][bottom][index][help] */
9004 {
9005   struct decomp_table t;
9006   unsigned int level1_offset, level2_offset, level3_offset;
9007   unsigned int offset;
9008   unsigned int ch;
9009   unsigned int i;
9010 
9011   t.p = 5;
9012   t.q = 5;
9013   decomp_table_init (&t);
9014 
9015   fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
9016   fprintf (stream1, "\n");
9017   fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
9018   offset = 0;
9019 
9020   for (ch = 0; ch < 0x110000; ch++)
9021     {
9022       unsigned int length;
9023       unsigned int decomposed[MAX_DECOMP_LENGTH];
9024       int type = get_decomposition (ch, &length, decomposed);
9025 
9026       if (type >= 0)
9027         {
9028           assert (offset < (1 << 15));
9029           decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
9030 
9031           /* Produce length 3-bytes entries.  */
9032           /* We would need a special representation of zero-length entries.  */
9033           assert (length != 0);
9034           for (i = 0; i < length; i++)
9035             {
9036               if (offset > 0)
9037                 fprintf (stream2, ",");
9038               if ((offset % 4) == 0)
9039                 fprintf (stream2, "\n ");
9040               assert (decomposed[i] < (1 << 18));
9041               fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
9042                        (((i+1 < length ? (1 << 23) : 0)
9043                          | (i == 0 ? (type << 18) : 0)
9044                          | decomposed[i]) >> 16) & 0xff,
9045                        (decomposed[i] >> 8) & 0xff,
9046                        decomposed[i] & 0xff);
9047               offset++;
9048             }
9049         }
9050     }
9051 
9052   fprintf (stream2, "\n};\n");
9053   fprintf (stream2, "\n");
9054 
9055   decomp_table_finalize (&t);
9056 
9057   level1_offset =
9058     5 * sizeof (uint32_t);
9059   level2_offset =
9060     5 * sizeof (uint32_t)
9061     + t.level1_size * sizeof (uint32_t);
9062   level3_offset =
9063     5 * sizeof (uint32_t)
9064     + t.level1_size * sizeof (uint32_t)
9065     + (t.level2_size << t.q) * sizeof (uint32_t);
9066 
9067   for (i = 0; i < 5; i++)
9068     fprintf (stream1, "#define decomp_header_%d %d\n", i,
9069              ((uint32_t *) t.result)[i]);
9070   fprintf (stream1, "\n");
9071   fprintf (stream1, "typedef struct\n");
9072   fprintf (stream1, "  {\n");
9073   fprintf (stream1, "    int level1[%zu];\n", t.level1_size);
9074   fprintf (stream1, "    int level2[%zu << %d];\n", t.level2_size, t.q);
9075   fprintf (stream1, "    unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
9076   fprintf (stream1, "  }\n");
9077   fprintf (stream1, "decomp_index_table_t;\n");
9078   fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
9079   fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
9080   fprintf (stream2, "{\n");
9081   fprintf (stream2, "  {");
9082   if (t.level1_size > 8)
9083     fprintf (stream2, "\n   ");
9084   for (i = 0; i < t.level1_size; i++)
9085     {
9086       uint32_t offset;
9087       if (i > 0 && (i % 8) == 0)
9088         fprintf (stream2, "\n   ");
9089       offset = ((uint32_t *) (t.result + level1_offset))[i];
9090       if (offset == 0)
9091         fprintf (stream2, " %5d", -1);
9092       else
9093         fprintf (stream2, " %5zu",
9094                  (offset - level2_offset) / sizeof (uint32_t));
9095       if (i+1 < t.level1_size)
9096         fprintf (stream2, ",");
9097     }
9098   if (t.level1_size > 8)
9099     fprintf (stream2, "\n ");
9100   fprintf (stream2, " },\n");
9101   fprintf (stream2, "  {");
9102   if (t.level2_size << t.q > 8)
9103     fprintf (stream2, "\n   ");
9104   for (i = 0; i < t.level2_size << t.q; i++)
9105     {
9106       uint32_t offset;
9107       if (i > 0 && (i % 8) == 0)
9108         fprintf (stream2, "\n   ");
9109       offset = ((uint32_t *) (t.result + level2_offset))[i];
9110       if (offset == 0)
9111         fprintf (stream2, " %5d", -1);
9112       else
9113         fprintf (stream2, " %5zu",
9114                  (offset - level3_offset) / sizeof (uint16_t));
9115       if (i+1 < t.level2_size << t.q)
9116         fprintf (stream2, ",");
9117     }
9118   if (t.level2_size << t.q > 8)
9119     fprintf (stream2, "\n ");
9120   fprintf (stream2, " },\n");
9121   fprintf (stream2, "  {");
9122   if (t.level3_size << t.p > 8)
9123     fprintf (stream2, "\n   ");
9124   for (i = 0; i < t.level3_size << t.p; i++)
9125     {
9126       uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
9127       if (i > 0 && (i % 8) == 0)
9128         fprintf (stream2, "\n   ");
9129       fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
9130       if (i+1 < t.level3_size << t.p)
9131         fprintf (stream2, ",");
9132     }
9133   if (t.level3_size << t.p > 8)
9134     fprintf (stream2, "\n ");
9135   fprintf (stream2, " }\n");
9136   fprintf (stream2, "};\n");
9137 }
9138 
/* Create the two files that hold the decomposition tables: FILENAME1 gets
   what output_decomposition writes to its first stream, FILENAME2 what it
   writes to its second stream.  Both files start with the same header
   comment, which mentions the Unicode VERSION, and the library license.
   Exits with status 1 if a file cannot be opened or written.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2];
  FILE *streams[2];
  size_t k;

  filenames[0] = filename1;
  filenames[1] = filename2;

  /* Open both files before anything is written.  */
  for (k = 0; k < 2; k++)
    {
      FILE *fp = fopen (filenames[k], "w");

      if (fp == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
      streams[k] = fp;
    }

  /* Identical header in both files.  */
  for (k = 0; k < 2; k++)
    {
      FILE *fp = streams[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", fp);
      fputs ("/* Decomposition of Unicode characters.  */\n", fp);
      fprintf (fp, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
               version);
      fputs ("\n", fp);

      fputs ("/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n", fp);
      fputs ("\n", fp);
      output_library_license (fp, true);
      fputs ("\n", fp);
    }

  output_decomposition (streams[0], streams[1]);

  /* Detect write errors and close the files.  */
  for (k = 0; k < 2; k++)
    if (ferror (streams[k]) || fclose (streams[k]))
      {
        fprintf (stderr, "error writing to '%s'\n", filenames[k]);
        exit (1);
      }
}
9186 
/* The "excluded from composition" property from the CompositionExclusions.txt file.  */
char unicode_composition_exclusions[0x110000];

/* Fills unicode_composition_exclusions[] from the CompositionExclusions.txt
   file: the entry of every code point listed in the file is set to 1, all
   other entries are set to 0.

   Each data line starts with a hexadecimal code point; whatever follows it
   on the line is ignored.  Empty lines and lines starting with '#' are
   skipped.  The file is read line by line, so that a blank line at the
   beginning of the file does not end the parsing prematurely.  On any I/O
   error, syntax error or out-of-range code point, a message is printed to
   stderr and the program exits with status 1.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  FILE *stream;
  unsigned int i;
  int lineno = 0;

  stream = fopen (compositionexclusions_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  for (i = 0; i < 0x110000; i++)
    unicode_composition_exclusions[i] = 0;

  for (;;)
    {
      /* Room for 200 characters, the newline and the terminating NUL.  */
      char buf[200+2];
      const char *line;
      size_t len;
      unsigned int code;

      if (fgets (buf, sizeof (buf), stream) == NULL)
        break;
      lineno++;

      /* Cut off the line terminator (LF or CRLF).  */
      len = strcspn (buf, "\r\n");
      if (len == sizeof (buf) - 1)
        {
          /* The line did not fit into buf.  Discard its remainder, so that
             it is not mistaken for a line of its own.  */
          int c;

          while ((c = getc (stream)) != EOF && c != '\n')
            ;
        }
      buf[len] = '\0';

      /* Ignore leading blanks, empty lines and comment lines.  */
      line = buf + strspn (buf, " \t");
      if (line[0] == '\0' || line[0] == '#')
        continue;

      if (sscanf (line, "%X", &code) != 1)
        {
          fprintf (stderr, "parse error in %s:%d\n",
                   compositionexclusions_filename, lineno);
          exit (1);
        }

      /* The code point comes from the file and is used as an array index;
         validate it unconditionally rather than through assert(), which
         disappears when NDEBUG is defined.  */
      if (code >= 0x110000)
        {
          fprintf (stderr, "invalid code point in %s:%d\n",
                   compositionexclusions_filename, lineno);
          exit (1);
        }

      unicode_composition_exclusions[code] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
9233 
9234 static void
9235 debug_output_composition_tables (const char *filename)
     /* [previous][next][first][last][top][bottom][index][help] */
9236 {
9237   FILE *stream;
9238   unsigned int ch;
9239 
9240   stream = fopen (filename, "w");
9241   if (stream == NULL)
9242     {
9243       fprintf (stderr, "cannot open '%s' for writing\n", filename);
9244       exit (1);
9245     }
9246 
9247   for (ch = 0; ch < 0x110000; ch++)
9248     {
9249       unsigned int length;
9250       unsigned int decomposed[MAX_DECOMP_LENGTH];
9251       int type = get_decomposition (ch, &length, decomposed);
9252 
9253       if (type == UC_DECOMP_CANONICAL
9254           /* Consider only binary decompositions.
9255              Exclude singleton decompositions.  */
9256           && length == 2)
9257         {
9258           unsigned int code1 = decomposed[0];
9259           unsigned int code2 = decomposed[1];
9260           unsigned int combined = ch;
9261 
9262           /* Exclude decompositions where the first part is not a starter,
9263              i.e. is not of canonical combining class 0.  */
9264           if (strcmp (unicode_attributes[code1].combining, "0") == 0
9265               /* Exclude characters listed in CompositionExclusions.txt.  */
9266               && !unicode_composition_exclusions[combined])
9267             {
9268               /* The combined character must now also be a starter.
9269                  Verify this.  */
9270               assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
9271 
9272               fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
9273                        code1,
9274                        code2,
9275                        combined,
9276                        unicode_attributes[code2].combining);
9277             }
9278         }
9279     }
9280 
9281   if (ferror (stream) || fclose (stream))
9282     {
9283       fprintf (stderr, "error writing to '%s'\n", filename);
9284       exit (1);
9285     }
9286 }
9287 
9288 static void
9289 output_composition_tables (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
9290 {
9291   FILE *stream;
9292   unsigned int ch;
9293 
9294   stream = fopen (filename, "w");
9295   if (stream == NULL)
9296     {
9297       fprintf (stderr, "cannot open '%s' for writing\n", filename);
9298       exit (1);
9299     }
9300 
9301   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9302   fprintf (stream, "/* Canonical composition of Unicode characters.  */\n");
9303   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
9304            version);
9305   fprintf (stream, "\n");
9306 
9307   fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
9308   fprintf (stream, "\n");
9309   output_library_license (stream, true);
9310   fprintf (stream, "\n");
9311 
9312   /* The composition table is a set of mappings (code1, code2) -> combined,
9313      with 928 entries,
9314      367 values for code1 (from 0x003C to 0x30FD),
9315       54 values for code2 (from 0x0300 to 0x309A).
9316      For a fixed code1, there are from 1 to 19 possible values for code2.
9317      For a fixed code2, there are from 1 to 117 possible values for code1.
9318      This is a very sparse matrix.
9319 
9320      We want an O(1) hash lookup.
9321 
9322      We could implement the hash lookup by mapping (code1, code2) to a linear
9323      combination  mul1*code1 + mul2*code2, which is then used as an index into
9324      a 3-level table.  But this leads to a table of size 37 KB.
9325 
9326      We use gperf to implement the hash lookup, giving it the 928 sets of
9327      4 bytes (code1, code2) as input.  gperf generates a hash table of size
9328      1527, which is quite good (60% filled).  It requires an auxiliary table
9329      lookup in a table of size 0.5 KB.  The total tables size is 11 KB.  */
9330 
9331   fprintf (stream, "struct composition_rule { char codes[6]; };\n");
9332   fprintf (stream, "%%struct-type\n");
9333   fprintf (stream, "%%language=ANSI-C\n");
9334   fprintf (stream, "%%define slot-name codes\n");
9335   fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
9336   fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
9337   fprintf (stream, "%%compare-lengths\n");
9338   fprintf (stream, "%%compare-strncmp\n");
9339   fprintf (stream, "%%readonly-tables\n");
9340   fprintf (stream, "%%omit-struct-type\n");
9341   fprintf (stream, "%%%%\n");
9342 
9343   for (ch = 0; ch < 0x110000; ch++)
9344     {
9345       unsigned int length;
9346       unsigned int decomposed[MAX_DECOMP_LENGTH];
9347       int type = get_decomposition (ch, &length, decomposed);
9348 
9349       if (type == UC_DECOMP_CANONICAL
9350           /* Consider only binary decompositions.
9351              Exclude singleton decompositions.  */
9352           && length == 2)
9353         {
9354           unsigned int code1 = decomposed[0];
9355           unsigned int code2 = decomposed[1];
9356           unsigned int combined = ch;
9357 
9358           /* Exclude decompositions where the first part is not a starter,
9359              i.e. is not of canonical combining class 0.  */
9360           if (strcmp (unicode_attributes[code1].combining, "0") == 0
9361               /* Exclude characters listed in CompositionExclusions.txt.  */
9362               && !unicode_composition_exclusions[combined])
9363             {
9364               /* The combined character must now also be a starter.
9365                  Verify this.  */
9366               assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
9367 
9368               fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
9369                        (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
9370                        (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
9371                        combined);
9372             }
9373         }
9374     }
9375 
9376   if (ferror (stream) || fclose (stream))
9377     {
9378       fprintf (stderr, "error writing to '%s'\n", filename);
9379       exit (1);
9380     }
9381 }
9382 
9383 /* ========================================================================= */
9384 
/* Write the test data for a simple character mapping to FILENAME.
   FUNC is the mapping; every character that FUNC does not map to itself is
   listed as a "{ character, mapped value }" pair.  The list is placed
   between includes of "test-mapping-part1.h" and "test-mapping-part2.h",
   with the macro MAP defined to call FUNCTION_NAME.  VERSION is the Unicode
   version quoted in the generated comment.
   Exits with status 1 if FILENAME cannot be opened or written.  */

static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *out;
  bool wrote_entry;
  unsigned int ch;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Test the Unicode character mapping functions.\n", out);
  fputs ("   Copyright (C) 2009 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_tests_license (out);
  fputs ("\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
           version);
  fputs ("\n", out);
  fputs ("#include \"test-mapping-part1.h\"\n", out);
  fputs ("\n", out);

  /* One entry per character that is changed by the mapping; entries are
     separated by ",\n" and the last one is followed by a newline.  */
  wrote_entry = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int mapped = func (ch);

      if (mapped == ch)
        continue;

      if (wrote_entry)
        fputs (",\n", out);
      fprintf (out, "    { 0x%04X, 0x%04X }", ch, mapped);
      wrote_entry = true;
    }
  if (wrote_entry)
    fputs ("\n", out);

  fputs ("\n", out);
  fprintf (out, "#define MAP(c) %s (c)\n", function_name);
  fputs ("#include \"test-mapping-part2.h\"\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
9442 
9443 /* Construction of sparse 3-level tables.  */
9444 #define TABLE mapping_table
9445 #define ELEMENT int32_t
9446 #define DEFAULT 0
9447 #define xmalloc malloc
9448 #define xrealloc realloc
9449 #include "3level.h"
9450 
9451 /* Output a simple character mapping table to the given file.  */
9452 
9453 static void
9454 output_simple_mapping (const char *filename,
     /* [previous][next][first][last][top][bottom][index][help] */
9455                        unsigned int (*func) (unsigned int),
9456                        const char *version)
9457 {
9458   FILE *stream;
9459   unsigned int ch, i;
9460   struct mapping_table t;
9461   unsigned int level1_offset, level2_offset, level3_offset;
9462 
9463   stream = fopen (filename, "w");
9464   if (stream == NULL)
9465     {
9466       fprintf (stderr, "cannot open '%s' for writing\n", filename);
9467       exit (1);
9468     }
9469 
9470   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9471   fprintf (stream, "/* Simple character mapping of Unicode characters.  */\n");
9472   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
9473            version);
9474   fprintf (stream, "\n");
9475 
9476   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
9477   fprintf (stream, "\n");
9478   output_library_license (stream, false);
9479   fprintf (stream, "\n");
9480 
9481   t.p = 7;
9482   t.q = 9;
9483   mapping_table_init (&t);
9484 
9485   for (ch = 0; ch < 0x110000; ch++)
9486     {
9487       int value = (int) func (ch) - (int) ch;
9488 
9489       mapping_table_add (&t, ch, value);
9490     }
9491 
9492   mapping_table_finalize (&t);
9493 
9494   /* Offsets in t.result, in memory of this process.  */
9495   level1_offset =
9496     5 * sizeof (uint32_t);
9497   level2_offset =
9498     5 * sizeof (uint32_t)
9499     + t.level1_size * sizeof (uint32_t);
9500   level3_offset =
9501     5 * sizeof (uint32_t)
9502     + t.level1_size * sizeof (uint32_t)
9503     + (t.level2_size << t.q) * sizeof (uint32_t);
9504 
9505   for (i = 0; i < 5; i++)
9506     fprintf (stream, "#define mapping_header_%d %d\n", i,
9507              ((uint32_t *) t.result)[i]);
9508   fprintf (stream, "static const\n");
9509   fprintf (stream, "struct\n");
9510   fprintf (stream, "  {\n");
9511   fprintf (stream, "    int level1[%zu];\n", t.level1_size);
9512   fprintf (stream, "    short level2[%zu << %d];\n", t.level2_size, t.q);
9513   fprintf (stream, "    int level3[%zu << %d];\n", t.level3_size, t.p);
9514   fprintf (stream, "  }\n");
9515   fprintf (stream, "u_mapping =\n");
9516   fprintf (stream, "{\n");
9517   fprintf (stream, "  {");
9518   if (t.level1_size > 8)
9519     fprintf (stream, "\n   ");
9520   for (i = 0; i < t.level1_size; i++)
9521     {
9522       uint32_t offset;
9523       if (i > 0 && (i % 8) == 0)
9524         fprintf (stream, "\n   ");
9525       offset = ((uint32_t *) (t.result + level1_offset))[i];
9526       if (offset == 0)
9527         fprintf (stream, " %5d", -1);
9528       else
9529         fprintf (stream, " %5zu",
9530                  (offset - level2_offset) / sizeof (uint32_t));
9531       if (i+1 < t.level1_size)
9532         fprintf (stream, ",");
9533     }
9534   if (t.level1_size > 8)
9535     fprintf (stream, "\n ");
9536   fprintf (stream, " },\n");
9537   fprintf (stream, "  {");
9538   if (t.level2_size << t.q > 8)
9539     fprintf (stream, "\n   ");
9540   for (i = 0; i < t.level2_size << t.q; i++)
9541     {
9542       uint32_t offset;
9543       if (i > 0 && (i % 8) == 0)
9544         fprintf (stream, "\n   ");
9545       offset = ((uint32_t *) (t.result + level2_offset))[i];
9546       if (offset == 0)
9547         fprintf (stream, " %5d", -1);
9548       else
9549         fprintf (stream, " %5zu",
9550                  (offset - level3_offset) / sizeof (int32_t));
9551       if (i+1 < t.level2_size << t.q)
9552         fprintf (stream, ",");
9553     }
9554   if (t.level2_size << t.q > 8)
9555     fprintf (stream, "\n ");
9556   fprintf (stream, " },\n");
9557   fprintf (stream, "  {");
9558   if (t.level3_size << t.p > 8)
9559     fprintf (stream, "\n   ");
9560   for (i = 0; i < t.level3_size << t.p; i++)
9561     {
9562       if (i > 0 && (i % 8) == 0)
9563         fprintf (stream, "\n   ");
9564       fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
9565       if (i+1 < t.level3_size << t.p)
9566         fprintf (stream, ",");
9567     }
9568   if (t.level3_size << t.p > 8)
9569     fprintf (stream, "\n ");
9570   fprintf (stream, " }\n");
9571   fprintf (stream, "};\n");
9572 
9573   if (ferror (stream) || fclose (stream))
9574     {
9575       fprintf (stderr, "error writing to '%s'\n", filename);
9576       exit (1);
9577     }
9578 }
9579 
9580 /* ========================================================================= */
9581 
/* The context condition of a special casing rule.
   SCC_ALWAYS means that the rule has no context condition.  The other values
   stand for the condition names "Final_Sigma", "After_Soft_Dotted",
   "More_Above", "Before_Dot" and "After_I".
   The negation of a condition ("Not_" prefix) is represented by the negated
   value: x -> -x.  */
enum
{
  SCC_ALWAYS            = 0,
  SCC_FINAL_SIGMA       = 1,
  SCC_AFTER_SOFT_DOTTED = 2,
  SCC_MORE_ABOVE        = 3,
  SCC_BEFORE_DOT        = 4,
  SCC_AFTER_I           = 5
};
9593 
/* A special casing rule.  */
struct special_casing_rule
{
  /* The code point to which the rule applies.  */
  unsigned int code;
  /* The lowercase, titlecase, uppercase and casefolded replacements of CODE.
     Each is a sequence of up to 3 code points; unused trailing elements
     are 0.  */
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  /* The 2-letter language code to which the rule is restricted, or NULL if
     the rule has no language restriction.  */
  const char *language;
  /* The context condition: an SCC_* value, or the negation of one.  */
  int context;
};

/* The special casing rules: an array of pointers to rules.
   The first num_casing_rules elements are in use; there is room for
   allocated_casing_rules elements.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Appends NEW_RULE to casing_rules, enlarging the array when it is full.
   Only the pointer is stored; the rule itself is not copied.
   Terminates the program with an error message if memory is exhausted.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      /* Double the capacity, starting with at least 16 elements.  */
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      /* Keep the old pointer until the reallocation is known to have
         succeeded.  */
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
9625 
9626 /* Stores in casing_rules the special casing rules found in
9627    specialcasing_filename.  */
9628 static void
9629 fill_casing_rules (const char *specialcasing_filename)
     /* [previous][next][first][last][top][bottom][index][help] */
9630 {
9631   FILE *stream;
9632 
9633   stream = fopen (specialcasing_filename, "r");
9634   if (stream == NULL)
9635     {
9636       fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
9637       exit (1);
9638     }
9639 
9640   casing_rules = NULL;
9641   num_casing_rules = 0;
9642   allocated_casing_rules = 0;
9643 
9644   for (;;)
9645     {
9646       char buf[200+1];
9647       char *scanptr;
9648       char *endptr;
9649       int i;
9650 
9651       unsigned int code;
9652       unsigned int lower_mapping[3];
9653       unsigned int title_mapping[3];
9654       unsigned int upper_mapping[3];
9655       char *language;
9656       int context;
9657 
9658       if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9659         break;
9660 
9661       if (buf[0] == '\0' || buf[0] == '#')
9662         continue;
9663 
9664       /* Scan code.  */
9665       scanptr = buf;
9666       code = strtoul (scanptr, &endptr, 16);
9667       if (endptr == scanptr)
9668         {
9669           fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9670           exit (1);
9671         }
9672       scanptr = endptr;
9673       if (*scanptr != ';')
9674         {
9675           fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9676           exit (1);
9677         }
9678       scanptr++;
9679 
9680       /* Scan lower mapping.  */
9681       for (i = 0; i < 3; i++)
9682         lower_mapping[i] = 0;
9683       for (i = 0; i < 3; i++)
9684         {
9685           while (*scanptr == ' ')
9686             scanptr++;
9687           if (*scanptr == ';')
9688             break;
9689           lower_mapping[i] = strtoul (scanptr, &endptr, 16);
9690           if (endptr == scanptr)
9691             {
9692               fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9693               exit (1);
9694             }
9695           scanptr = endptr;
9696         }
9697       if (*scanptr != ';')
9698         {
9699           fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9700           exit (1);
9701         }
9702       scanptr++;
9703 
9704       /* Scan title mapping.  */
9705       for (i = 0; i < 3; i++)
9706         title_mapping[i] = 0;
9707       for (i = 0; i < 3; i++)
9708         {
9709           while (*scanptr == ' ')
9710             scanptr++;
9711           if (*scanptr == ';')
9712             break;
9713           title_mapping[i] = strtoul (scanptr, &endptr, 16);
9714           if (endptr == scanptr)
9715             {
9716               fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9717               exit (1);
9718             }
9719           scanptr = endptr;
9720         }
9721       if (*scanptr != ';')
9722         {
9723           fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9724           exit (1);
9725         }
9726       scanptr++;
9727 
9728       /* Scan upper mapping.  */
9729       for (i = 0; i < 3; i++)
9730         upper_mapping[i] = 0;
9731       for (i = 0; i < 3; i++)
9732         {
9733           while (*scanptr == ' ')
9734             scanptr++;
9735           if (*scanptr == ';')
9736             break;
9737           upper_mapping[i] = strtoul (scanptr, &endptr, 16);
9738           if (endptr == scanptr)
9739             {
9740               fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9741               exit (1);
9742             }
9743           scanptr = endptr;
9744         }
9745       if (*scanptr != ';')
9746         {
9747           fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9748           exit (1);
9749         }
9750       scanptr++;
9751 
9752       /* Scan language and context.  */
9753       language = NULL;
9754       context = SCC_ALWAYS;
9755       while (*scanptr == ' ')
9756         scanptr++;
9757       if (*scanptr != '\0' && *scanptr != '#')
9758         {
9759           const char *word_begin = scanptr;
9760           const char *word_end;
9761 
9762           while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9763             scanptr++;
9764           word_end = scanptr;
9765 
9766           while (*scanptr == ' ')
9767             scanptr++;
9768 
9769           if (word_end - word_begin == 2)
9770             {
9771               language = (char *) malloc ((word_end - word_begin) + 1);
9772               memcpy (language, word_begin, 2);
9773               language[word_end - word_begin] = '\0';
9774               word_begin = word_end = NULL;
9775 
9776               if (*scanptr != '\0' && *scanptr != '#' &&  *scanptr != ';')
9777                 {
9778                   word_begin = scanptr;
9779                   while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
9780                     scanptr++;
9781                   word_end = scanptr;
9782                 }
9783             }
9784 
9785           if (word_end > word_begin)
9786             {
9787               bool negate = false;
9788 
9789               if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
9790                 {
9791                   word_begin += 4;
9792                   negate = true;
9793                 }
9794               if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
9795                 context = SCC_FINAL_SIGMA;
9796               else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
9797                 context = SCC_AFTER_SOFT_DOTTED;
9798               else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
9799                 context = SCC_MORE_ABOVE;
9800               else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
9801                 context = SCC_BEFORE_DOT;
9802               else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
9803                 context = SCC_AFTER_I;
9804               else
9805                 {
9806                   fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
9807                   exit (1);
9808                 }
9809               if (negate)
9810                 context = - context;
9811             }
9812 
9813           if (*scanptr != '\0' && *scanptr != '#' &&  *scanptr != ';')
9814             {
9815               fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
9816               exit (1);
9817             }
9818         }
9819 
9820       /* Store the rule.  */
9821       {
9822         struct special_casing_rule *new_rule =
9823           (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
9824         new_rule->code = code;
9825         new_rule->language = language;
9826         new_rule->context = context;
9827         memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
9828         memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
9829         memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
9830 
9831         add_casing_rule (new_rule);
9832       }
9833     }
9834 
9835   if (ferror (stream) || fclose (stream))
9836     {
9837       fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
9838       exit (1);
9839     }
9840 }
9841 
/* A casefolding rule.  */
struct casefold_rule
{
  /* The code point to which the rule applies.  */
  unsigned int code;
  /* The casefolded replacement of CODE: a sequence of up to 3 code points;
     unused trailing elements are 0.  */
  unsigned int mapping[3];
  /* The 2-letter language code to which the rule is restricted, or NULL if
     the rule has no language restriction.  */
  const char *language;
};

/* The casefolding rules: an array of pointers to rules.
   The first num_casefolding_rules elements are in use; there is room for
   allocated_casefolding_rules elements.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;

/* Reports a syntax error in the file FILENAME and terminates the program.  */
static void
casefolding_parse_error (const char *filename)
{
  fprintf (stderr, "parse error in '%s'\n", filename);
  exit (1);
}

/* Appends NEW_RULE to casefolding_rules, enlarging the array when it is
   full.
   Terminates the program with an error message if memory is exhausted.  */
static void
add_casefolding_rule (struct casefold_rule *new_rule)
{
  if (num_casefolding_rules == allocated_casefolding_rules)
    {
      /* Double the capacity, starting with at least 16 elements.  */
      unsigned int new_allocated = 2 * allocated_casefolding_rules;
      struct casefold_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      new_rules =
        (struct casefold_rule **)
        realloc (casefolding_rules,
                 new_allocated * sizeof (struct casefold_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casefolding_rules = new_rules;
      allocated_casefolding_rules = new_allocated;
    }
  casefolding_rules[num_casefolding_rules++] = new_rule;
}

/* Stores in casefolding_rules the case folding rules found in
   casefolding_filename.
   Each line that is neither empty nor a '#' comment must have the form
     code; type; mapping; ...
   where the code and the mapping (up to 3 numbers) are hexadecimal and the
   type is one of 'C', 'F', 'S', 'T'.
   Rules of type 'S' are ignored.  A rule of type 'T' is stored twice, once
   for each of the languages "tr" and "az".  All other rules are stored once,
   without language restriction.
   Terminates the program with an error message if the file cannot be read,
   if it is malformed, or if memory is exhausted.  */
static void
fill_casefolding_rules (const char *casefolding_filename)
{
  FILE *stream;

  stream = fopen (casefolding_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
      exit (1);
    }

  casefolding_rules = NULL;
  num_casefolding_rules = 0;
  allocated_casefolding_rules = 0;

  for (;;)
    {
      char buf[200+1];
      char *scanptr;
      char *endptr;
      unsigned int i;

      unsigned int code;
      char type;
      unsigned int mapping[3];

      /* Read one line, at most 200 characters of it.  The trailing "\n" in
         the format also consumes the whitespace that follows the line.  */
      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Scan code.  */
      scanptr = buf;
      code = strtoul (scanptr, &endptr, 16);
      if (endptr == scanptr || *endptr != ';')
        casefolding_parse_error (casefolding_filename);
      scanptr = endptr + 1;

      /* Scan type.  */
      while (*scanptr == ' ')
        scanptr++;

      type = *scanptr;
      if (!(type == 'C' || type == 'F' || type == 'S' || type == 'T'))
        casefolding_parse_error (casefolding_filename);
      scanptr++;
      if (*scanptr != ';')
        casefolding_parse_error (casefolding_filename);
      scanptr++;

      /* Scan casefold mapping.  */
      for (i = 0; i < 3; i++)
        mapping[i] = 0;
      for (i = 0; i < 3; i++)
        {
          while (*scanptr == ' ')
            scanptr++;
          if (*scanptr == ';')
            break;
          mapping[i] = strtoul (scanptr, &endptr, 16);
          if (endptr == scanptr)
            casefolding_parse_error (casefolding_filename);
          scanptr = endptr;
        }
      if (*scanptr != ';')
        casefolding_parse_error (casefolding_filename);
      scanptr++;

      /* Ignore rules of type 'S'; we use the rules of type 'F' instead.  */
      if (type != 'S')
        {
          /* Type 'T' indicates that the rule is applicable to Turkish
             languages only.  */
          static const char * const turkish_languages[] = { "tr", "az" };
          static const char * const all_languages[] = { NULL };
          const char * const *languages;
          unsigned int languages_count;

          if (type == 'T')
            {
              languages = turkish_languages;
              languages_count =
                sizeof (turkish_languages) / sizeof (turkish_languages[0]);
            }
          else
            {
              languages = all_languages;
              languages_count =
                sizeof (all_languages) / sizeof (all_languages[0]);
            }

          for (i = 0; i < languages_count; i++)
            {
              /* Store a new rule.  */
              struct casefold_rule *new_rule =
                (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
              if (new_rule == NULL)
                {
                  fprintf (stderr, "out of memory\n");
                  exit (1);
                }
              new_rule->code = code;
              memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
              new_rule->language = languages[i];

              add_casefolding_rule (new_rule);
            }
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
      exit (1);
    }
}
10002 
/* Casefold mapping, when it maps to a single character.
   Indexed by code point.  */
unsigned int unicode_casefold[0x110000];

/* Returns the element of unicode_casefold for the code point CH.
   CH must be < 0x110000.  */
static unsigned int
to_casefold (unsigned int ch)
{
  /* CH may originate from a number read from an input file.  Don't read
     beyond the end of the table.  */
  assert (ch < 0x110000);
  return unicode_casefold[ch];
}
10011 
10012 /* Redistribute the casefolding_rules:
10013    - Rules that map to a single character, language independently, are stored
10014      in unicode_casefold.
10015    - Other rules are merged into casing_rules.  */
10016 static void
10017 redistribute_casefolding_rules (void)
     /* [previous][next][first][last][top][bottom][index][help] */
10018 {
10019   unsigned int ch, i, j;
10020 
10021   /* Fill unicode_casefold[].  */
10022   for (ch = 0; ch < 0x110000; ch++)
10023     unicode_casefold[ch] = ch;
10024   for (i = 0; i < num_casefolding_rules; i++)
10025     {
10026       struct casefold_rule *cfrule = casefolding_rules[i];
10027 
10028       if (cfrule->language == NULL && cfrule->mapping[1] == 0)
10029         {
10030           ch = cfrule->code;
10031           assert (ch < 0x110000);
10032           unicode_casefold[ch] = cfrule->mapping[0];
10033         }
10034     }
10035 
10036   /* Extend the special casing rules by filling in their casefold_mapping[]
10037      field.  */
10038   for (j = 0; j < num_casing_rules; j++)
10039     {
10040       struct special_casing_rule *rule = casing_rules[j];
10041       unsigned int k;
10042 
10043       rule->casefold_mapping[0] = to_casefold (rule->code);
10044       for (k = 1; k < 3; k++)
10045         rule->casefold_mapping[k] = 0;
10046     }
10047 
10048   /* Now merge the other casefolding rules into casing_rules.  */
10049   for (i = 0; i < num_casefolding_rules; i++)
10050     {
10051       struct casefold_rule *cfrule = casefolding_rules[i];
10052 
10053       if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
10054         {
10055           /* Find a rule that applies to the same code, same language, and it
10056              has context SCC_ALWAYS.  At the same time, update all rules that
10057              have the same code and same or more specific language.  */
10058           struct special_casing_rule *found_rule = NULL;
10059 
10060           for (j = 0; j < num_casing_rules; j++)
10061             {
10062               struct special_casing_rule *rule = casing_rules[j];
10063 
10064               if (rule->code == cfrule->code
10065                   && (cfrule->language == NULL
10066                       || (rule->language != NULL
10067                           && strcmp (rule->language, cfrule->language) == 0)))
10068                 {
10069                   memcpy (rule->casefold_mapping, cfrule->mapping,
10070                           sizeof (rule->casefold_mapping));
10071 
10072                   if ((cfrule->language == NULL
10073                        ? rule->language == NULL
10074                        : rule->language != NULL
10075                          && strcmp (rule->language, cfrule->language) == 0)
10076                       && rule->context == SCC_ALWAYS)
10077                     {
10078                       /* Found it.  */
10079                       found_rule = rule;
10080                     }
10081                 }
10082             }
10083 
10084           if (found_rule == NULL)
10085             {
10086               /* Create a new rule.  */
10087               struct special_casing_rule *new_rule =
10088                 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
10089 
10090               /* Try to find a rule that applies to the same code, no language
10091                  restriction, and with context SCC_ALWAYS.  */
10092               for (j = 0; j < num_casing_rules; j++)
10093                 {
10094                   struct special_casing_rule *rule = casing_rules[j];
10095 
10096                   if (rule->code == cfrule->code
10097                       && rule->context == SCC_ALWAYS
10098                       && rule->language == NULL)
10099                     {
10100                       /* Found it.  */
10101                       found_rule = rule;
10102                       break;
10103                     }
10104                 }
10105 
10106               new_rule->code = cfrule->code;
10107               new_rule->language = cfrule->language;
10108               new_rule->context = SCC_ALWAYS;
10109               if (found_rule != NULL)
10110                 {
10111                   memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
10112                           sizeof (new_rule->lower_mapping));
10113                   memcpy (new_rule->title_mapping, found_rule->title_mapping,
10114                           sizeof (new_rule->title_mapping));
10115                   memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
10116                           sizeof (new_rule->upper_mapping));
10117                 }
10118               else
10119                 {
10120                   unsigned int k;
10121 
10122                   new_rule->lower_mapping[0] = to_lower (cfrule->code);
10123                   for (k = 1; k < 3; k++)
10124                     new_rule->lower_mapping[k] = 0;
10125                   new_rule->title_mapping[0] = to_title (cfrule->code);
10126                   for (k = 1; k < 3; k++)
10127                     new_rule->title_mapping[k] = 0;
10128                   new_rule->upper_mapping[0] = to_upper (cfrule->code);
10129                   for (k = 1; k < 3; k++)
10130                     new_rule->upper_mapping[k] = 0;
10131                 }
10132               memcpy (new_rule->casefold_mapping, cfrule->mapping,
10133                       sizeof (new_rule->casefold_mapping));
10134 
10135               add_casing_rule (new_rule);
10136             }
10137         }
10138     }
10139 }
10140 
10141 static int
10142 compare_casing_rules (const void *a, const void *b)
     /* [previous][next][first][last][top][bottom][index][help] */
10143 {
10144   struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
10145   struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
10146   unsigned int a_code = a_rule->code;
10147   unsigned int b_code = b_rule->code;
10148 
10149   if (a_code < b_code)
10150     return -1;
10151   if (a_code > b_code)
10152     return 1;
10153 
10154   /* Sort the more specific rules before the more general ones.  */
10155   return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
10156           + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
10157 }
10158 
10159 static void
10160 sort_casing_rules (void)
     /* [previous][next][first][last][top][bottom][index][help] */
10161 {
10162   /* Sort the rules 1. by code, 2. by specificity.  */
10163   if (num_casing_rules > 1)
10164     qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
10165            compare_casing_rules);
10166 }
10167 
10168 /* Output the special casing rules.  */
10169 static void
10170 output_casing_rules (const char *filename, const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
10171 {
10172   FILE *stream;
10173   unsigned int i, j;
10174   unsigned int minor;
10175 
10176   stream = fopen (filename, "w");
10177   if (stream == NULL)
10178     {
10179       fprintf (stderr, "cannot open '%s' for writing\n", filename);
10180       exit (1);
10181     }
10182 
10183   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10184   fprintf (stream, "/* Special casing rules of Unicode characters.  */\n");
10185   fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
10186            version);
10187   fprintf (stream, "\n");
10188 
10189   fprintf (stream, "/* Copyright (C) 2000-2021 Free Software Foundation, Inc.\n");
10190   fprintf (stream, "\n");
10191   output_library_license (stream, false);
10192   fprintf (stream, "\n");
10193 
10194   fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
10195   fprintf (stream, "%%struct-type\n");
10196   fprintf (stream, "%%language=ANSI-C\n");
10197   fprintf (stream, "%%define slot-name code\n");
10198   fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
10199   fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
10200   fprintf (stream, "%%compare-lengths\n");
10201   fprintf (stream, "%%compare-strncmp\n");
10202   fprintf (stream, "%%readonly-tables\n");
10203   fprintf (stream, "%%omit-struct-type\n");
10204   fprintf (stream, "%%%%\n");
10205 
10206   minor = 0;
10207   for (i = 0; i < num_casing_rules; i++)
10208     {
10209       struct special_casing_rule *rule = casing_rules[i];
10210       int context;
10211 
10212       if (i > 0 && rule->code == casing_rules[i - 1]->code)
10213         minor += 1;
10214       else
10215         minor = 0;
10216 
10217       if (!(rule->code < 0x10000))
10218         {
10219           fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
10220           exit (1);
10221         }
10222 
10223       fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
10224                (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
10225 
10226       fprintf (stream, "%d, ",
10227                i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
10228 
10229       context = rule->context;
10230       if (context < 0)
10231         {
10232           fprintf (stream, "-");
10233           context = - context;
10234         }
10235       else
10236         fprintf (stream, " ");
10237       switch (context)
10238         {
10239         case SCC_ALWAYS:
10240           fprintf (stream, "SCC_ALWAYS           ");
10241           break;
10242         case SCC_FINAL_SIGMA:
10243           fprintf (stream, "SCC_FINAL_SIGMA      ");
10244           break;
10245         case SCC_AFTER_SOFT_DOTTED:
10246           fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
10247           break;
10248         case SCC_MORE_ABOVE:
10249           fprintf (stream, "SCC_MORE_ABOVE       ");
10250           break;
10251         case SCC_BEFORE_DOT:
10252           fprintf (stream, "SCC_BEFORE_DOT       ");
10253           break;
10254         case SCC_AFTER_I:
10255           fprintf (stream, "SCC_AFTER_I          ");
10256           break;
10257         default:
10258           abort ();
10259         }
10260       fprintf (stream, ", ");
10261 
10262       if (rule->language != NULL)
10263         {
10264           assert (strlen (rule->language) == 2);
10265           fprintf (stream, "{  '%c',  '%c' }, ", rule->language[0], rule->language[1]);
10266         }
10267       else
10268         fprintf (stream, "{ '\\0', '\\0' }, ");
10269 
10270       fprintf (stream, "{ ");
10271       for (j = 0; j < 3; j++)
10272         {
10273           if (j > 0)
10274             fprintf (stream, ", ");
10275           if (!(rule->upper_mapping[j] < 0x10000))
10276             {
10277               fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
10278               exit (1);
10279             }
10280           if (rule->upper_mapping[j] != 0)
10281             fprintf (stream, "0x%04X", rule->upper_mapping[j]);
10282           else
10283             fprintf (stream, "     0");
10284         }
10285       fprintf (stream, " }, { ");
10286       for (j = 0; j < 3; j++)
10287         {
10288           if (j > 0)
10289             fprintf (stream, ", ");
10290           if (!(rule->lower_mapping[j] < 0x10000))
10291             {
10292               fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
10293               exit (1);
10294             }
10295           if (rule->lower_mapping[j] != 0)
10296             fprintf (stream, "0x%04X", rule->lower_mapping[j]);
10297           else
10298             fprintf (stream, "     0");
10299         }
10300       fprintf (stream, " }, { ");
10301       for (j = 0; j < 3; j++)
10302         {
10303           if (j > 0)
10304             fprintf (stream, ", ");
10305           if (!(rule->title_mapping[j] < 0x10000))
10306             {
10307               fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
10308               exit (1);
10309             }
10310           if (rule->title_mapping[j] != 0)
10311             fprintf (stream, "0x%04X", rule->title_mapping[j]);
10312           else
10313             fprintf (stream, "     0");
10314         }
10315       fprintf (stream, " }, { ");
10316       for (j = 0; j < 3; j++)
10317         {
10318           if (j > 0)
10319             fprintf (stream, ", ");
10320           if (!(rule->casefold_mapping[j] < 0x10000))
10321             {
10322               fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
10323               exit (1);
10324             }
10325           if (rule->casefold_mapping[j] != 0)
10326             fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
10327           else
10328             fprintf (stream, "     0");
10329         }
10330       fprintf (stream, " }\n");
10331     }
10332 
10333   if (ferror (stream) || fclose (stream))
10334     {
10335       fprintf (stderr, "error writing to '%s'\n", filename);
10336       exit (1);
10337     }
10338 }
10339 
10340 /* ========================================================================= */
10341 
/* Returns true if CH is a "cased" character.
   Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  return is_category_Lt (ch);
}
10353 
10354 /* Quoting the Unicode standard:
10355      Definition: A character is defined to be "case-ignorable" if it has the
10356      value MidLetter {or the value MidNumLet} for the Word_Break property or
10357      its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
10358      Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10359    The text marked in braces was added in Unicode 5.1.0, see
10360    <https://www.unicode.org/versions/Unicode5.1.0/> section "Update of
10361    Definition of case-ignorable".   */
10362 /* Since this predicate is only used for the "Before C" and "After C"
10363    conditions of FINAL_SIGMA, we exclude the "cased" characters here.
10364    This simplifies the evaluation of the regular expressions
10365      \p{cased} (\p{case-ignorable})* C
10366    and
10367      C (\p{case-ignorable})* \p{cased}
10368  */
10369 static bool
10370 is_case_ignorable (unsigned int ch)
     /* [previous][next][first][last][top][bottom][index][help] */
10371 {
10372   return (unicode_org_wbp[ch] == WBP_MIDLETTER
10373           || unicode_org_wbp[ch] == WBP_MIDNUMLET
10374           || is_category_Mn (ch)
10375           || is_category_Me (ch)
10376           || is_category_Cf (ch)
10377           || is_category_Lm (ch)
10378           || is_category_Sk (ch))
10379          && !is_cased (ch);
10380 }
10381 
10382 /* ------------------------------------------------------------------------- */
10383 
10384 /* Output all case related properties.  */
10385 static void
10386 output_casing_properties (const char *version)
     /* [previous][next][first][last][top][bottom][index][help] */
10387 {
10388 #define PROPERTY(FN,P) \
10389   debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
10390   output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
10391   output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
10392   PROPERTY(cased, cased)
10393   PROPERTY(ignorable, case_ignorable)
10394 #undef PROPERTY
10395 }
10396 
10397 /* ========================================================================= */
10398 
10399 int
10400 main (int argc, char * argv[])
     /* [previous][next][first][last][top][bottom][index][help] */
10401 {
10402   const char *unicodedata_filename;
10403   const char *proplist_filename;
10404   const char *derivedproplist_filename;
10405   const char *arabicshaping_filename;
10406   const char *scripts_filename;
10407   const char *blocks_filename;
10408   const char *proplist30_filename;
10409   const char *eastasianwidth_filename;
10410   const char *linebreak_filename;
10411   const char *wordbreakproperty_filename;
10412   const char *graphemebreakproperty_filename;
10413   const char *compositionexclusions_filename;
10414   const char *specialcasing_filename;
10415   const char *casefolding_filename;
10416   const char *version;
10417 
10418   if (argc != 16)
10419     {
10420       fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
10421                argv[0]);
10422       exit (1);
10423     }
10424 
10425   unicodedata_filename = argv[1];
10426   proplist_filename = argv[2];
10427   derivedproplist_filename = argv[3];
10428   arabicshaping_filename = argv[4];
10429   scripts_filename = argv[5];
10430   blocks_filename = argv[6];
10431   proplist30_filename = argv[7];
10432   eastasianwidth_filename = argv[8];
10433   linebreak_filename = argv[9];
10434   wordbreakproperty_filename = argv[10];
10435   graphemebreakproperty_filename = argv[11];
10436   compositionexclusions_filename = argv[12];
10437   specialcasing_filename = argv[13];
10438   casefolding_filename = argv[14];
10439   version = argv[15];
10440 
10441   fill_attributes (unicodedata_filename);
10442   clear_properties ();
10443   fill_properties (proplist_filename);
10444   fill_properties (derivedproplist_filename);
10445   fill_properties30 (proplist30_filename);
10446   fill_arabicshaping (arabicshaping_filename);
10447   fill_scripts (scripts_filename);
10448   fill_blocks (blocks_filename);
10449   fill_width (eastasianwidth_filename);
10450   fill_org_lbp (linebreak_filename);
10451   fill_org_wbp (wordbreakproperty_filename);
10452   fill_org_gbp (graphemebreakproperty_filename);
10453   fill_composition_exclusions (compositionexclusions_filename);
10454   fill_casing_rules (specialcasing_filename);
10455   fill_casefolding_rules (casefolding_filename);
10456   redistribute_casefolding_rules ();
10457   sort_casing_rules ();
10458 
10459   output_categories (version);
10460   output_category ("unictype/categ_of.h", version);
10461   output_combclass ("unictype/combiningclass.h", version);
10462   output_bidi_category ("unictype/bidi_of.h", version);
10463   output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
10464   output_decimal_digit ("unictype/decdigit.h", version);
10465   output_digit_test ("../tests/unictype/test-digit.h", version);
10466   output_digit ("unictype/digit.h", version);
10467   output_numeric_test ("../tests/unictype/test-numeric.h", version);
10468   output_numeric ("unictype/numeric.h", version);
10469   output_mirror ("unictype/mirror.h", version);
10470   output_properties (version);
10471   output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
10472   output_joining_type ("unictype/joiningtype_of.h", version);
10473   output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
10474   output_joining_group ("unictype/joininggroup_of.h", version);
10475 
10476   output_scripts (version);
10477   output_scripts_byname (version);
10478   output_blocks (version);
10479   output_ident_properties (version);
10480   output_nonspacing_property ("uniwidth/width.c.part");
10481   output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
10482   output_old_ctype (version);
10483 
10484   debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
10485   debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
10486   output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
10487 
10488   debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
10489   debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
10490   output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
10491 
10492   output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
10493   output_gbp_table ("unigbrk/gbrkprop.h", version);
10494 
10495   output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
10496   debug_output_composition_tables ("uninorm/composition.txt");
10497   output_composition_tables ("uninorm/composition-table.gperf", version);
10498 
10499   output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
10500   output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
10501   output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
10502   output_simple_mapping ("unicase/toupper.h", to_upper, version);
10503   output_simple_mapping ("unicase/tolower.h", to_lower, version);
10504   output_simple_mapping ("unicase/totitle.h", to_title, version);
10505   output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
10506   output_casing_rules ("unicase/special-casing-table.gperf", version);
10507   output_casing_properties (version);
10508 
10509   return 0;
10510 }
10511 
10512 /*
10513  * Local Variables:
10514  * coding: utf-8
10515  * compile-command: "\
10516  *   gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables &&      \\
10517  *   ./gen-uni-tables                                                   \\
10518  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/UnicodeData.txt \\
10519  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/PropList.txt \\
10520  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/DerivedCoreProperties.txt \\
10521  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/ArabicShaping.txt \\
10522  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/Scripts.txt \\
10523  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/Blocks.txt \\
10524  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \\
10525  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/EastAsianWidth.txt \\
10526  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/LineBreak.txt \\
10527  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/auxiliary/WordBreakProperty.txt \\
10528  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \\
10529  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/CompositionExclusions.txt \\
10530  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/SpecialCasing.txt \\
10531  *        /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/9.0.0/ucd/CaseFolding.txt \\
10532  *        9.0.0                                                         \\
10533  *   && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt              \\
10534  *   && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt"
10535  * End:
10536  */

/* [previous][next][first][last][top][bottom][index][help] */