1 /* Line breaking auxiliary tables. 2 Copyright (C) 2001-2003, 2006-2021 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2001. 4 5 This file is free software. 6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 7 You can redistribute it and/or modify it under either 8 - the terms of the GNU Lesser General Public License as published 9 by the Free Software Foundation; either version 3, or (at your 10 option) any later version, or 11 - the terms of the GNU General Public License as published by the 12 Free Software Foundation; either version 2, or (at your option) 13 any later version, or 14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". 15 16 This file is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 Lesser General Public License and the GNU General Public License 20 for more details. 21 22 You should have received a copy of the GNU Lesser General Public 23 License and of the GNU General Public License along with this 24 program. If not, see <https://www.gnu.org/licenses/>. */ 25 26 #include "unitypes.h" 27 28 /* Line breaking classification. */ 29 30 enum 31 { 32 /* Values >= 30 are resolved at run time. */ 33 LBP_BK = 30, /* mandatory break */ 34 /*LBP_CR, carriage return - not used here because it's a DOSism */ 35 /*LBP_LF, line feed - not used here because it's a DOSism */ 36 LBP_CM = 31, /* attached characters and combining marks */ 37 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */ 38 /*LBP_SG, surrogates - not used here because they are not characters */ 39 LBP_WJ = 0, /* word joiner */ 40 LBP_ZW = 32, /* zero width space */ 41 LBP_GL = 1, /* non-breaking (glue) */ 42 LBP_SP = 33, /* space */ 43 LBP_B2 = 2, /* break opportunity before and after */ 44 LBP_BA = 3, /* break opportunity after */ 45 LBP_BB = 4, /* break opportunity before */ 46 LBP_HY = 5, /* hyphen */ 47 LBP_CB = 34, /* contingent break opportunity */ 48 LBP_CL = 6, /* closing punctuation */ 49 LBP_CP = 7, /* closing parenthesis */ 50 LBP_EX = 8, /* exclamation/interrogation */ 51 LBP_IN = 9, /* inseparable */ 52 LBP_NS = 10, /* non starter */ 53 LBP_OP = 11, /* opening punctuation */ 54 LBP_QU = 12, /* ambiguous quotation */ 55 LBP_IS = 13, /* infix separator (numeric) */ 56 LBP_NU = 14, /* numeric */ 57 LBP_PO = 15, /* postfix (numeric) */ 58 LBP_PR = 16, /* prefix (numeric) */ 59 LBP_SY = 17, /* symbols allowing breaks */ 60 LBP_AI = 35, /* ambiguous (alphabetic or ideograph) */ 61 LBP_AL = 18, /* ordinary alphabetic and symbol characters */ 62 /*LBP_CJ, conditional Japanese starters, resolved to NS */ 63 LBP_H2 = 19, /* Hangul LV syllable */ 64 LBP_H3 = 20, /* Hangul LVT syllable */ 65 LBP_HL = 25, /* Hebrew letter */ 66 LBP_ID = 21, /* ideographic */ 67 LBP_JL = 22, /* Hangul L Jamo */ 68 LBP_JV = 23, /* Hangul V Jamo */ 69 LBP_JT = 24, /* Hangul T Jamo */ 70 LBP_RI = 26, /* regional indicator */ 71 LBP_SA = 36, /* complex context (South East Asian) */ 72 LBP_ZWJ = 27, /* zero width joiner */ 73 LBP_EB = 28, /* emoji base */ 74 LBP_EM = 29, /* emoji modifier */ 75 LBP_XX = 37 /* unknown */ 76 }; 77 78 #include "lbrkprop1.h" 79 80 static inline unsigned char 81 unilbrkprop_lookup (ucs4_t uc) /* */ 82 { 83 unsigned int index1 = uc >> lbrkprop_header_0; 84 if (index1 < lbrkprop_header_1) 85 { 86 int lookup1 = unilbrkprop.level1[index1]; 87 if (lookup1 >= 0) 88 { 89 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3; 90 int lookup2 = unilbrkprop.level2[lookup1 + index2]; 91 if (lookup2 >= 0) 92 { 93 unsigned int index3 = uc & lbrkprop_header_4; 94 return unilbrkprop.level3[lookup2 + index3]; 95 } 96 } 97 } 98 return LBP_XX; 99 } 100 101 /* Table indexed by two line breaking classifications. */ 102 #define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */ 103 #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */ 104 #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */ 105 106 extern const unsigned char unilbrk_table[30][30]; 107 108 /* We don't support line breaking of complex-context dependent characters 109 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */