1 /* Word breaks in Unicode strings. 2 Copyright (C) 2001-2003, 2005-2021 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This file is free software. 6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 7 You can redistribute it and/or modify it under either 8 - the terms of the GNU Lesser General Public License as published 9 by the Free Software Foundation; either version 3, or (at your 10 option) any later version, or 11 - the terms of the GNU General Public License as published by the 12 Free Software Foundation; either version 2, or (at your option) 13 any later version, or 14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". 15 16 This file is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 Lesser General Public License and the GNU General Public License 20 for more details. 21 22 You should have received a copy of the GNU Lesser General Public 23 License and of the GNU General Public License along with this 24 program. If not, see <https://www.gnu.org/licenses/>. */ 25 26 #ifndef _UNIWBRK_H 27 #define _UNIWBRK_H 28 29 /* Get size_t. */ 30 #include <stddef.h> 31 32 #include "unitypes.h" 33 34 35 #ifdef __cplusplus 36 extern "C" { 37 #endif 38 39 /* ========================================================================= */ 40 41 /* Property defined in Unicode Standard Annex #29, section "Word Boundaries" 42 <https://www.unicode.org/reports/tr29/#Word_Boundaries> */ 43 44 /* Possible values of the Word_Break property. 45 This enumeration may be extended in the future. */ 46 enum 47 { 48 WBP_OTHER = 0, 49 WBP_CR = 11, 50 WBP_LF = 12, 51 WBP_NEWLINE = 10, 52 WBP_EXTEND = 8, 53 WBP_FORMAT = 9, 54 WBP_KATAKANA = 1, 55 WBP_ALETTER = 2, 56 WBP_MIDNUMLET = 3, 57 WBP_MIDLETTER = 4, 58 WBP_MIDNUM = 5, 59 WBP_NUMERIC = 6, 60 WBP_EXTENDNUMLET = 7, 61 WBP_RI = 13, 62 WBP_DQ = 14, 63 WBP_SQ = 15, 64 WBP_HL = 16, 65 WBP_ZWJ = 17, 66 WBP_EB = 18, 67 WBP_EM = 19, 68 WBP_GAZ = 20, 69 WBP_EBG = 21 70 }; 71 72 /* Return the Word_Break property of a Unicode character. */ 73 extern int 74 uc_wordbreak_property (ucs4_t uc) 75 _UC_ATTRIBUTE_CONST; 76 77 /* ========================================================================= */ 78 79 /* Word breaks. */ 80 81 /* Determine the word break points in S, and store the result at p[0..n-1]. 82 p[i] = 1 means that there is a word boundary between s[i-1] and s[i]. 83 p[i] = 0 means that s[i-1] and s[i] must not be separated. 84 */ 85 extern void 86 u8_wordbreaks (const uint8_t *s, size_t n, char *p); 87 extern void 88 u16_wordbreaks (const uint16_t *s, size_t n, char *p); 89 extern void 90 u32_wordbreaks (const uint32_t *s, size_t n, char *p); 91 extern void 92 ulc_wordbreaks (const char *s, size_t n, char *_UC_RESTRICT p); 93 94 /* ========================================================================= */ 95 96 #ifdef __cplusplus 97 } 98 #endif 99 100 101 #endif /* _UNIWBRK_H */