1 /* Normalization forms (composition and decomposition) of Unicode strings. 2 Copyright (C) 2001-2002, 2009-2021 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This file is free software: you can redistribute it and/or modify 6 it under the terms of the GNU Lesser General Public License as 7 published by the Free Software Foundation; either version 2.1 of the 8 License, or (at your option) any later version. 9 10 This file is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <https://www.gnu.org/licenses/>. */ 17 18 #ifndef _UNINORM_H 19 #define _UNINORM_H 20 21 /* Get size_t. */ 22 #include <stddef.h> 23 24 #include "unitypes.h" 25 26 27 #ifdef __cplusplus 28 extern "C" { 29 #endif 30 31 32 /* Conventions: 33 34 All functions prefixed with u8_ operate on UTF-8 encoded strings. 35 Their unit is an uint8_t (1 byte). 36 37 All functions prefixed with u16_ operate on UTF-16 encoded strings. 38 Their unit is an uint16_t (a 2-byte word). 39 40 All functions prefixed with u32_ operate on UCS-4 encoded strings. 41 Their unit is an uint32_t (a 4-byte word). 42 43 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly 44 n units. 45 46 Functions returning a string result take a (resultbuf, lengthp) argument 47 pair. If resultbuf is not NULL and the result fits into *lengthp units, 48 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly 49 allocated string is returned. In both cases, *lengthp is set to the 50 length (number of units) of the returned string. In case of error, 51 NULL is returned and errno is set. */ 52 53 54 enum 55 { 56 UC_DECOMP_CANONICAL,/* Canonical decomposition. */ 57 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */ 58 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */ 59 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */ 60 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */ 61 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */ 62 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */ 63 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */ 64 UC_DECOMP_SUPER, /* <super> A superscript form. */ 65 UC_DECOMP_SUB, /* <sub> A subscript form. */ 66 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */ 67 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */ 68 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */ 69 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */ 70 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */ 71 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */ 72 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */ 73 }; 74 75 /* Maximum size of decomposition of a single Unicode character. */ 76 #define UC_DECOMPOSITION_MAX_LENGTH 32 77 78 /* Return the character decomposition mapping of a Unicode character. 79 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH 80 ucs_t elements. 81 When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are 82 filled and N is returned. Otherwise -1 is returned. */ 83 extern int 84 uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition); 85 86 /* Return the canonical character decomposition mapping of a Unicode character. 87 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH 88 ucs_t elements. 89 When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is 90 returned. Otherwise -1 is returned. */ 91 extern int 92 uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition); 93 94 95 /* Attempt to combine the Unicode characters uc1, uc2. 96 uc1 is known to have canonical combining class 0. 97 Return the combination of uc1 and uc2, if it exists. 98 Return 0 otherwise. 99 Not all decompositions can be recombined using this function. See the 100 Unicode file CompositionExclusions.txt for details. */ 101 extern ucs4_t 102 uc_composition (ucs4_t uc1, ucs4_t uc2) 103 _UC_ATTRIBUTE_CONST; 104 105 106 /* An object of type uninorm_t denotes a Unicode normalization form. */ 107 struct unicode_normalization_form; 108 typedef const struct unicode_normalization_form *uninorm_t; 109 110 /* UNINORM_NFD: Normalization form D: canonical decomposition. */ 111 extern const struct unicode_normalization_form uninorm_nfd; 112 #define UNINORM_NFD (&uninorm_nfd) 113 114 /* UNINORM_NFC: Normalization form C: canonical decomposition, then 115 canonical composition. */ 116 extern const struct unicode_normalization_form uninorm_nfc; 117 #define UNINORM_NFC (&uninorm_nfc) 118 119 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */ 120 extern const struct unicode_normalization_form uninorm_nfkd; 121 #define UNINORM_NFKD (&uninorm_nfkd) 122 123 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then 124 canonical composition. */ 125 extern const struct unicode_normalization_form uninorm_nfkc; 126 #define UNINORM_NFKC (&uninorm_nfkc) 127 128 /* Test whether a normalization form does compatibility decomposition. */ 129 #define uninorm_is_compat_decomposing(nf) \ 130 ((* (const unsigned int *) (nf) >> 0) & 1) 131 132 /* Test whether a normalization form includes canonical composition. */ 133 #define uninorm_is_composing(nf) \ 134 ((* (const unsigned int *) (nf) >> 1) & 1) 135 136 /* Return the decomposing variant of a normalization form. 137 This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */ 138 extern uninorm_t 139 uninorm_decomposing_form (uninorm_t nf) 140 _UC_ATTRIBUTE_PURE; 141 142 143 /* Return the specified normalization form of a string. */ 144 extern uint8_t * 145 u8_normalize (uninorm_t nf, const uint8_t *s, size_t n, 146 uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp); 147 extern uint16_t * 148 u16_normalize (uninorm_t nf, const uint16_t *s, size_t n, 149 uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp); 150 extern uint32_t * 151 u32_normalize (uninorm_t nf, const uint32_t *s, size_t n, 152 uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp); 153 154 155 /* Compare S1 and S2, ignoring differences in normalization. 156 NF must be either UNINORM_NFD or UNINORM_NFKD. 157 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and 158 return 0. Upon failure, return -1 with errno set. */ 159 extern int 160 u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, 161 uninorm_t nf, int *resultp); 162 extern int 163 u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, 164 uninorm_t nf, int *resultp); 165 extern int 166 u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, 167 uninorm_t nf, int *resultp); 168 169 170 /* Converts the string S of length N to a NUL-terminated byte sequence, in such 171 a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is 172 equivalent to comparing S1 and S2 with uN_normcoll(). 173 NF must be either UNINORM_NFC or UNINORM_NFKC. */ 174 extern char * 175 u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf, 176 char *resultbuf, size_t *lengthp); 177 extern char * 178 u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf, 179 char *resultbuf, size_t *lengthp); 180 extern char * 181 u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf, 182 char *resultbuf, size_t *lengthp); 183 184 185 /* Compare S1 and S2, ignoring differences in normalization, using the 186 collation rules of the current locale. 187 NF must be either UNINORM_NFC or UNINORM_NFKC. 188 If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and 189 return 0. Upon failure, return -1 with errno set. */ 190 extern int 191 u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, 192 uninorm_t nf, int *resultp); 193 extern int 194 u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, 195 uninorm_t nf, int *resultp); 196 extern int 197 u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, 198 uninorm_t nf, int *resultp); 199 200 201 /* Normalization of a stream of Unicode characters. 202 203 A "stream of Unicode characters" is essentially a function that accepts an 204 ucs4_t argument repeatedly, optionally combined with a function that 205 "flushes" the stream. */ 206 207 /* Data type of a stream of Unicode characters that normalizes its input 208 according to a given normalization form and passes the normalized character 209 sequence to the encapsulated stream of Unicode characters. */ 210 struct uninorm_filter; 211 212 /* Bring data buffered in the filter to its destination, the encapsulated 213 stream, then close and free the filter. 214 Return 0 if successful, or -1 with errno set upon failure. */ 215 extern int 216 uninorm_filter_free (struct uninorm_filter *filter); 217 218 /* Create and return a normalization filter for Unicode characters. 219 The pair (stream_func, stream_data) is the encapsulated stream. 220 stream_func (stream_data, uc) receives the Unicode character uc 221 and returns 0 if successful, or -1 with errno set upon failure. 222 Return the new filter, or NULL with errno set upon failure. */ 223 extern struct uninorm_filter * 224 uninorm_filter_create (uninorm_t nf, 225 int (*stream_func) (void *stream_data, ucs4_t uc), 226 void *stream_data) 227 _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1); 228 229 /* Stuff a Unicode character into a normalizing filter. 230 Return 0 if successful, or -1 with errno set upon failure. */ 231 extern int 232 uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc); 233 234 /* Bring data buffered in the filter to its destination, the encapsulated 235 stream. 236 Return 0 if successful, or -1 with errno set upon failure. 237 Note! If after calling this function, additional characters are written 238 into the filter, the resulting character sequence in the encapsulated stream 239 will not necessarily be normalized. */ 240 extern int 241 uninorm_filter_flush (struct uninorm_filter *filter); 242 243 244 #ifdef __cplusplus 245 } 246 #endif 247 248 249 #endif /* _UNINORM_H */