1 /* Search character in piece of UTF-8 string. 2 Copyright (C) 1999, 2002, 2006-2007, 2009-2021 Free Software Foundation, 3 Inc. 4 Written by Bruno Haible <bruno@clisp.org>, 2002. 5 6 This file is free software. 7 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 8 You can redistribute it and/or modify it under either 9 - the terms of the GNU Lesser General Public License as published 10 by the Free Software Foundation; either version 3, or (at your 11 option) any later version, or 12 - the terms of the GNU General Public License as published by the 13 Free Software Foundation; either version 2, or (at your option) 14 any later version, or 15 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". 16 17 This file is distributed in the hope that it will be useful, 18 but WITHOUT ANY WARRANTY; without even the implied warranty of 19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 Lesser General Public License and the GNU General Public License 21 for more details. 22 23 You should have received a copy of the GNU Lesser General Public 24 License and of the GNU General Public License along with this 25 program. If not, see <https://www.gnu.org/licenses/>. */ 26 27 #include <config.h> 28 29 /* Specification. */ 30 #include "unistr.h" 31 32 #include <string.h> 33 34 uint8_t * 35 u8_chr (const uint8_t *s, size_t n, ucs4_t uc) /* */ 36 { 37 if (uc < 0x80) 38 { 39 uint8_t c0 = uc; 40 41 return (uint8_t *) memchr ((const char *) s, c0, n); 42 } 43 44 { 45 uint8_t c[6]; 46 size_t uc_size; 47 uc_size = u8_uctomb_aux (c, uc, 6); 48 49 if (n < uc_size) 50 return NULL; 51 52 /* For multibyte character matching we use a Boyer-Moore like 53 algorithm that searches for the last byte, skipping multi-byte 54 jumps, and matches back from there. 55 56 Instead of using a table as is usual for Boyer-Moore, we compare 57 the candidate last byte s[UC_SIZE-1] with each of the possible 58 bytes in the UTF-8 representation of UC. If the final byte does 59 not match, we will perform up to UC_SIZE comparisons per memory 60 load---but each comparison lets us skip one byte in the input! 61 62 If the final byte matches, the "real" Boyer-Moore algorithm 63 is approximated. Instead, u8_chr just looks for other cN that 64 are equal to the final byte and uses those to try realigning to 65 another possible match. For example, when searching for 0xF0 66 0xAA 0xBB 0xAA it will always skip forward by two bytes, even if 67 the character in the string was for example 0xF1 0xAA 0xBB 0xAA. 68 The advantage of this scheme is that the skip count after a failed 69 match can be computed outside the loop, and that it keeps the 70 complexity low for a pretty rare case. In particular, since c[0] 71 is never between 0x80 and 0xBF, c[0] is never equal to c[UC_SIZE-1] 72 and this is optimal for two-byte UTF-8 characters. */ 73 switch (uc_size) 74 { 75 case 2: 76 { 77 uint8_t c0 = c[0]; 78 uint8_t c1 = c[1]; 79 const uint8_t *end = s + n - 1; 80 81 do 82 { 83 /* Here s < end. 84 Test whether s[0..1] == { c0, c1 }. */ 85 uint8_t s1 = s[1]; 86 if (s1 == c1) 87 { 88 if (*s == c0) 89 return (uint8_t *) s; 90 else 91 /* Skip the search at s + 1, because s[1] = c1 < c0. */ 92 s += 2; 93 } 94 else 95 { 96 if (s1 == c0) 97 s++; 98 else 99 /* Skip the search at s + 1, because s[1] != c0. */ 100 s += 2; 101 } 102 } 103 while (s < end); 104 break; 105 } 106 107 case 3: 108 { 109 uint8_t c0 = c[0]; 110 uint8_t c1 = c[1]; 111 uint8_t c2 = c[2]; 112 const uint8_t *end = s + n - 2; 113 size_t skip; 114 115 if (c2 == c1) 116 skip = 1; 117 else 118 skip = 3; 119 120 do 121 { 122 /* Here s < end. 123 Test whether s[0..2] == { c0, c1, c2 }. */ 124 uint8_t s2 = s[2]; 125 if (s2 == c2) 126 { 127 if (s[1] == c1 && *s == c0) 128 return (uint8_t *) s; 129 else 130 /* If c2 != c1: 131 Skip the search at s + 1, because s[2] == c2 != c1. 132 Skip the search at s + 2, because s[2] == c2 < c0. */ 133 s += skip; 134 } 135 else 136 { 137 if (s2 == c1) 138 s++; 139 else if (s2 == c0) 140 /* Skip the search at s + 1, because s[2] != c1. */ 141 s += 2; 142 else 143 /* Skip the search at s + 1, because s[2] != c1. 144 Skip the search at s + 2, because s[2] != c0. */ 145 s += 3; 146 } 147 } 148 while (s < end); 149 break; 150 } 151 152 case 4: 153 { 154 uint8_t c0 = c[0]; 155 uint8_t c1 = c[1]; 156 uint8_t c2 = c[2]; 157 uint8_t c3 = c[3]; 158 const uint8_t *end = s + n - 3; 159 size_t skip; 160 161 if (c3 == c2) 162 skip = 1; 163 else if (c3 == c1) 164 skip = 2; 165 else 166 skip = 4; 167 168 do 169 { 170 /* Here s < end. 171 Test whether s[0..3] == { c0, c1, c2, c3 }. */ 172 uint8_t s3 = s[3]; 173 if (s3 == c3) 174 { 175 if (s[2] == c2 && s[1] == c1 && *s == c0) 176 return (uint8_t *) s; 177 else 178 /* If c3 != c2: 179 Skip the search at s + 1, because s[3] == c3 != c2. 180 If c3 != c1: 181 Skip the search at s + 2, because s[3] == c3 != c1. 182 Skip the search at s + 3, because s[3] == c3 < c0. */ 183 s += skip; 184 } 185 else 186 { 187 if (s3 == c2) 188 s++; 189 else if (s3 == c1) 190 /* Skip the search at s + 1, because s[3] != c2. */ 191 s += 2; 192 else if (s3 == c0) 193 /* Skip the search at s + 1, because s[3] != c2. 194 Skip the search at s + 2, because s[3] != c1. */ 195 s += 3; 196 else 197 /* Skip the search at s + 1, because s[3] != c2. 198 Skip the search at s + 2, because s[3] != c1. 199 Skip the search at s + 3, because s[3] != c0. */ 200 s += 4; 201 } 202 } 203 while (s < end); 204 break; 205 } 206 } 207 return NULL; 208 } 209 }