1 /* Search character in piece of UTF-8 string.
2 Copyright (C) 1999, 2002, 2006-2007, 2009-2021 Free Software Foundation,
3 Inc.
4 Written by Bruno Haible <bruno@clisp.org>, 2002.
5
6 This file is free software.
7 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
8 You can redistribute it and/or modify it under either
9 - the terms of the GNU Lesser General Public License as published
10 by the Free Software Foundation; either version 3, or (at your
11 option) any later version, or
12 - the terms of the GNU General Public License as published by the
13 Free Software Foundation; either version 2, or (at your option)
14 any later version, or
15 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
16
17 This file is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License and the GNU General Public License
21 for more details.
22
23 You should have received a copy of the GNU Lesser General Public
24 License and of the GNU General Public License along with this
25 program. If not, see <https://www.gnu.org/licenses/>. */
26
27 #include <config.h>
28
29 /* Specification. */
30 #include "unistr.h"
31
32 #include <string.h>
33
34 uint8_t *
35 u8_chr (const uint8_t *s, size_t n, ucs4_t uc)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
36 {
37 if (uc < 0x80)
38 {
39 uint8_t c0 = uc;
40
41 return (uint8_t *) memchr ((const char *) s, c0, n);
42 }
43
44 {
45 uint8_t c[6];
46 size_t uc_size;
47 uc_size = u8_uctomb_aux (c, uc, 6);
48
49 if (n < uc_size)
50 return NULL;
51
52 /* For multibyte character matching we use a Boyer-Moore like
53 algorithm that searches for the last byte, skipping multi-byte
54 jumps, and matches back from there.
55
56 Instead of using a table as is usual for Boyer-Moore, we compare
57 the candidate last byte s[UC_SIZE-1] with each of the possible
58 bytes in the UTF-8 representation of UC. If the final byte does
59 not match, we will perform up to UC_SIZE comparisons per memory
60 load---but each comparison lets us skip one byte in the input!
61
62 If the final byte matches, the "real" Boyer-Moore algorithm
63 is approximated. Instead, u8_chr just looks for other cN that
64 are equal to the final byte and uses those to try realigning to
65 another possible match. For example, when searching for 0xF0
66 0xAA 0xBB 0xAA it will always skip forward by two bytes, even if
67 the character in the string was for example 0xF1 0xAA 0xBB 0xAA.
68 The advantage of this scheme is that the skip count after a failed
69 match can be computed outside the loop, and that it keeps the
70 complexity low for a pretty rare case. In particular, since c[0]
71 is never between 0x80 and 0xBF, c[0] is never equal to c[UC_SIZE-1]
72 and this is optimal for two-byte UTF-8 characters. */
73 switch (uc_size)
74 {
75 case 2:
76 {
77 uint8_t c0 = c[0];
78 uint8_t c1 = c[1];
79 const uint8_t *end = s + n - 1;
80
81 do
82 {
83 /* Here s < end.
84 Test whether s[0..1] == { c0, c1 }. */
85 uint8_t s1 = s[1];
86 if (s1 == c1)
87 {
88 if (*s == c0)
89 return (uint8_t *) s;
90 else
91 /* Skip the search at s + 1, because s[1] = c1 < c0. */
92 s += 2;
93 }
94 else
95 {
96 if (s1 == c0)
97 s++;
98 else
99 /* Skip the search at s + 1, because s[1] != c0. */
100 s += 2;
101 }
102 }
103 while (s < end);
104 break;
105 }
106
107 case 3:
108 {
109 uint8_t c0 = c[0];
110 uint8_t c1 = c[1];
111 uint8_t c2 = c[2];
112 const uint8_t *end = s + n - 2;
113 size_t skip;
114
115 if (c2 == c1)
116 skip = 1;
117 else
118 skip = 3;
119
120 do
121 {
122 /* Here s < end.
123 Test whether s[0..2] == { c0, c1, c2 }. */
124 uint8_t s2 = s[2];
125 if (s2 == c2)
126 {
127 if (s[1] == c1 && *s == c0)
128 return (uint8_t *) s;
129 else
130 /* If c2 != c1:
131 Skip the search at s + 1, because s[2] == c2 != c1.
132 Skip the search at s + 2, because s[2] == c2 < c0. */
133 s += skip;
134 }
135 else
136 {
137 if (s2 == c1)
138 s++;
139 else if (s2 == c0)
140 /* Skip the search at s + 1, because s[2] != c1. */
141 s += 2;
142 else
143 /* Skip the search at s + 1, because s[2] != c1.
144 Skip the search at s + 2, because s[2] != c0. */
145 s += 3;
146 }
147 }
148 while (s < end);
149 break;
150 }
151
152 case 4:
153 {
154 uint8_t c0 = c[0];
155 uint8_t c1 = c[1];
156 uint8_t c2 = c[2];
157 uint8_t c3 = c[3];
158 const uint8_t *end = s + n - 3;
159 size_t skip;
160
161 if (c3 == c2)
162 skip = 1;
163 else if (c3 == c1)
164 skip = 2;
165 else
166 skip = 4;
167
168 do
169 {
170 /* Here s < end.
171 Test whether s[0..3] == { c0, c1, c2, c3 }. */
172 uint8_t s3 = s[3];
173 if (s3 == c3)
174 {
175 if (s[2] == c2 && s[1] == c1 && *s == c0)
176 return (uint8_t *) s;
177 else
178 /* If c3 != c2:
179 Skip the search at s + 1, because s[3] == c3 != c2.
180 If c3 != c1:
181 Skip the search at s + 2, because s[3] == c3 != c1.
182 Skip the search at s + 3, because s[3] == c3 < c0. */
183 s += skip;
184 }
185 else
186 {
187 if (s3 == c2)
188 s++;
189 else if (s3 == c1)
190 /* Skip the search at s + 1, because s[3] != c2. */
191 s += 2;
192 else if (s3 == c0)
193 /* Skip the search at s + 1, because s[3] != c2.
194 Skip the search at s + 2, because s[3] != c1. */
195 s += 3;
196 else
197 /* Skip the search at s + 1, because s[3] != c2.
198 Skip the search at s + 2, because s[3] != c1.
199 Skip the search at s + 3, because s[3] != c0. */
200 s += 4;
201 }
202 }
203 while (s < end);
204 break;
205 }
206 }
207 return NULL;
208 }
209 }