maint/gnulib/lib/unistr/u8-chr.c

/* */
This source file includes following definitions.
u8_chr
   1 /* Search character in piece of UTF-8 string.
   2    Copyright (C) 1999, 2002, 2006-2007, 2009-2021 Free Software Foundation,
   3    Inc.
   4    Written by Bruno Haible <bruno@clisp.org>, 2002.
   5 
   6    This file is free software.
   7    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   8    You can redistribute it and/or modify it under either
   9      - the terms of the GNU Lesser General Public License as published
  10        by the Free Software Foundation; either version 3, or (at your
  11        option) any later version, or
  12      - the terms of the GNU General Public License as published by the
  13        Free Software Foundation; either version 2, or (at your option)
  14        any later version, or
  15      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  16 
  17    This file is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20    Lesser General Public License and the GNU General Public License
  21    for more details.
  22 
  23    You should have received a copy of the GNU Lesser General Public
  24    License and of the GNU General Public License along with this
  25    program.  If not, see <https://www.gnu.org/licenses/>.  */
  26 
  27 #include <config.h>
  28 
  29 /* Specification.  */
  30 #include "unistr.h"
  31 
  32 #include <string.h>
  33 
  34 uint8_t *
  35 u8_chr (const uint8_t *s, size_t n, ucs4_t uc)
     /*  */
  36 {
  37   if (uc < 0x80)
  38     {
  39       uint8_t c0 = uc;
  40 
  41       return (uint8_t *) memchr ((const char *) s, c0, n);
  42     }
  43 
  44   {
  45     uint8_t c[6];
  46     size_t uc_size;
  47     uc_size = u8_uctomb_aux (c, uc, 6);
  48 
  49     if (n < uc_size)
  50       return NULL;
  51 
  52     /* For multibyte character matching we use a Boyer-Moore like
  53        algorithm that searches for the last byte, skipping multi-byte
  54        jumps, and matches back from there.
  55 
  56        Instead of using a table as is usual for Boyer-Moore, we compare
  57        the candidate last byte s[UC_SIZE-1] with each of the possible
  58        bytes in the UTF-8 representation of UC.  If the final byte does
  59        not match, we will perform up to UC_SIZE comparisons per memory
  60        load---but each comparison lets us skip one byte in the input!
  61 
  62        If the final byte matches, the "real" Boyer-Moore algorithm
  63        is approximated.  Instead, u8_chr just looks for other cN that
  64        are equal to the final byte and uses those to try realigning to
  65        another possible match.  For example, when searching for 0xF0
  66        0xAA 0xBB 0xAA it will always skip forward by two bytes, even if
  67        the character in the string was for example 0xF1 0xAA 0xBB 0xAA.
  68        The advantage of this scheme is that the skip count after a failed
  69        match can be computed outside the loop, and that it keeps the
  70        complexity low for a pretty rare case.  In particular, since c[0]
  71        is never between 0x80 and 0xBF, c[0] is never equal to c[UC_SIZE-1]
  72        and this is optimal for two-byte UTF-8 characters.  */
  73     switch (uc_size)
  74       {
  75       case 2:
  76         {
  77           uint8_t c0 = c[0];
  78           uint8_t c1 = c[1];
  79           const uint8_t *end = s + n - 1;
  80 
  81           do
  82             {
  83               /* Here s < end.
  84                  Test whether s[0..1] == { c0, c1 }.  */
  85               uint8_t s1 = s[1];
  86               if (s1 == c1)
  87                 {
  88                   if (*s == c0)
  89                     return (uint8_t *) s;
  90                   else
  91                     /* Skip the search at s + 1, because s[1] = c1 < c0.  */
  92                     s += 2;
  93                 }
  94               else
  95                 {
  96                   if (s1 == c0)
  97                     s++;
  98                   else
  99                     /* Skip the search at s + 1, because s[1] != c0.  */
 100                     s += 2;
 101                 }
 102             }
 103           while (s < end);
 104           break;
 105         }
 106 
 107       case 3:
 108         {
 109           uint8_t c0 = c[0];
 110           uint8_t c1 = c[1];
 111           uint8_t c2 = c[2];
 112           const uint8_t *end = s + n - 2;
 113           size_t skip;
 114 
 115           if (c2 == c1)
 116             skip = 1;
 117           else
 118             skip = 3;
 119 
 120           do
 121             {
 122               /* Here s < end.
 123                  Test whether s[0..2] == { c0, c1, c2 }.  */
 124               uint8_t s2 = s[2];
 125               if (s2 == c2)
 126                 {
 127                   if (s[1] == c1 && *s == c0)
 128                     return (uint8_t *) s;
 129                   else
 130                     /* If c2 != c1:
 131                          Skip the search at s + 1, because s[2] == c2 != c1.
 132                        Skip the search at s + 2, because s[2] == c2 < c0.  */
 133                     s += skip;
 134                 }
 135               else
 136                 {
 137                   if (s2 == c1)
 138                     s++;
 139                   else if (s2 == c0)
 140                     /* Skip the search at s + 1, because s[2] != c1.  */
 141                     s += 2;
 142                   else
 143                     /* Skip the search at s + 1, because s[2] != c1.
 144                        Skip the search at s + 2, because s[2] != c0.  */
 145                     s += 3;
 146                 }
 147             }
 148           while (s < end);
 149           break;
 150         }
 151 
 152       case 4:
 153         {
 154           uint8_t c0 = c[0];
 155           uint8_t c1 = c[1];
 156           uint8_t c2 = c[2];
 157           uint8_t c3 = c[3];
 158           const uint8_t *end = s + n - 3;
 159           size_t skip;
 160 
 161           if (c3 == c2)
 162             skip = 1;
 163           else if (c3 == c1)
 164             skip = 2;
 165           else
 166             skip = 4;
 167 
 168           do
 169             {
 170               /* Here s < end.
 171                  Test whether s[0..3] == { c0, c1, c2, c3 }.  */
 172               uint8_t s3 = s[3];
 173               if (s3 == c3)
 174                 {
 175                   if (s[2] == c2 && s[1] == c1 && *s == c0)
 176                     return (uint8_t *) s;
 177                   else
 178                     /* If c3 != c2:
 179                          Skip the search at s + 1, because s[3] == c3 != c2.
 180                        If c3 != c1:
 181                          Skip the search at s + 2, because s[3] == c3 != c1.
 182                        Skip the search at s + 3, because s[3] == c3 < c0.  */
 183                     s += skip;
 184                 }
 185               else
 186                 {
 187                   if (s3 == c2)
 188                     s++;
 189                   else if (s3 == c1)
 190                     /* Skip the search at s + 1, because s[3] != c2.  */
 191                     s += 2;
 192                   else if (s3 == c0)
 193                     /* Skip the search at s + 1, because s[3] != c2.
 194                        Skip the search at s + 2, because s[3] != c1.  */
 195                     s += 3;
 196                   else
 197                     /* Skip the search at s + 1, because s[3] != c2.
 198                        Skip the search at s + 2, because s[3] != c1.
 199                        Skip the search at s + 3, because s[3] != c0.  */
 200                     s += 4;
 201                 }
 202             }
 203           while (s < end);
 204           break;
 205         }
 206       }
 207     return NULL;
 208   }
 209 }
/* */
root/maint/gnulib/lib/unistr/u8-chr.c

DEFINITIONS