root/maint/gnulib/lib/mbrtoc32.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mbrtoc32
  2. mbrtoc32

   1 /* Convert multibyte character to 32-bit wide character.
   2    Copyright (C) 2020-2021 Free Software Foundation, Inc.
   3 
   4    This file is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU Lesser General Public License as
   6    published by the Free Software Foundation; either version 2.1 of the
   7    License, or (at your option) any later version.
   8 
   9    This file is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU Lesser General Public License for more details.
  13 
  14    You should have received a copy of the GNU Lesser General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16 
  17 /* Written by Bruno Haible <bruno@clisp.org>, 2020.  */
  18 
  19 #include <config.h>
  20 
  21 /* Specification.  */
  22 #include <uchar.h>
  23 
  24 #include "attribute.h"
  25 
  26 #include <errno.h>
  27 #include <stdlib.h>
  28 
  29 #if GNULIB_defined_mbstate_t /* AIX, IRIX */
  30 /* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales
  31    and directly for the UTF-8 locales.  */
  32 
  33 # if defined _WIN32 && !defined __CYGWIN__
  34 
  35 #  define WIN32_LEAN_AND_MEAN  /* avoid including junk */
  36 #  include <windows.h>
  37 
  38 # elif HAVE_PTHREAD_API
  39 
  40 #  include <pthread.h>
  41 #  if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS
  42 #   include <threads.h>
  43 #   pragma weak thrd_exit
  44 #   define c11_threads_in_use() (thrd_exit != NULL)
  45 #  else
  46 #   define c11_threads_in_use() 0
  47 #  endif
  48 
  49 # elif HAVE_THREADS_H
  50 
  51 #  include <threads.h>
  52 
  53 # endif
  54 
  55 # include "verify.h"
  56 # include "lc-charset-dispatch.h"
  57 # include "mbtowc-lock.h"
  58 
  59 verify (sizeof (mbstate_t) >= 4);
     /* The check above requires that mbstate_t offers at least 4 bytes, the
        same size as the file-static state buffer declared next.
        NOTE(review): internal_state is not referenced in the visible code of
        this variant; presumably the included "mbrtowc-impl.h" uses it as the
        conversion state when ps == NULL -- confirm against that header.  */
  60 static char internal_state[4];
  61 
     /* mbrtoc32 variant compiled when GNULIB_defined_mbstate_t is true.

        PWC, S, N and PS are the four parameters of the standard mbrtoc32
        interface: destination for the converted 32-bit character, input bytes,
        number of input bytes available, and conversion state.

        The entire body is supplied by textual inclusion of "mbrtowc-impl.h",
        which is not visible here; the return values and error behavior are
        defined there.  */
  62 size_t
  63 mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
     /* [previous][next][first][last][top][bottom][index][help] */
  64 {
     /* FITS_IN_CHAR_TYPE is a parameter of the included fragment; defining it
        as the constant 1 makes the test succeed for every value.
        NOTE(review): presumably because char32_t can represent every character
        the fragment decodes -- confirm against "mbrtowc-impl.h".  */
  65 # define FITS_IN_CHAR_TYPE(wc)  1
  66 # include "mbrtowc-impl.h"
  67 }
  68 
  69 #else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */
  70 
  71 /* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc().  */
  72 
  73 # include <wchar.h>
  74 
  75 # include "localcharset.h"
  76 # include "streq.h"
  77 
  78 # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
  79 #  include "hard-locale.h"
  80 #  include <locale.h>
  81 # endif
  82 
  83 static mbstate_t internal_state;
     /* The object above is the conversion state substituted below when the
        caller passes ps == NULL.  */
  84 
     /* Convert the multibyte character that starts at S (at most N bytes are
        examined) into a 32-bit wide character.

        PWC  where the converted character is stored; may be NULL, in which
             case nothing is stored.
        S    input bytes; NULL is handled as PWC == NULL, S == "", N == 1.
        N    number of bytes available at S.
        PS   conversion state; NULL selects the file-static internal_state.

        Returns, as far as the visible code shows: the result of the underlying
        mbrtoc32() or mbrtowc() call; (size_t) -2 when the input is empty
        (under the workaround macros) or incomplete; (size_t) -1 with errno set
        to EILSEQ for an invalid sequence or EINVAL for a corrupt saved state.

        Exactly one of three implementations is compiled in: a wrapper around
        the system mbrtoc32() (HAVE_WORKING_MBRTOC32), a hand-written UTF-8
        path plus mbrtowc() fallback (_GL_LARGE_CHAR32_T), or a plain mbrtowc()
        wrapper.  */
  85 size_t
  86 mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
     /* [previous][next][first][last][top][bottom][index][help] */
     /* NOTE(review): presumably mbrtoc32 is a macro at this point that renames
        this definition to a replacement symbol; undefining it after the
        declarator lets the calls to mbrtoc32 in the body reach the system
        function instead of recursing -- confirm against <uchar.h>.  */
  87 # undef mbrtoc32
  88 {
  89   /* It's simpler to handle the case s == NULL upfront, than to worry about
  90      this case later, before every test of pwc and n.  */
  91   if (s == NULL)
  92     {
  93       pwc = NULL;
  94       s = "";
  95       n = 1;
  96     }
  97 
       /* With zero bytes available, report "incomplete" directly instead of
          calling the underlying function.  The UTF-8 path below also relies
          on n > 0 because it reads s[0] unconditionally.  */
  98 # if MBRTOC32_EMPTY_INPUT_BUG || _GL_LARGE_CHAR32_T
  99   if (n == 0)
 100     return (size_t) -2;
 101 # endif
 102 
 103   if (ps == NULL)
 104     ps = &internal_state;
 105 
 106 # if HAVE_WORKING_MBRTOC32
 107   /* mbrtoc32() may produce different values for wc than mbrtowc().  Therefore
 108      use mbrtoc32().  */
 109 
 110 #  if defined _WIN32 && !defined __CYGWIN__
       /* Convert into a local and copy it out only for results below
          (size_t) -2, so that *pwc is never written for the -1 and -2 results
          or when pwc is NULL.  */
 111   char32_t wc;
 112   size_t ret = mbrtoc32 (&wc, s, n, ps);
 113   if (ret < (size_t) -2 && pwc != NULL)
 114     *pwc = wc;
 115 #  else
 116   size_t ret = mbrtoc32 (pwc, s, n, ps);
 117 #  endif
 118 
 119 #  if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ
       /* If the underlying call returned (size_t) -1 or (size_t) -2 and
          hard_locale (LC_CTYPE) is false, return the first input byte as a
          one-byte character instead.  The n != 0 test guards the read of *s.
          NOTE(review): hard_locale presumably returns false for the "C" and
          "POSIX" locales -- confirm against "hard-locale.h".  */
 120   if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
 121     {
 122       if (pwc != NULL)
 123         *pwc = (unsigned char) *s;
 124       return 1;
 125     }
 126 #  endif
 127 
 128   return ret;
 129 
 130 # elif _GL_LARGE_CHAR32_T
 131 
 132   /* Special-case all encodings that may produce wide character values
 133      > WCHAR_MAX.  */
 134   const char *encoding = locale_charset ();
 135   if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
 136     {
 137       /* Special-case the UTF-8 encoding.  Assume that the wide-character
 138          encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16.  */
 139       /* Here n > 0.  */
           /* The state object is used as raw bytes: pstate[0] holds the number
              of bytes saved from a previous incomplete call (0 to 3), and
              pstate[1], pstate[2], pstate[3] hold those bytes in order.  */
 140       char *pstate = (char *)ps;
 141       size_t nstate = pstate[0];
 142       char buf[4];
 143       const char *p;
 144       size_t m;
 145       int res;
 146 
           /* Choose the bytes to decode: with no saved bytes, decode directly
              from s; otherwise rebuild the saved bytes in buf (the cases fall
              through from the highest index down) and append up to three new
              bytes from s, limited by n and by the 4-byte size of buf.  */
 147       switch (nstate)
 148         {
 149         case 0:
 150           p = s;
 151           m = n;
 152           break;
 153         case 3:
 154           buf[2] = pstate[3];
 155           FALLTHROUGH;
 156         case 2:
 157           buf[1] = pstate[2];
 158           FALLTHROUGH;
 159         case 1:
 160           buf[0] = pstate[1];
 161           p = buf;
 162           m = nstate;
 163           buf[m++] = s[0];
 164           if (n >= 2 && m < 4)
 165             {
 166               buf[m++] = s[1];
 167               if (n >= 3 && m < 4)
 168                 buf[m++] = s[2];
 169             }
 170           break;
 171         default:
               /* A saved-byte count outside 0..3 means the state is corrupt.  */
 172           errno = EINVAL;
 173           return (size_t)(-1);
 174         }
 175 
 176       /* Here m > 0.  */
 177 
           /* NOTE(review): the included fragment is not visible here.  It
              presumably decodes the m bytes at p, stores the character through
              pwc, sets res, and jumps to one of the labels success, incomplete
              or invalid defined below -- confirm against
              "mbrtowc-impl-utf8.h".  The local names used above are therefore
              part of its interface and must not be renamed.  */
 178       {
 179 #  define FITS_IN_CHAR_TYPE(wc)  1
 180 #  include "mbrtowc-impl-utf8.h"
 181       }
 182 
 183      success:
           /* res was computed over the bytes at p, which include the nstate
              bytes carried over from earlier calls.  Abort unless res is
              larger than nstate (a res of 0 or less is compared as 1), then
              subtract nstate so that the return value counts only the bytes
              taken from s in this call.  */
 184       if (nstate >= (res > 0 ? res : 1))
 185         abort ();
 186       res -= nstate;
 187       /* Set *ps to the initial state.  */
 188 #  if defined _WIN32 && !defined __CYGWIN__
 189       /* Native Windows.  */
 190       /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter.
 191          On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined
 192          as an 8-byte struct, of which the first 4 bytes matter.  */
 193       *(unsigned int *)pstate = 0;
 194 #  elif defined __CYGWIN__
 195       /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes
 196          matter.  */
 197       ps->__count = 0;
 198 #  else
 199       pstate[0] = 0;
 200 #  endif
 201       return res;
 202 
 203      incomplete:
           /* Save the bytes read from s behind the nstate bytes already stored
              in the state, until the state holds m bytes in total, then record
              m as the new saved-byte count.  The abort() fires if three bytes
              from s were not enough to reach m.  */
 204       {
 205         size_t k = nstate;
 206         /* Here 0 <= k < m < 4.  */
 207         pstate[++k] = s[0];
 208         if (k < m)
 209           {
 210             pstate[++k] = s[1];
 211             if (k < m)
 212               pstate[++k] = s[2];
 213           }
 214         if (k != m)
 215           abort ();
 216       }
 217       pstate[0] = m;
 218       return (size_t)(-2);
 219 
 220      invalid:
 221       errno = EILSEQ;
 222       /* The conversion state is undefined, says POSIX.  */
 223       return (size_t)(-1);
 224     }
 225   else
 226     {
           /* Any other encoding: delegate to mbrtowc() and store the wchar_t
              result as a char32_t, only for results below (size_t) -2 and only
              when pwc is non-NULL.  */
 227       wchar_t wc;
 228       size_t ret = mbrtowc (&wc, s, n, ps);
 229       if (ret < (size_t) -2 && pwc != NULL)
 230         *pwc = wc;
 231       return ret;
 232     }
 233 
 234 # else
 235 
 236   /* char32_t and wchar_t are equivalent.  Use mbrtowc().  */
 237   wchar_t wc;
 238   size_t ret = mbrtowc (&wc, s, n, ps);
 239   if (ret < (size_t) -2 && pwc != NULL)
 240     *pwc = wc;
 241   return ret;
 242 
 243 # endif
 244 }
 245 
 246 #endif

/* [previous][next][first][last][top][bottom][index][help] */